"""
Data Science Copilot Orchestrator
Main orchestration class that uses LLM function calling to execute data science workflows.
Supports multiple providers: Groq and Gemini.
"""

import json
import os
import re
from typing import Dict, Any, List, Optional
from pathlib import Path
import time
import httpx

from groq import Groq
import google.generativeai as genai
from dotenv import load_dotenv

from .cache.cache_manager import CacheManager
from .tools.tools_registry import TOOLS, get_all_tool_names, get_tools_by_category
from .tools.agent_tool_mapping import (get_tools_for_agent, filter_tools_by_names, 
                                        get_agent_description, suggest_next_agent)
from .reasoning.reasoning_trace import get_reasoning_trace, reset_reasoning_trace
from .reasoning.findings import FindingsAccumulator, Finding
from .reasoning.reasoner import Reasoner, ReasoningOutput
from .reasoning.evaluator import Evaluator, EvaluationOutput
from .reasoning.synthesizer import Synthesizer
from .routing.intent_classifier import IntentClassifier, IntentResult
from .session_memory import SessionMemory
from .session_store import SessionStore
from .workflow_state import WorkflowState
from .utils.schema_extraction import extract_schema_local, infer_task_type
from .progress_manager import progress_manager

# New systems for improvements
from .utils.semantic_layer import get_semantic_layer
from .utils.error_recovery import get_recovery_manager, retry_with_fallback
from .utils.token_budget import get_token_manager
from .utils.parallel_executor import get_parallel_executor, ToolExecution, TOOL_WEIGHTS, ToolWeight
import asyncio
from difflib import get_close_matches
from .tools import (
    # Basic Tools (13) - UPDATED: Added get_smart_summary + 3 wrangling tools
    profile_dataset,
    detect_data_quality_issues,
    analyze_correlations,
    detect_label_errors,  # NEW: cleanlab label error detection
    get_smart_summary,  # NEW
    clean_missing_values,
    handle_outliers,
    fix_data_types,
    force_numeric_conversion,
    smart_type_inference,
    create_time_features,
    encode_categorical,
    train_baseline_models,
    generate_model_report,
    # AutoGluon Tools (9) - NEW: AutoML at Scale
    train_with_autogluon,
    predict_with_autogluon,
    forecast_with_autogluon,
    optimize_autogluon_model,
    analyze_autogluon_model,
    extend_autogluon_training,
    train_multilabel_autogluon,
    backtest_timeseries,
    analyze_timeseries_model,
    # Data Wrangling Tools (3) - NEW
    merge_datasets,
    concat_datasets,
    reshape_dataset,
    # Advanced Analysis (5)
    perform_eda_analysis,
    detect_model_issues,
    detect_anomalies,
    detect_and_handle_multicollinearity,
    perform_statistical_tests,
    # Advanced Feature Engineering (4)
    create_interaction_features,
    create_aggregation_features,
    engineer_text_features,
    auto_feature_engineering,
    # Advanced Preprocessing (3)
    handle_imbalanced_data,
    perform_feature_scaling,
    split_data_strategically,
    # Advanced Training (3)
    hyperparameter_tuning,
    train_ensemble_models,
    perform_cross_validation,
    # Business Intelligence (4)
    perform_cohort_analysis,
    perform_rfm_analysis,
    detect_causal_relationships,
    generate_business_insights,
    # Computer Vision (3)
    extract_image_features,
    perform_image_clustering,
    analyze_tabular_image_hybrid,
    # NLP/Text Analytics (4)
    perform_topic_modeling,
    perform_named_entity_recognition,
    analyze_sentiment_advanced,
    perform_text_similarity,
    # Production/MLOps (5 + 2 new)
    monitor_model_drift,
    explain_predictions,
    generate_model_card,
    perform_ab_test_analysis,
    detect_feature_leakage,
    monitor_drift_evidently,
    explain_with_dtreeviz,
    # Time Series (3)
    forecast_time_series,
    detect_seasonality_trends,
    create_time_series_features,
    # Advanced Insights (6)
    analyze_root_cause,
    detect_trends_and_seasonality,
    detect_anomalies_advanced,
    perform_hypothesis_testing,
    analyze_distribution,
    perform_segment_analysis,
    # Automated Pipeline (2)
    auto_ml_pipeline,
    auto_feature_selection,
    # Visualization (5)
    generate_all_plots,
    generate_data_quality_plots,
    generate_eda_plots,
    generate_model_performance_plots,
    generate_feature_importance_plot,
    # Interactive Plotly Visualizations (6) - NEW PHASE 2
    generate_interactive_scatter,
    generate_interactive_histogram,
    generate_interactive_correlation_heatmap,
    generate_interactive_box_plots,
    generate_interactive_time_series,
    generate_plotly_dashboard,
    # EDA Report Generation (2) - NEW PHASE 2
    generate_ydata_profiling_report,
    generate_sweetviz_report,
    # Code Interpreter (2) - NEW PHASE 2 - TRUE AI AGENT CAPABILITY
    execute_python_code,
    execute_code_from_file,
    # Cloud Data Sources (4) - NEW: BigQuery Integration
    load_bigquery_table,
    write_bigquery_table,
    profile_bigquery_table,
    query_bigquery,
    # Enhanced Feature Engineering (4)
    create_ratio_features,
    create_statistical_features,
    create_log_features,
    create_binned_features,
)


class DataScienceCopilot:
    """
    Main orchestrator for data science workflows using LLM function calling.
    
    Supports multiple providers: Groq and Gemini.
    Uses function calling to intelligently route to data profiling, cleaning,
    feature engineering, and model training tools.
    """
    
    def __init__(self, groq_api_key: Optional[str] = None, 
                 google_api_key: Optional[str] = None,
                 mistral_api_key: Optional[str] = None,
                 cache_db_path: Optional[str] = None,
                 reasoning_effort: str = "medium",
                 provider: Optional[str] = None,
                 session_id: Optional[str] = None,
                 use_session_memory: bool = True,
                 use_compact_prompts: bool = False,
                 progress_callback: Optional[callable] = None):
        """
        Initialize the Data Science Copilot.
        
        Args:
            groq_api_key: Groq API key (or set GROQ_API_KEY env var)
            google_api_key: Google API key (or set GOOGLE_API_KEY env var)
            mistral_api_key: Mistral API key (or set MISTRAL_API_KEY env var)
            cache_db_path: Path to cache database
            reasoning_effort: Reasoning effort for Groq ('low', 'medium', 'high')
            provider: LLM provider - 'groq' or 'gemini' (or set LLM_PROVIDER env var)
            session_id: Session ID to resume (None = auto-resume recent or create new)
            use_session_memory: Enable session-based memory for context across requests
            use_compact_prompts: Use compact prompts for small context window models (e.g., Groq)
            progress_callback: Optional callback function to report progress (receives step_name, status)
        """
        # Load environment variables
        load_dotenv()
        
        # Store progress callback
        self.progress_callback = progress_callback
        
        # Store HTTP session key for SSE streaming (set by app.py)
        self.http_session_key = None
        
        # Determine provider
        self.provider = provider or os.getenv("LLM_PROVIDER", "mistral").lower()
        
        # Use compact prompts as specified (multi-agent has focused prompts per specialist)
        self.use_compact_prompts = use_compact_prompts
        
        if self.provider == "mistral":
            # Initialize Mistral client
            api_key = mistral_api_key or os.getenv("MISTRAL_API_KEY")
            if not api_key:
                raise ValueError("Mistral API key must be provided or set in MISTRAL_API_KEY env var")
            
            # Try new SDK first (v1.x), fall back to old SDK (v0.x)
            try:
                from mistralai import Mistral  # New SDK (v1.x)
                self.mistral_client = Mistral(api_key=api_key.strip())
            except ImportError:
                # Fall back to old SDK (v0.x)
                from mistralai.client import MistralClient
                self.mistral_client = MistralClient(api_key=api_key.strip())
            
            self.model = os.getenv("MISTRAL_MODEL", "mistral-large-latest")
            self.reasoning_effort = reasoning_effort
            self.gemini_model = None
            self.groq_client = None
            print(f"🤖 Initialized with Mistral provider - Model: {self.model}")
            
        elif self.provider == "groq":
            # Initialize Groq client
            api_key = groq_api_key or os.getenv("GROQ_API_KEY")
            if not api_key:
                raise ValueError("Groq API key must be provided or set in GROQ_API_KEY env var")
            
            self.groq_client = Groq(api_key=api_key.strip())
            self.model = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
            self.reasoning_effort = reasoning_effort
            self.gemini_model = None
            self.mistral_client = None
            print(f"🤖 Initialized with Groq provider - Model: {self.model}")
            
        elif self.provider == "gemini":
            # Initialize Gemini client
            api_key = google_api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
            if not api_key:
                raise ValueError("Google API key must be provided or set in GOOGLE_API_KEY or GEMINI_API_KEY env var")
            
            genai.configure(api_key=api_key.strip())
            self.model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
            
            # Configure safety settings to be more permissive for data science content
            safety_settings = [
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            ]
            
            self.gemini_model = genai.GenerativeModel(
                self.model,
                generation_config={"temperature": 0.1},
                safety_settings=safety_settings
            )
            self.groq_client = None
            self.mistral_client = None
            print(f"🤖 Initialized with Gemini provider - Model: {self.model}")
            
        else:
            raise ValueError(f"Invalid provider: {self.provider}. Must be 'mistral', 'groq', or 'gemini'")
            raise ValueError(f"Unsupported provider: {self.provider}. Choose 'groq' or 'gemini'")
        
        # Initialize cache
        cache_path = cache_db_path or os.getenv("CACHE_DB_PATH", "./cache_db/cache.db")
        self.cache = CacheManager(db_path=cache_path)
        
        # 🧠 Initialize semantic layer for column understanding and agent routing
        self.semantic_layer = get_semantic_layer()
        
        # 🛡️ Initialize error recovery manager
        self.recovery_manager = get_recovery_manager()
        
        # 📊 Initialize token budget manager
        # Calculate max tokens based on provider
        provider_max_tokens = {
            "mistral": 128000,  # Mistral Large
            "groq": 32768,     # Llama 3.3 70B
            "gemini": 1000000  # Gemini 2.5 Flash
        }
        max_context = provider_max_tokens.get(self.provider, 128000)
        self.token_manager = get_token_manager(model=self.model, max_tokens=max_context)
        
        # ⚡ Parallel executor DISABLED - running tools sequentially for stability
        # self.parallel_executor = get_parallel_executor()
        self.parallel_executor = None  # Disabled for scale optimization
        
        # 🧠 Initialize session memory
        self.use_session_memory = use_session_memory
        if use_session_memory:
            self.session_store = SessionStore()
            
            # Try to load existing session or create new one
            if session_id:
                # Explicit session ID provided - load it
                self.session = self.session_store.load(session_id)
                if not self.session:
                    print(f"⚠️  Session {session_id} not found, creating new session")
                    self.session = SessionMemory(session_id=session_id)
                else:
                    print(f"✅ Loaded session: {session_id}")
            else:
                # Try to continue recent session (within 24 hours)
                self.session = self.session_store.get_recent_session(max_age_hours=24)
                if self.session:
                    print(f"✅ Resuming recent session: {self.session.session_id}")
                else:
                    # No recent session - create new one
                    self.session = SessionMemory()
                    print(f"✅ Created new session: {self.session.session_id}")
            
            # Show context if available
            if self.session.last_dataset or self.session.last_model:
                print(f"📝 Session Context:")
                if self.session.last_dataset:
                    print(f"   - Last dataset: {self.session.last_dataset}")
                if self.session.last_model:
                    print(f"   - Last model: {self.session.last_model} (score: {self.session.best_score:.4f})" if self.session.best_score else f"   - Last model: {self.session.last_model}")
        else:
            self.session = None
            print("⚠️  Session memory disabled")
        
        # 🔍 Initialize reasoning trace for decision tracking
        self.reasoning_trace = get_reasoning_trace()
        
        # Tools registry
        self.tools_registry = TOOLS
        self.tool_functions = self._build_tool_functions_map()
        
        # Token tracking and rate limiting
        self.total_tokens_used = 0
        self.tokens_this_minute = 0
        self.minute_start_time = time.time()
        self.api_calls_made = 0
        
        # Provider-specific limits
        if self.provider == "mistral":
            self.tpm_limit = 500000  # 500K tokens/minute (very generous)
            self.rpm_limit = 500     # 500 requests/minute
            self.min_api_call_interval = 0.1  # Minimal delay
        elif self.provider == "groq":
            self.tpm_limit = 12000  # Tokens per minute
            self.rpm_limit = 30     # Requests per minute
            self.min_api_call_interval = 0.5  # Wait between calls
        elif self.provider == "gemini":
            self.tpm_limit = 32000  # More generous
            self.rpm_limit = 15
            self.min_api_call_interval = 1.0  # Gemini free tier: safer spacing
        
        # Rate limiting for Gemini (10 RPM free tier)
        self.last_api_call_time = 0
        
        # Workflow state for context management (reduces token usage)
        self.workflow_state = WorkflowState()
        
        # Multi-Agent Architecture - Specialist Agents
        self.specialist_agents = self._initialize_specialist_agents()
        self.active_agent = "Orchestrator"  # Track which agent is working
        
        # Determine output directory based on environment
        # In production (HuggingFace/Cloud Run), use /tmp for ephemeral storage
        if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
            self.output_base = Path("/tmp/data_science_agent/outputs")
        else:
            self.output_base = Path("./outputs")
        
        # Set environment variable for tools to use
        os.environ["DS_AGENT_OUTPUT_DIR"] = str(self.output_base)
        
        # Ensure output directories exist
        self.output_base.mkdir(parents=True, exist_ok=True)
        (self.output_base / "models").mkdir(exist_ok=True)
        (self.output_base / "reports").mkdir(exist_ok=True)
        (self.output_base / "data").mkdir(exist_ok=True)
        (self.output_base / "plots").mkdir(exist_ok=True)
        (self.output_base / "plots" / "interactive").mkdir(exist_ok=True)
        
        print(f"📁 Output directory: {self.output_base}")
    
    def _build_tool_functions_map(self) -> Dict[str, callable]:
        """Build mapping of tool names to their functions - All 75 tools."""
        return {
            # Basic Tools (13) - UPDATED: Added 4 new tools
            "profile_dataset": profile_dataset,
            "detect_data_quality_issues": detect_data_quality_issues,
            "analyze_correlations": analyze_correlations,
            "detect_label_errors": detect_label_errors,  # NEW: cleanlab
            "get_smart_summary": get_smart_summary,  # NEW
            "clean_missing_values": clean_missing_values,
            "handle_outliers": handle_outliers,
            "fix_data_types": fix_data_types,
            "force_numeric_conversion": force_numeric_conversion,
            "smart_type_inference": smart_type_inference,
            "create_time_features": create_time_features,
            "encode_categorical": encode_categorical,
            "train_baseline_models": train_baseline_models,
            "generate_model_report": generate_model_report,
            # AutoGluon Tools (9) - NEW: AutoML at Scale
            "train_with_autogluon": train_with_autogluon,
            "predict_with_autogluon": predict_with_autogluon,
            "forecast_with_autogluon": forecast_with_autogluon,
            "optimize_autogluon_model": optimize_autogluon_model,
            "analyze_autogluon_model": analyze_autogluon_model,
            "extend_autogluon_training": extend_autogluon_training,
            "train_multilabel_autogluon": train_multilabel_autogluon,
            "backtest_timeseries": backtest_timeseries,
            "analyze_timeseries_model": analyze_timeseries_model,
            # Data Wrangling Tools (3) - NEW
            "merge_datasets": merge_datasets,
            "concat_datasets": concat_datasets,
            "reshape_dataset": reshape_dataset,
            # Advanced Analysis (5)
            "perform_eda_analysis": perform_eda_analysis,
            "detect_model_issues": detect_model_issues,
            "detect_anomalies": detect_anomalies,
            "detect_and_handle_multicollinearity": detect_and_handle_multicollinearity,
            "perform_statistical_tests": perform_statistical_tests,
            # Advanced Feature Engineering (4)
            "create_interaction_features": create_interaction_features,
            "create_aggregation_features": create_aggregation_features,
            "engineer_text_features": engineer_text_features,
            "auto_feature_engineering": auto_feature_engineering,
            # Advanced Preprocessing (3)
            "handle_imbalanced_data": handle_imbalanced_data,
            "perform_feature_scaling": perform_feature_scaling,
            "split_data_strategically": split_data_strategically,
            # Advanced Training (3)
            "hyperparameter_tuning": hyperparameter_tuning,
            # "train_ensemble_models": train_ensemble_models,  # DISABLED - Too resource intensive for scale
            "perform_cross_validation": perform_cross_validation,
            # Business Intelligence (4)
            "perform_cohort_analysis": perform_cohort_analysis,
            "perform_rfm_analysis": perform_rfm_analysis,
            "detect_causal_relationships": detect_causal_relationships,
            "generate_business_insights": generate_business_insights,
            # Computer Vision (3)
            "extract_image_features": extract_image_features,
            "perform_image_clustering": perform_image_clustering,
            "analyze_tabular_image_hybrid": analyze_tabular_image_hybrid,
            # NLP/Text Analytics (4)
            "perform_topic_modeling": perform_topic_modeling,
            "perform_named_entity_recognition": perform_named_entity_recognition,
            "analyze_sentiment_advanced": analyze_sentiment_advanced,
            "perform_text_similarity": perform_text_similarity,
            # Production/MLOps (5 + 2 new)
            "monitor_model_drift": monitor_model_drift,
            "explain_predictions": explain_predictions,
            "generate_model_card": generate_model_card,
            "perform_ab_test_analysis": perform_ab_test_analysis,
            "detect_feature_leakage": detect_feature_leakage,
            "monitor_drift_evidently": monitor_drift_evidently,
            "explain_with_dtreeviz": explain_with_dtreeviz,
            # Time Series (3)
            "forecast_time_series": forecast_time_series,
            "detect_seasonality_trends": detect_seasonality_trends,
            "create_time_series_features": create_time_series_features,
            # Advanced Insights (6)
            "analyze_root_cause": analyze_root_cause,
            "detect_trends_and_seasonality": detect_trends_and_seasonality,
            "detect_anomalies_advanced": detect_anomalies_advanced,
            "perform_hypothesis_testing": perform_hypothesis_testing,
            "analyze_distribution": analyze_distribution,
            "perform_segment_analysis": perform_segment_analysis,
            # Automated Pipeline (2)
            "auto_ml_pipeline": auto_ml_pipeline,
            "auto_feature_selection": auto_feature_selection,
            # Visualization (5)
            "generate_all_plots": generate_all_plots,
            "generate_data_quality_plots": generate_data_quality_plots,
            "generate_eda_plots": generate_eda_plots,
            "generate_model_performance_plots": generate_model_performance_plots,
            "generate_feature_importance_plot": generate_feature_importance_plot,
            # Interactive Plotly Visualizations (6) - NEW PHASE 2
            "generate_interactive_scatter": generate_interactive_scatter,
            "generate_interactive_histogram": generate_interactive_histogram,
            "generate_interactive_correlation_heatmap": generate_interactive_correlation_heatmap,
            "generate_interactive_box_plots": generate_interactive_box_plots,
            "generate_interactive_time_series": generate_interactive_time_series,
            "generate_plotly_dashboard": generate_plotly_dashboard,
            # EDA Report Generation (2) - NEW PHASE 2
            "generate_ydata_profiling_report": generate_ydata_profiling_report,
            "generate_sweetviz_report": generate_sweetviz_report,
            # Code Interpreter (2) - NEW PHASE 2 - TRUE AI AGENT CAPABILITY
            "execute_python_code": execute_python_code,
            "execute_code_from_file": execute_code_from_file,
            # Cloud Data Sources (4) - NEW: BigQuery Integration
            "load_bigquery_table": load_bigquery_table,
            "write_bigquery_table": write_bigquery_table,
            "profile_bigquery_table": profile_bigquery_table,
            "query_bigquery": query_bigquery,
            # Enhanced Feature Engineering (4)
            "create_ratio_features": create_ratio_features,
            "create_statistical_features": create_statistical_features,
            "create_log_features": create_log_features,
            "create_binned_features": create_binned_features,
        }
    
    def _extract_content_text(self, content) -> str:
        """Extract text from message content (handles both string and list formats)"""
        if content is None:
            return None
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            # Content is list of objects like [{'type': 'text', 'text': '...'}]
            text_parts = []
            for item in content:
                if isinstance(item, dict) and 'text' in item:
                    text_parts.append(item['text'])
                elif isinstance(item, str):
                    text_parts.append(item)
            return ''.join(text_parts)
        return str(content)
    
    def _build_system_prompt(self) -> str:
        """Build comprehensive system prompt for the copilot."""
        return """You are an autonomous Data Science Agent. You EXECUTE tasks, not advise.

**CRITICAL: User Interface Integration & Response Formatting**
- The user interface automatically displays clickable buttons for all generated plots, reports, and outputs
- **ABSOLUTELY FORBIDDEN**: NEVER EVER mention file paths in your responses
  - ❌ NEVER write: "./outputs/...", "/outputs/...", "saved to", "output file:", "file path:"
  - ❌ NEVER use markdown code blocks for file paths (no backticks around paths)
  - ❌ NEVER say: "Output File:", "Saved to:", "File:", "Path:", "Location:"
- **WHAT TO SAY INSTEAD**:
  - ✅ "Generated an interactive correlation heatmap"
  - ✅ "Cleaned the dataset by handling missing values"
  - ✅ "Created visualizations showing the relationships"
  - ✅ "Trained multiple models and optimized the best performer"
- Users can click buttons to view outputs - you don't need to tell them where files are
- Use clean, aesthetic formatting with sections, bullets, and proper spacing

**🎨 MARKDOWN FORMATTING RULES (CRITICAL FOR CLEAN UI):**
- **INLINE CODE**: Keep inline code on the SAME LINE as surrounding text
  - ✅ CORRECT: "Extract features like `column_a`, `column_b`, and `column_c` from the dataset."
  - ❌ WRONG: "Extract features like\n`column_a`\n,\n`column_b`\n"
- **LISTS**: Write list items as complete sentences on single lines
  - ✅ CORRECT: "1. Extract `feature_1`, `feature_2`, `feature_3` from the datetime column"
  - ❌ WRONG: "1. Extract\n`feature_1`\n,\n`feature_2`\n"
- **TABLES**: Keep each cell's content on ONE line, no line breaks inside cells
  - ✅ CORRECT: "| `feature_name` | Numeric | Extracted from `source_column` |"
  - ❌ WRONG: "|\n`feature_name`\n| Numeric |\nExtracted from\n`source_column`\n|"
- **COMMAS/PUNCTUATION**: Keep punctuation attached to text, not on separate lines
  - ✅ CORRECT: "`col1`, `col2`, and `col3`"
  - ❌ WRONG: "`col1`\n,\n`col2`"
- **INLINE CODE IN SENTENCES**: Always embed column/feature names naturally in prose
  - ✅ CORRECT: "The `price` column shows correlation with `quantity` and `discount`."
  - ❌ WRONG: "The\n`price`\ncolumn shows correlation with\n`quantity`\n"
- **GENERAL**: Write flowing prose. Never put backticked terms on their own lines unless showing code blocks.

**CRITICAL: Tool Calling Format**
When you need to use a tool, respond with a JSON block like this:
```json
{
  "tool": "tool_name",
  "arguments": {
    "param1": "value1",
    "param2": 123
  }
}
```

**ONE TOOL PER RESPONSE**. After tool execution, I will send you the result and you can call the next tool.

**CRITICAL: Detect the user's intent and use the appropriate workflow.**

**🎯 INTENT DETECTION (ALWAYS DO THIS FIRST):**

**A. CODE-ONLY TASKS** - User wants to execute custom Python code:
- Keywords: "execute", "run code", "calculate", "generate data", "create plot", "custom visualization"
- No dataset file provided (file_path="dummy" or similar)
- Specific programming task (Fibonacci, custom charts, synthetic data, etc.)
- **ACTION**: Use execute_python_code tool ONCE and IMMEDIATELY return success. DO NOT run ML workflow!
- **CRITICAL**: After execute_python_code succeeds → STOP IMMEDIATELY, return summary, DO NOT call any other tools!
- **Example**: "Calculate Fibonacci" → execute_python_code → RETURN SUCCESS ✓ (NO other tools!)

**B. VISUALIZATION-ONLY REQUESTS** - User wants charts/graphs without ML:
- Keywords: "generate plots", "create dashboard", "visualize", "show graphs", "interactive charts"
- **NO keywords for ML**: No "train", "predict", "model", "classify", "forecast"
- Real dataset provided BUT only wants visualization
- **ACTION**: Generate visualizations directly, skip data cleaning/ML steps
- **Workflow**: 
  1. generate_interactive_scatter() OR generate_plotly_dashboard() 
  2. STOP - DO NOT clean data, encode, or train models!
- **Example**: "Generate interactive scatter plot for price vs quantity" → generate_interactive_scatter → DONE ✓

**C. DATA PROFILING REPORT** - User wants comprehensive data analysis report:
- Keywords: "detailed report", "comprehensive report", "data report", "profiling report", "full analysis"
- **NO specific visualization mentioned** (no "plot", "chart", "graph")
- Real dataset provided
- **ACTION**: Use generate_ydata_profiling_report tool
- **Workflow**:
  1. generate_ydata_profiling_report(file_path) 
  2. STOP - This generates a complete HTML report with all stats, correlations, distributions
- **Example**: "Generate a detailed report for this" → generate_ydata_profiling_report → DONE ✓

**D. DATA ANALYSIS WITH ML** - Full workflow with model training:
- Real dataset file path provided (CSV, Excel, etc. - NOT "dummy")
- Keywords: "train model", "predict", "classify", "build model", "forecast"
- User wants: cleaning + feature engineering + model training
- **ACTION**: Run full ML workflow (steps 1-15 below)
- **🎯 IMPORTANT**: ALWAYS generate ydata_profiling_report at the END of workflow for comprehensive final analysis
- **Example**: "Train a model to predict sales/price/target" → Full pipeline + ydata_profiling_report at end

**E. UNCLEAR/AMBIGUOUS REQUESTS** - Intent is not obvious:
- User says: "analyze", "look at", "check", "review" (without specifics)
- Could mean: visualization only OR full ML OR just exploration
- **ACTION**: ASK USER to clarify BEFORE starting work
- **Questions to ask**:
  - "Would you like me to: (1) Just create visualizations, (2) Train a predictive model, or (3) Both?"
  - "Do you need model training or just want to explore the data visually?"
- **DO NOT ASSUME** - Always ask when unclear!

**F. SIMPLE QUESTIONS** - User asks for explanation/advice:
- Keywords: "what is", "how to", "explain", "recommend"
- **ACTION**: Answer directly, no tools needed

---

**WORKFLOW FOR VISUALIZATION-ONLY (Type B above):**
- User wants: "generate plots", "create dashboard", "visualize X and Y"
- **DO NOT run full pipeline** - Skip cleaning, encoding, training!
- **Quick workflow**:
  1. If specific columns mentioned → generate_interactive_scatter(x_col, y_col)
  2. If "dashboard" mentioned → generate_plotly_dashboard(file_path, target_col)
  3. STOP - Return success
- **Example**: "Generate interactive plots for price and quantity"
  → generate_interactive_scatter(x_col="price", y_col="quantity") → DONE ✓

**📊 COLUMN SELECTION FOR VAGUE REQUESTS:**
When user doesn't specify columns (e.g., "plot a scatter" without mentioning X/Y):

1. **Analyze the dataset structure and domain**:
   - Inspect column names, types, and value ranges
   - Identify patterns: spatial coordinates (lat/lon, x/y), temporal data (dates, timestamps), 
     categorical hierarchies, numerical measurements, identifiers
   - Infer domain from filename/columns (geographic, financial, health, retail, etc.)
   
2. **Apply intelligent selection strategies**:
   
   **For Scatter Plots** - Choose variables with meaningful relationships:
   - Geographic data: Pair coordinate columns (latitude+longitude, x+y coordinates)
   - Price/size relationships: Pair cost with quantity/area/volume metrics
   - Performance metrics: Pair effort/input with outcome/output variables
   - Temporal relationships: Pair time with trend variables
   - Categorical vs numeric: Use most important numeric split by key category
   
   **For Histograms** - Select the primary measure of interest:
   - Target variable (if identified): The variable being predicted/analyzed
   - Main metric: Revenue, score, magnitude, count, amount (key business/scientific measure)
   - Distribution of interest: Variable with expected patterns (age, income, frequency)
   - First numeric column with meaningful range (avoid IDs, binary flags)
   
   **For Box Plots** - Show distribution comparisons:
   - Numeric variable grouped by categorical (e.g., price by category, score by region)
   - Multiple related numeric variables side-by-side
   
   **For Time Series** - Identify temporal patterns:
   - Date/datetime column + primary metric to track over time
   - Multiple metrics over time if related (sales, costs, profit)
   
   **For Heatmaps** - No column choice needed (shows all numeric correlations)
   
3. **Selection principles** (no dataset-specific bias):
   - Avoid ID columns, constants, or binary flags for visualizations
   - Prefer columns with high variance and meaningful ranges
   - Choose natural pairs (coordinates, input-output, cause-effect)
   - Select variables that answer implicit questions about the data
   - When uncertain, pick columns that reveal the most information
   
4. **ALWAYS EXPLAIN YOUR REASONING** in the final summary:
   - State WHAT columns you chose
   - Explain WHY those columns (their relationship/significance)
   - Describe WHAT INSIGHTS the visualization reveals
   
   ✅ Good explanation:
   "I created a scatter plot of [Column A] vs [Column B] because they represent [relationship type].
   This visualization reveals [pattern/insight]. For the histogram, I chose [Column C] as it's 
   the [primary metric/target variable], showing [distribution pattern]."
   
   ❌ Bad explanation:
   "Scatter plot created" (no reasoning about column selection)

**TRANSPARENCY RULE**: Justify every column choice with domain-agnostic reasoning based on data 
structure, variable relationships, and expected insights - not hardcoded domain assumptions.

**WORKFLOW FOR FULL ML ANALYSIS (Type C above):**
- User wants: model training, prediction, classification
- Execute steps IN ORDER (1 → 2 → 3 → ... → 15)
- Each step runs ONCE (unless explicitly noted like "call for each datetime column")
- After step completes successfully (✓ Completed) → IMMEDIATELY move to NEXT step
- DO NOT repeat steps, DO NOT go backwards, DO NOT skip steps (unless optional)
- Track your progress: "Completed steps 1-8, now executing step 9..."

**FULL ML WORKFLOW (Execute ALL steps - DO NOT SKIP):**
1. profile_dataset(file_path) - ONCE ONLY
2. detect_data_quality_issues(file_path) - ONCE ONLY
3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
4. clean_missing_values(file_path=file_path, strategy="auto", output_path="./outputs/data/cleaned.csv")
5. handle_outliers(file_path=cleaned, method="clip", columns=["all"], output_path="./outputs/data/no_outliers.csv")
6. force_numeric_conversion(file_path=latest, columns=["all"], output_path="./outputs/data/numeric.csv", errors="coerce")
7. **IF DATETIME COLUMNS EXIST**: create_time_features(file_path=latest, date_col="<column_name>", output_path="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
8. encode_categorical(file_path=latest, method="auto", output_path="./outputs/data/encoded.csv")
9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
    - AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
    - It handles raw data directly (categoricals, missing values) but we clean first for best results.
    - Fallback: train_baseline_models(encoded, target_col, task_type="auto") if AutoGluon unavailable.
    - For multi-label prediction: train_multilabel_autogluon(file_path, target_cols=["col1","col2"])
    - Post-training: optimize_autogluon_model(model_path, operation="refit_full|distill|calibrate_threshold|deploy_optimize")
    - Model inspection: analyze_autogluon_model(model_path, operation="summary|transform_features|info")
    - Add more models: extend_autogluon_training(model_path, operation="fit_extra")
    - For time series: forecast_with_autogluon (supports covariates, holidays, model selection)
    - TS backtesting: backtest_timeseries(file_path, target_col, time_col, num_val_windows=3)
    - TS analysis: analyze_timeseries_model(model_path, data_path, time_col, operation="plot|feature_importance")
10b. **ALWAYS AFTER MODEL TRAINING**: generate_ydata_profiling_report(encoded, output_path="./outputs/reports/ydata_profile.html") - Comprehensive data analysis report
11. **HYPERPARAMETER TUNING (⚠️ ONLY WHEN EXPLICITLY REQUESTED)**:
    - ⚠️ **CRITICAL WARNING**: This is EXTREMELY expensive (5-10 minutes) and resource-intensive!
    - ⚠️ **DO NOT USE UNLESS USER EXPLICITLY ASKS FOR IT**
    - **ONLY use when user says**: "tune", "optimize", "hyperparameter", "improve model", "best parameters"
    - **NEVER auto-trigger** based on scores - user must explicitly request it
    - **How**: hyperparameter_tuning(file_path=encoded, target_col=target_col, model_type="xgboost", n_trials=50)
    - **Large datasets (>100K rows)**: n_trials automatically reduced to 20 to prevent timeout
    - **Only tune the WINNING model** (don't waste time on others)
    - **Map model names**: XGBoost→"xgboost", Ridge→"ridge", Lasso→use Ridge
    - **Note**: Time features should already be extracted in step 7 (create_time_features)
12. **CROSS-VALIDATION (OPTIONAL - Production Models)**:
    - IF user says "validate", "production", "robust", "deploy" → ALWAYS cross-validate
    - IF best model score > 0.85 → Cross-validate to confirm robustness
    - ELSE → Skip (focus on improving score first with tuning)
    - **How**: perform_cross_validation(file_path=encoded, target_col=target_col, model_type="xgboost", cv_strategy="kfold", n_splits=5)
    - **Use same model type as winner** (e.g., if XGBoost won, use model_type="xgboost")
    - **Provides**: Mean CV score ± std dev (shows if model is reliable)
    - **Note**: Time features should already be extracted in step 7 (create_time_features)
13. **AFTER TRAINING/TUNING**: generate_combined_eda_report(encoded, target_col, output_dir="./outputs/reports") - Generate comprehensive HTML reports
14. **INTERACTIVE DASHBOARD (OPTIONAL - Smart Detection)**:
    - **ALWAYS generate IF user mentions**: "dashboard", "interactive", "plotly", "visualize", "charts", "graphs", "plots"
    - **ALWAYS generate IF user wants exploration**: "explore", "show me", "visualize data"
    - **SKIP IF**: User only wants model training without visualization
    - **How**: generate_plotly_dashboard(encoded, target_col, output_dir="./outputs/plots/interactive")
    - **What it creates**: Correlation heatmap, box plots, scatter plots, histograms - all interactive with zoom/pan/hover
    - **Works with ANY dataset**: Automatically detects numeric/categorical columns and generates appropriate visualizations
15. STOP when the user's request is fulfilled

**CRITICAL RULES:**

🚨 **RULE #1 - NEVER REPEAT SUCCESSFUL TOOLS**:
  - If a tool returns "✓ Completed" → MOVE TO NEXT STEP IMMEDIATELY
  - DO NOT call the same tool again (even with different arguments)
  - DO NOT call a different tool for the same task
  - Examples:
    * encode_categorical succeeded → DO NOT call execute_python_code for encoding
    * create_time_features succeeded → DO NOT call execute_python_code for time features
    * clean_missing_values succeeded → DO NOT call execute_python_code for cleaning
  - **ONLY EXCEPTION**: Different columns require separate calls (e.g., create_time_features for 'time' AND 'updated')

🚨 **RULE #2 - ENCODING IS ONE-TIME ONLY**:
  - Categorical encoding happens ONCE in step 8
  - If encode_categorical succeeds → SKIP to step 9 (generate_eda_plots)
  - DO NOT call execute_python_code with pd.get_dummies() or one-hot encoding
  - DO NOT call encode_categorical again
  - The file ./outputs/data/encoded.csv exists? → Encoding is DONE, move forward!

🚨 **RULE #3 - PREFER SPECIALIZED TOOLS**:
  - For time features → USE create_time_features(), NOT execute_python_code
  - For encoding → USE encode_categorical(), NOT execute_python_code
  - For cleaning → USE clean_missing_values(), NOT execute_python_code
  - For outliers → USE handle_outliers(), NOT execute_python_code
  - ONLY use execute_python_code when NO specialized tool exists!

- DO NOT repeat profile_dataset or detect_data_quality_issues multiple times
- DO NOT call smart_type_inference after encoding - data is ready
- **⚠️ ERROR RECOVERY - If a Tool Fails**:
  - DO NOT get stuck retrying the same failed tool
  - MOVE FORWARD to the next step (reports, visualizations, etc.)
  - Example: If hyperparameter_tuning fails → generate_combined_eda_report
  - Example: If encode_categorical fails → try force_numeric_conversion OR move to EDA
  - **NEVER let one failure stop the entire workflow!**
- **⚠️ HYPERPARAMETER TUNING - When to Use**:
  - AFTER train_baseline_models completes successfully
  - ONLY tune the BEST performing model (highest score)
  - DO NOT tune all 6 models (waste of time!)
  - Tune IF: user wants "optimize"/"improve" OR best score < 0.90
  - Skip IF: best score > 0.95 (already excellent)
  - **How to call**: hyperparameter_tuning(file_path, target_col, model_type="xgboost", n_trials=50)
  - **Model types**: "xgboost", "random_forest", "ridge", "logistic"
  - **Example**: If XGBoost wins → hyperparameter_tuning(..., model_type="xgboost")
- **⚠️ CROSS-VALIDATION - When to Use**:
  - AFTER hyperparameter_tuning (or if user explicitly requests validation)
  - Use to confirm model robustness with confidence intervals
  - IF best score > 0.85 → Cross-validate to ensure consistency
  - IF user says "validate", "production", "deploy" → ALWAYS cross-validate
  - **How to call**: perform_cross_validation(file_path, target_col, model_type="xgboost", cv_strategy="kfold", n_splits=5)
  - **Use same model_type as winner** (e.g., XGBoost→"xgboost", RandomForest→"random_forest")
  - **Returns**: Mean score ± std dev across folds (e.g., "0.92 ± 0.03" means reliable)
- **ALWAYS generate EDA reports after training/tuning** using generate_combined_eda_report
- **⭐ INTERACTIVE DASHBOARD - When to Generate**:
  - **ALWAYS IF user says**: "dashboard", "interactive", "plotly", "visualize", "charts", "graphs", "show plots", "explore data"
  - **ALWAYS IF analysis/exploration request**: "analyze dataset", "show insights", "explore patterns"
  - **SKIP IF**: User ONLY wants model training (e.g., "just train model", "only predict")
  - **Tool**: generate_plotly_dashboard(encoded, target_col, output_dir="./outputs/plots/interactive")
  - **Works with ANY dataset**: Auto-detects columns and generates appropriate visualizations
- **ONLY train models when user explicitly asks with keywords**: "train", "predict", "model", "classification", "regression", "forecast", "build a model"
- **For analysis/exploration requests ONLY**: Stop after EDA plots/dashboard - DO NOT train models
- **Read user intent carefully**: "analyze" ≠ "train", "show insights" ≠ "predict"
- **When target column is unclear**: Ask user before training

**🎯 CRITICAL EXAMPLES - DETECT INTENT CORRECTLY:**

**Type B (Visualization-Only) - NO ML WORKFLOW:**
- ✅ "Generate interactive plots for price and quantity"
  → generate_interactive_scatter(x_col="price", y_col="quantity") → STOP
- ✅ "Create a dashboard showing correlations"
  → generate_plotly_dashboard(file_path) → STOP
- ✅ "Visualize the distribution of revenue"
  → generate_interactive_histogram(column="revenue") → STOP
- ✅ "Show me graphs of sales over time"
  → generate_interactive_time_series() → STOP

**Type C (Full ML) - RUN COMPLETE WORKFLOW:**
- ✅ "Train a model to predict house prices"
  → Full pipeline (steps 1-15)
- ✅ "Build a classifier for customer churn"
  → Full pipeline (steps 1-15)
- ✅ "Analyze data and train model to forecast revenue"
  → Full pipeline (steps 1-15)

**Type D (Unclear) - ASK USER:**
- ❓ "Analyze this dataset"
  → ASK: "Would you like me to (1) Create visualizations, (2) Train a predictive model, or (3) Both?"
- ❓ "Look at this CSV file"
  → ASK: "What would you like me to do? Visualize data or build a model?"
- ❓ "Check out my data"
  → ASK: "Do you want to explore the data visually or train a forecasting model?"

**⚠️ COMMON MISTAKES - AVOID THESE:**
- ❌ User says "generate plots" → Agent runs full ML workflow (WRONG!)
- ❌ User says "visualize" → Agent cleans data, encodes, trains models (WRONG!)
- ❌ User says "analyze" → Agent assumes ML training (WRONG - ask first!)
- ✅ User says "generate plots" → Agent creates plots and STOPS (CORRECT!)
- ✅ User says "train model" → Agent runs full pipeline (CORRECT!)

⭐ **CODE INTERPRETER - HOW TO USE:**

**For CODE-ONLY Tasks (Type A):**
1. User asks to "execute code", "calculate", "generate data", "create custom plot"
2. Call execute_python_code with the full Python code
3. STOP after code executes - DO NOT run ML workflow!
4. Example:
   ```
   execute_python_code(
       code='''
import numpy as np
# Calculate fibonacci
def fib(n):
    a, b = 0, 1
    for _ in range(n):
        print(a)
        a, b = b, a+b
fib(20)
       ''',
       working_directory="./outputs/code"
   )
   # Then STOP - task complete!
   ```

**For Data Analysis Workflow (Type B):**
Use specialized tools FIRST. Only use execute_python_code for:
1. **Custom Visualizations**: Specific plot types (dropdown filters, custom buttons, animated charts)
2. **Domain-Specific Calculations**: Custom business metrics, specialized formulas
3. **Custom Data Transformations**: Unique reshaping not covered by tools
4. **Interactive Widgets**: Plotly dropdowns, sliders, buttons

**⚠️ DO NOT USE execute_python_code FOR:**
- ❌ Time feature extraction → USE create_time_features() tool
- ❌ Categorical encoding → USE encode_categorical() tool
- ❌ Missing values → USE clean_missing_values() tool
- ❌ Outliers → USE handle_outliers() tool
- ❌ Standard EDA plots → USE generate_eda_plots() or generate_plotly_dashboard()
- ❌ Model training → USE train_with_autogluon() (preferred) or train_baseline_models()
- ❌ Model optimization → USE optimize_autogluon_model() (refit, distill, deploy)
- ❌ Time series forecasting → USE forecast_with_autogluon() (supports covariates, holidays)
- ❌ Time series backtesting → USE backtest_timeseries()
- ❌ Multi-label prediction → USE train_multilabel_autogluon()
- ❌ Tasks with dedicated tools → USE THE TOOL, NOT custom code!

**Rule of Thumb:**
- CODE-ONLY task? → execute_python_code ONCE → STOP
- Data analysis task? → Use specialized tools, execute_python_code only for custom needs
- If a specialized tool exists → USE THE TOOL, not custom code

**KEY TOOLS (77 total available via function calling):**
- force_numeric_conversion: Converts string columns to numeric (auto-detects, skips text)
- clean_missing_values: "auto" mode supported
- encode_categorical: one-hot/target/frequency encoding
- **⭐ train_with_autogluon**: AutoML - trains 10+ models with auto ensembling (PREFERRED)
- forecast_with_autogluon: Time series forecasting with AutoGluon (supports covariates, holidays, model selection)
- optimize_autogluon_model: Post-training optimization (refit_full, distill, calibrate_threshold, deploy_optimize, delete_models)
- analyze_autogluon_model: Model inspection (summary, transform_features, info)
- extend_autogluon_training: Add models incrementally (fit_extra, fit_weighted_ensemble)
- train_multilabel_autogluon: Multi-label prediction (multiple target columns)
- backtest_timeseries: Time series backtesting with multiple validation windows
- analyze_timeseries_model: TS model analysis (feature_importance, plot, make_future_dataframe)
- train_baseline_models: Fallback - trains 4 basic models
- **⭐ execute_python_code**: Write and run custom Python code for ANY task not covered by tools (TRUE AI AGENT capability)
- **execute_code_from_file**: Run existing Python scripts
- Advanced: hyperparameter_tuning, perform_eda_analysis, handle_imbalanced_data, perform_feature_scaling, detect_anomalies, detect_and_handle_multicollinearity, auto_feature_engineering, forecast_time_series, explain_predictions, generate_business_insights, perform_topic_modeling, extract_image_features, monitor_model_drift
- NEW Advanced Insights: analyze_root_cause, detect_trends_and_seasonality, detect_anomalies_advanced, perform_hypothesis_testing, analyze_distribution, perform_segment_analysis
- NEW Automation: auto_ml_pipeline (zero-config full pipeline), auto_feature_selection
- NEW Visualization: generate_all_plots, generate_data_quality_plots, generate_eda_plots, generate_model_performance_plots, generate_feature_importance_plot
- NEW Interactive Plotly Visualizations: generate_interactive_scatter, generate_interactive_histogram, generate_interactive_correlation_heatmap, generate_interactive_box_plots, generate_interactive_time_series, generate_plotly_dashboard (interactive web-based plots with zoom/pan/hover)
- NEW EDA Report Generation: generate_ydata_profiling_report (comprehensive detailed analysis with full statistics, distributions, correlations, and data quality insights)
- NEW Enhanced Feature Engineering: create_ratio_features, create_statistical_features, create_log_features, create_binned_features

**RULES:**
✅ **DETECT INTENT FIRST**: Code-only (Type A), Visualization-only (Type B), Full ML (Type C), or Unclear (Type D)?
✅ **ASK BEFORE ACTING** if user intent is ambiguous (Type D)
✅ **VISUALIZATION-ONLY**: If user just wants plots → generate_interactive_scatter OR generate_plotly_dashboard → STOP
✅ **CODE-ONLY Tasks**: execute_python_code → STOP (no ML workflow!)
✅ **FULL ML ONLY**: If user wants model training → Run complete workflow (steps 1-15)
✅ Use OUTPUT of each tool as INPUT to next
✅ Save to ./outputs/data/
✅ **CRITICAL ERROR RECOVERY - HIGHEST PRIORITY:**
   - When you see "💡 HINT: Did you mean 'X'?" → IMMEDIATELY retry with 'X'
   - When tool returns {"suggestion": "Did you mean: X?"} → Extract X and retry
   - Example: train_baseline_models fails with hint "Did you mean 'mag'?" 
     → Your NEXT call MUST be: train_baseline_models(..., target_col="mag")
   - NO OTHER CALLS until you retry with corrected parameter
✅ **READ ERROR MESSAGES CAREFULLY** - Extract actual column names from errors
✅ **When training fails with "Column X not found"**: 
   - Look for "Available columns:" in error message
   - Look for suggestion in tool_result["suggestion"]
   - Use the EXACT suggested column name from the error
   - Column names may be abbreviated or different from user input
   - Retry IMMEDIATELY with correct column name (NO OTHER TOOLS FIRST)
✅ **When file not found**: Check previous step - if it failed, don't continue with that file
✅ **ASK USER for target column if unclear** - Don't guess!
✅ **STOP cascading errors**: If a file creation step fails, don't try to use that file in next steps
✅ When tool fails → analyze error → fix the specific issue → RETRY THAT SAME TOOL (max 1 retry per step)
❌ NO recommendations without action
❌ NO stopping after detecting issues
❌ NO repeating failed file paths - if file wasn't created, use previous working file
❌ NO repeating the same error twice - learn from error messages
❌ NO calling different tools when one fails - RETRY the failed tool with corrections first
❌ NO training models when user only wants analysis/exploration
❌ NO assuming column names - read error messages for actual names
❌ NO XML-style function syntax like <function=name />

**ERROR RECOVERY PATTERNS - FOLLOW THESE EXACTLY:**

**Pattern 1: Column Not Found**
❌ Tool fails: train_baseline_models(file_path="data.csv", target_col="target_column")
📋 Error: "Column 'target_column' not found. 💡 HINT: Did you mean 'target_col'?"
✅ Next call MUST be: train_baseline_models(file_path="data.csv", target_col="target_col")
❌ WRONG: Calling analyze_distribution or any other tool first!

**Pattern 2: File Not Found (Previous Step Failed)**
❌ Tool fails: auto_feature_engineering(...) → creates engineered_features.csv FAILED
❌ Next tool fails: train_baseline_models(file_path="engineered_features.csv") → File not found!
✅ Correct action: Use LAST SUCCESSFUL file → train_baseline_models(file_path="encoded.csv")

**Pattern 3: Missing Argument**
❌ Tool fails: "missing 1 required positional argument: 'target_col'"
✅ Next call: Include ALL required arguments

**CRITICAL RULES:**
1. If tool_result contains "suggestion", extract the suggested value and retry IMMEDIATELY
2. If you see "💡 HINT:", use that exact value in your retry
3. RETRY THE SAME TOOL with corrections before moving to different tools
4. Max 1 retry per tool - if it fails twice, move on with last successful file

**CRITICAL: Call ONE function at a time. Wait for its result before calling the next.**

**USER INTENT DETECTION:**
- Keywords for ML training: "train", "model", "predict", "classification", "regression", "forecast"
- Keywords for analysis only: "analyze", "explore", "show", "visualize", "understand", "summary"
- If ambiguous → Complete data prep, then ASK user about next steps

File chain: original → cleaned.csv → no_outliers.csv → numeric.csv → encoded.csv → models (if requested)

**FINAL SUMMARY - WHEN WORKFLOW IS COMPLETE:**
When you've finished all tool executions and are ready to return the final response, provide a comprehensive summary that includes:

1. **What was accomplished**: List all major steps completed (data cleaning, feature engineering, model training, etc.)
2. **Key findings from the data**:
   - ONLY cite statistics and numbers that appeared in ACTUAL tool results — do NOT fabricate thresholds, anomalies, or percentages
   - If no data quality issues were reported by tools, state "No significant data quality issues detected"
   - BUT DO provide DEEP interpretation of actual values: explain what real column ranges, correlations, and distributions MEAN for the user's domain
   - Derive insights from actual data: compare feature distributions, explain what strong/weak correlations imply practically, identify which features vary most and why that matters
   - What correlations were found? (report EXACT values from tool results AND explain their practical significance)
   - What were the most important features? (based on actual scores, with domain interpretation)
3. **Model performance** (if trained) - **CRITICAL: YOU MUST INCLUDE THESE METRICS**:
   - **ALWAYS extract and display** the exact metrics from tool results:
   - R² Score, RMSE, MAE from the train_with_autogluon or train_baseline_models results
   - List ALL models trained (not just the best one)
   - Example: "Trained 6 models: XGBoost (R²=0.713, RMSE=0.207), Random Forest (R²=0.685, RMSE=0.218), etc."
   - If hyperparameter tuning was done, show before/after comparison
   - How accurate is the model? What does the score mean in practical terms?
   - Were there any challenges (imbalanced data, multicollinearity, etc.)?
4. **Recommendations** (grounded in data — recommend based on what the tools found, not hypothetical scenarios):
   - Is the model ready for use?
   - What could improve performance further?
   - Align recommendations with the user's stated goal (e.g., if the user said "energy optimization", recommend optimization-relevant next steps, NOT generic survival analysis)
5. **Generated artifacts**: Mention reports, plots, and visualizations (but DON'T include file paths - the UI shows buttons automatically)

Example final response:
"I've completed the full machine learning workflow for [TARGET] prediction:

**Data Preparation:**
- Cleaned [N] records from the dataset
- Removed [N] columns with >50% missing values
- Extracted time-based features (`year`, `month`, `day`, `hour`) from datetime columns
- Encoded categorical variables using appropriate methods

**Key Findings:**
- [Feature A] shows strong correlation with the target variable
- Identified [N] distinct patterns/clusters in the data
- Most records fall within [specific range or category]

**Model Performance:**
- Best model: [Model Name]
- R² Score: [X.XX] (explains [X]% of target variance) OR Accuracy: [X]% for classification
- RMSE/MAE: [X.XX] (prediction error margin)
- Cross-validation: [X.XX] ± [X.XX] (consistent performance across folds)

After hyperparameter tuning, improved [metric] from [X] to [Y].

**Recommendation:**
The model shows [good/moderate] predictive power. Consider:
- Adding more relevant features if available
- Trying ensemble methods to boost performance
- Collecting more data for underrepresented categories

All visualizations, reports, and the trained model are available via the buttons above."

You are a DOER. Complete workflows based on user intent."""
    
    def _initialize_specialist_agents(self) -> Dict[str, Dict]:
        """Initialize specialist agent configurations with focused system prompts."""
        return {
            "eda_agent": {
                "name": "EDA Specialist",
                "emoji": "🔬",
                "description": "Explore and understand data patterns, relationships, correlations, and distributions. Answer questions about how variables relate, change together, or affect each other. Analyze data quality, detect outliers and anomalies. Generate descriptive statistics, correlation matrices, scatter plots, histograms, box plots, and distribution visualizations to reveal insights.",
                "system_prompt": """You are the EDA Specialist Agent - an expert in exploratory data analysis.

**Your Expertise:**
- Data profiling and statistical summaries
- Data quality assessment and anomaly detection
- Correlation analysis and feature relationships
- Distribution analysis and outlier detection
- Missing data patterns and strategies

**Your Tools (13 EDA-focused):**
- profile_dataset, detect_data_quality_issues, analyze_correlations
- get_smart_summary, detect_anomalies, perform_statistical_tests
- perform_eda_analysis, generate_ydata_profiling_report
- profile_bigquery_table, query_bigquery

**Your Approach:**
1. Always start with comprehensive data profiling
2. Identify quality issues before recommending fixes
3. Generate visualizations to reveal patterns
4. Provide actionable insights about data characteristics
5. Recommend next steps for data preparation

You work collaboratively with other specialists and hand off cleaned data to preprocessing and modeling agents.""",
                "tool_keywords": ["profile", "eda", "quality", "correlat", "anomal", "statistic", "distribution", "explore", "understand", "detect", "outlier"]
            },
            
            "modeling_agent": {
                "name": "ML Modeling Specialist",
                "emoji": "🤖",
                "description": "Build and train predictive machine learning models to forecast outcomes, classify categories, predict future values, or forecast time series. Perform supervised learning tasks including regression, classification, and time series forecasting. Train models using AutoGluon AutoML (preferred) or baseline models, optimize hyperparameters, conduct cross-validation, and evaluate model performance.",
                "system_prompt": """You are the ML Modeling Specialist Agent - an expert in machine learning powered by AutoGluon AutoML.

**Your Expertise:**
- AutoML with AutoGluon (preferred for best results)
- Model selection and baseline training
- Hyperparameter tuning and optimization
- Ensemble methods and model stacking
- Time series forecasting
- Cross-validation strategies
- Model evaluation and performance metrics

**CRITICAL: Target Column Validation**
BEFORE calling any training tools, you MUST:
1. Use profile_dataset to see actual column names
2. Verify the target column exists in the dataset
3. NEVER hallucinate or guess column names
4. If target column was provided or inferred, proceed with modeling
5. Only if NO target is available: analyze correlations to find best candidate

**Your Tools (8 modeling-focused):**
- train_with_autogluon (PREFERRED - AutoML with 10+ models, auto ensembling, handles raw data)
- predict_with_autogluon (predictions with trained AutoGluon model)
- forecast_with_autogluon (time series forecasting with AutoGluon - better than Prophet/ARIMA)
- train_baseline_models (fallback - trains 4 basic models)
- hyperparameter_tuning, perform_cross_validation
- generate_model_report, detect_model_issues

**TOOL PRIORITY (use in this order):**
| Task | Use This Tool | NOT This |
|------|--------------|----------|
| Classification/Regression | train_with_autogluon | train_baseline_models |
| Time Series Forecasting | forecast_with_autogluon | forecast_time_series |
| Predictions on new data | predict_with_autogluon | execute_python_code |
| Quick baseline check | train_baseline_models | execute_python_code |

**AutoGluon Advantages (explain to user):**
- Trains 10+ models automatically (vs 4 in baseline)
- Auto ensembles with multi-layer stacking
- Handles categorical features directly (no manual encoding needed)
- Handles missing values automatically (no manual imputation needed)
- Time-bounded training (won't run forever)
- Better accuracy than manual model selection

**Your Approach:**
1. FIRST: Profile the dataset to see actual columns (if not done)
2. VALIDATE: Confirm target column exists
3. PREFERRED: Use train_with_autogluon for best results
4. For time series data: Use forecast_with_autogluon
5. Validate with proper cross-validation if needed
6. Generate comprehensive model reports with metrics
7. Detect and address model issues (overfitting, bias, etc.)

**Common Errors to Avoid:**
❌ Calling train tools with non-existent target column
❌ Guessing column names like "Occupation", "Target", "Label"
❌ Using execute_python_code when dedicated tools exist
❌ Using train_baseline_models when train_with_autogluon is available
✅ Always verify column names from profile_dataset first
✅ Use train_with_autogluon as the DEFAULT training tool

You receive preprocessed data from data engineering agents and collaborate with visualization agents for model performance plots.""",
                "tool_keywords": ["train", "model", "hyperparameter", "ensemble", "cross-validation", "predict", "classify", "regress", "autogluon", "automl", "forecast"]
            },
            
            "viz_agent": {
                "name": "Visualization Specialist",
                "emoji": "📊",
                "description": "Create visual representations, charts, graphs, and dashboards to display data patterns. Generate interactive plots including scatter plots, line charts, bar graphs, heatmaps, time series visualizations, and statistical plots. Design comprehensive dashboards and visual reports to communicate findings clearly.",
                "system_prompt": """You are the Visualization Specialist Agent - an expert in data visualization.

**Your Expertise:**
- Interactive Plotly visualizations
- Statistical matplotlib plots
- Business intelligence dashboards
- Model performance visualizations
- Time series and geospatial plots

**Your Tools (8 visualization-focused):**
- create_plotly_scatter, create_plotly_heatmap, create_plotly_line
- create_matplotlib_plots, create_combined_plots
- generate_data_quality_plots, create_shap_plots
- generate_ydata_profiling_report (visual report)

**Your Approach:**
1. Choose the right visualization type for the data
2. Create interactive plots when possible (Plotly)
3. Use appropriate color schemes and layouts
4. Generate comprehensive visual reports
5. Highlight key insights through visual storytelling

You collaborate with all agents to visualize their outputs - EDA results, model performance, feature importance, etc.""",
                "tool_keywords": ["plot", "visualiz", "chart", "graph", "heatmap", "scatter", "dashboard", "matplotlib", "plotly", "create", "generate", "show", "display"]
            },
            
            "insight_agent": {
                "name": "Business Insights Specialist",
                "emoji": "💡",
                "description": "Interpret trained machine learning model results and translate findings into actionable business recommendations. Explain why models make certain predictions, analyze feature importance from completed models, identify root causes in model outputs, generate what-if scenarios, and provide strategic business insights based on model performance and predictions.",
                "system_prompt": """You are the Business Insights Specialist Agent - an expert in translating data into action.

**Your Expertise:**
- Root cause analysis and causal inference
- What-if scenario analysis
- Feature contribution interpretation
- Business intelligence and cohort analysis
- Actionable recommendations from ML results

**Your Tools (10 insight-focused):**
- analyze_root_cause, detect_causal_relationships
- generate_business_insights, explain_predictions
- perform_cohort_analysis, perform_rfm_analysis
- perform_customer_segmentation, analyze_customer_churn
- detect_model_issues (interpret issues)

**Your Approach:**
1. Translate statistical findings into business language
2. Identify root causes of patterns in data
3. Run what-if scenarios for decision support
4. Generate specific, actionable recommendations
5. Explain model predictions in human terms

You synthesize outputs from all other agents and provide the final business narrative.""",
                "tool_keywords": ["insight", "recommend", "explain", "interpret", "why", "cause", "what-if", "business", "segment", "churn"]
            },
            
            "preprocessing_agent": {
                "name": "Data Engineering Specialist",
                "emoji": "⚙️",
                "description": "Clean and prepare raw data for analysis by handling missing values, removing or treating outliers, encoding categorical variables, scaling numerical features, and engineering new features. Transform messy data into analysis-ready datasets through imputation, normalization, one-hot encoding, and feature creation.",
                "system_prompt": """You are the Data Engineering Specialist Agent - an expert in data preparation.

**Your Expertise:**
- Missing value handling and outlier treatment
- Feature scaling and normalization
- Imbalanced data handling (SMOTE, etc.)
- Feature engineering and transformation
- Data type conversion and encoding

**Your Tools (15 preprocessing-focused):**
- clean_missing_values, handle_outliers, handle_imbalanced_data
- perform_feature_scaling, encode_categorical
- create_interaction_features, create_aggregation_features
- auto_feature_engineering, create_time_features
- force_numeric_conversion, smart_type_inference
- merge_datasets, concat_datasets, reshape_dataset

**Your Approach:**
1. Fix data quality issues identified by EDA agent
2. Handle missing values with appropriate strategies
3. Treat outliers based on domain context
4. Engineer features to boost model performance
5. Prepare clean, model-ready data

You receive quality reports from EDA agent and deliver clean data to modeling agent.""",
                "tool_keywords": ["clean", "preprocess", "feature", "encod", "scal", "outlier", "missing", "transform", "engineer"]
            }
        }
    
    def _select_specialist_agent(self, task_description: str) -> str:
        """
        Route task to appropriate specialist agent.
        
        Uses SBERT semantic similarity if available, falls back to keyword matching.
        """
        # Try semantic routing first (more accurate)
        if self.semantic_layer.enabled:
            try:
                # Build agent descriptions for semantic matching
                agent_descriptions = {
                    agent_key: f"{agent_config['name']}: {agent_config['description']}"
                    for agent_key, agent_config in self.specialist_agents.items()
                }
                
                best_agent, confidence = self.semantic_layer.route_to_agent(
                    task_description, 
                    agent_descriptions
                )
                
                agent_config = self.specialist_agents[best_agent]
                print(f"🧠 Semantic routing → {agent_config['emoji']} {agent_config['name']} (confidence: {confidence:.2f})")
                
                return best_agent
                
            except Exception as e:
                print(f"⚠️ Semantic routing failed: {e}, falling back to keyword matching")
        
        # Fallback: Keyword-based routing (original method)
        task_lower = task_description.lower()
        
        # Score each agent based on keyword matches
        scores = {}
        for agent_key, agent_config in self.specialist_agents.items():
            score = sum(1 for keyword in agent_config["tool_keywords"] if keyword in task_lower)
            scores[agent_key] = score
        
        # Get agent with highest score
        if max(scores.values()) > 0:
            best_agent = max(scores.items(), key=lambda x: x[1])[0]
            agent_config = self.specialist_agents[best_agent]
            print(f"🔑 Keyword routing → {agent_config['emoji']} {agent_config['name']} ({scores[best_agent]} matches)")
            return best_agent
        
        # Default to EDA agent for exploratory tasks
        print("📊 Default routing → 🔬 EDA Specialist")
        return "eda_agent"
    
    def _get_agent_system_prompt(self, agent_key: str) -> str:
        """Get system prompt for specialist agent, fallback to main prompt."""
        if agent_key in self.specialist_agents:
            return self.specialist_agents[agent_key]["system_prompt"]
        return self._build_system_prompt()  # Fallback to main orchestrator prompt
    
    def _generate_cache_key(self, file_path: str, task_description: str, 
                           target_col: Optional[str] = None) -> str:
        """Generate cache key for a workflow."""
        # Include file hash to invalidate cache when data changes
        try:
            file_hash = self.cache.generate_file_hash(file_path)
        except:
            file_hash = "no_file"
        
        # Create simple string key (no kwargs unpacking to avoid dict hashing issues)
        cache_key_str = f"{file_hash}_{task_description}_{target_col or 'no_target'}"
        return self.cache._generate_key(cache_key_str)
    
    def _get_last_successful_file(self, workflow_history: List[Dict]) -> str:
        """Find the last successfully created DATA file from workflow history.
        
        Only returns actual data files (CSV, parquet, etc.), NOT visualization
        artifacts (HTML, PNG, etc.) which would break downstream tools.
        """
        data_extensions = ('.csv', '.parquet', '.xlsx', '.xls', '.json', '.tsv')
        
        # Check in reverse order for file-creating tools
        for step in reversed(workflow_history):
            result = step.get("result", {})
            if result.get("success"):
                # Check for output_path in result
                if "output_path" in result:
                    if result["output_path"].lower().endswith(data_extensions):
                        return result["output_path"]
                # For nested results
                if "result" in result and isinstance(result["result"], dict):
                    nested = result["result"]
                    if "output_path" in nested:
                        if nested["output_path"].lower().endswith(data_extensions):
                            return nested["output_path"]
                    # Check output_dir for dashboard-type tools
                    if "output_dir" in nested:
                        return nested["output_dir"]
                    # Check generated_files from execute_python_code
                    if "generated_files" in nested and nested["generated_files"]:
                        for gen_file in nested["generated_files"]:
                            if gen_file.lower().endswith(data_extensions):
                                return gen_file
                # Check tool arguments for file_path as last resort
                args = step.get("arguments", step.get("result", {}).get("arguments", {}))
                if isinstance(args, dict) and "file_path" in args:
                    import os
                    if os.path.exists(args["file_path"]):
                        return args["file_path"]
        
        # 🔥 FIX: Return the original input file instead of a phantom path
        # Try to get from session or workflow state
        if hasattr(self, 'session') and self.session and self.session.last_dataset:
            return self.session.last_dataset
        if hasattr(self, 'workflow_state') and self.workflow_state.current_file:
            return self.workflow_state.current_file
        
        # Last resort: return empty string instead of phantom file
        return "(no file found - use the original uploaded dataset)"
    
    def _determine_next_step(self, stuck_tool: str, completed_tools: List[str]) -> str:
        """Determine what the next workflow step should be based on what's stuck."""
        # Map of stuck tools to their next step
        next_steps = {
            "profile_dataset": "detect_data_quality_issues",
            "detect_data_quality_issues": "generate_data_quality_plots",
            "generate_data_quality_plots": "clean_missing_values",
            "clean_missing_values": "handle_outliers",
            "handle_outliers": "force_numeric_conversion",
            "force_numeric_conversion": "create_time_features (for datetime columns)",
            "create_time_features": "encode_categorical",
            "encode_categorical": "generate_eda_plots",
            "execute_python_code": "move forward (stop writing custom code!)",
            "generate_eda_plots": "train_baseline_models",
            "train_baseline_models": "hyperparameter_tuning OR generate_combined_eda_report",
            "hyperparameter_tuning": "perform_cross_validation OR generate_combined_eda_report",
            "perform_cross_validation": "generate_combined_eda_report",
            "generate_combined_eda_report": "generate_plotly_dashboard",
            "generate_plotly_dashboard": "WORKFLOW COMPLETE"
        }
        
        return next_steps.get(stuck_tool, "generate_eda_plots OR train_baseline_models")
    
    @staticmethod
    def _is_safe_path(path: Path, allowed_root: Path) -> bool:
        """Check if path is within an allowed root directory."""
        try:
            path.resolve().relative_to(allowed_root)
            return True
        except ValueError:
            return False
    
    # 🚀 PARALLEL EXECUTION: Helper methods for concurrent tool execution
    def _execute_tool_sync(self, tool_name: str, tool_args: Dict[str, Any]) -> Dict[str, Any]:
        """
        Synchronous wrapper for _execute_tool to be used in async context.
        This allows the parallel executor to run tools concurrently.
        """
        return self._execute_tool(tool_name, tool_args)
    
    async def _async_progress_callback(self, tool_name: str, status: str):
        """
        Async progress callback for parallel execution.
        Emits SSE events for real-time progress tracking.
        """
        if hasattr(self, 'session') and self.session:
            session_id = self.session.session_id
            if status == "started":
                print(f"🚀 [Parallel] Started: {tool_name}")
                from .api.app import progress_manager
                progress_manager.emit(session_id, {
                    'type': 'tool_executing',
                    'tool': tool_name,
                    'message': f"🚀 [Parallel] Executing: {tool_name}",
                    'parallel': True
                })
            elif status == "completed":
                print(f"✓ [Parallel] Completed: {tool_name}")
                from .api.app import progress_manager
                progress_manager.emit(session_id, {
                    'type': 'tool_completed',
                    'tool': tool_name,
                    'message': f"✓ [Parallel] Completed: {tool_name}",
                    'parallel': True
                })
            elif status.startswith("error"):
                print(f"❌ [Parallel] Failed: {tool_name}")
    
    # 🤝 INTER-AGENT COMMUNICATION: Methods for agent hand-offs
    def _should_hand_off(self, current_agent: str, completed_tools: List[str], 
                        workflow_history: List[Dict]) -> Optional[str]:
        """
        Determine if workflow should hand off to a different specialist agent.
        
        Args:
            current_agent: Currently active agent
            completed_tools: List of tool names executed so far
            workflow_history: Full workflow history
            
        Returns:
            Name of agent to hand off to, or None to stay with current agent
        """
        # Suggest next agent based on completed work
        suggested_agent = suggest_next_agent(current_agent, completed_tools)
        
        # Hand off if different from current agent
        if suggested_agent and suggested_agent != current_agent:
            return suggested_agent
        
        return None
    
    def _hand_off_to_agent(self, target_agent: str, context: Dict[str, Any], 
                          iteration: int) -> Dict[str, Any]:
        """
        Hand off workflow to a different specialist agent.
        
        Args:
            target_agent: Agent to hand off to
            context: Shared context (dataset info, completed steps, etc.)
            iteration: Current iteration number
            
        Returns:
            Dictionary with hand-off details
        """
        if target_agent not in self.specialist_agents:
            # Silently skip invalid hand-off targets (common during workflow transitions)
            return {"success": False, "error": "Invalid target agent"}
        
        # Update active agent
        old_agent = self.active_agent
        self.active_agent = target_agent
        
        agent_config = self.specialist_agents[target_agent]
        
        print(f"\n🔄 AGENT HAND-OFF (iteration {iteration})")
        print(f"   From: {old_agent}")
        print(f"   To: {target_agent} {agent_config['emoji']}")
        print(f"   Reason: {context.get('reason', 'Workflow progression')}")
        
        # Reload tools for new agent
        new_tools = self._compress_tools_registry(agent_name=target_agent)
        print(f"   📦 Reloaded {len(new_tools)} tools for {target_agent}")
        
        # Emit hand-off event
        if self.progress_callback:
            self.progress_callback({
                "type": "agent_handoff",
                "from_agent": old_agent,
                "to_agent": target_agent,
                "agent_name": agent_config['name'],
                "emoji": agent_config['emoji'],
                "reason": context.get('reason', 'Workflow progression'),
                "tools_count": len(new_tools)
            })
        
        return {
            "success": True,
            "old_agent": old_agent,
            "new_agent": target_agent,
            "new_tools": new_tools,
            "system_prompt": agent_config["system_prompt"]
        }
    
    def _get_agent_chain_suggestions(self, task_description: str, 
                                     current_agent: str) -> List[str]:
        """
        Get suggested agent chain for complex workflows.
        
        Args:
            task_description: User's task description
            current_agent: Currently active agent
            
        Returns:
            List of agent names in suggested execution order
        """
        task_lower = task_description.lower()
        
        # Detect workflow type from task description
        if "full" in task_lower or "complete" in task_lower or "end-to-end" in task_lower:
            # Full ML pipeline
            return [
                "data_quality_agent",
                "preprocessing_agent",
                "visualization_agent",
                "modeling_agent",
                "production_agent"
            ]
        elif "train" in task_lower or "model" in task_lower:
            # ML-focused workflow
            return [
                "data_quality_agent",
                "preprocessing_agent",
                "modeling_agent"
            ]
        elif "visualiz" in task_lower or "plot" in task_lower or "chart" in task_lower:
            # Visualization-focused
            return [
                "data_quality_agent",
                "visualization_agent"
            ]
        elif "clean" in task_lower or "preprocess" in task_lower:
            # Data cleaning focused
            return [
                "data_quality_agent",
                "preprocessing_agent"
            ]
        else:
            # Default single agent
            return [current_agent]
    
    def _generate_enhanced_summary(
        self, 
        workflow_history: List[Dict], 
        llm_summary: str,
        task_description: str
    ) -> Dict[str, Any]:
        """
        Generate an enhanced summary with extracted metrics, plots, and artifacts.
        
        Args:
            workflow_history: List of executed workflow steps
            llm_summary: Original summary from LLM
            task_description: User's original request
            
        Returns:
            Dictionary with enhanced summary text, metrics, and artifacts
        """
        metrics = {}
        artifacts = {
            "models": [],
            "reports": [],
            "data_files": []
        }
        plots = []
        
        # Extract information from workflow history
        for step in workflow_history:
            tool = step.get("tool", "")
            result = step.get("result", {})
            
            # Skip failed steps
            if not result.get("success", True):
                continue
            
            # Extract nested result if present
            # Tool results can be structured as:
            # 1. Direct: {"output_path": "...", "status": "success"}
            # 2. Nested: {"success": True, "result": {"output_path": "..."}}
            nested_result = result.get("result", result)
            
            # DEBUG: Log structure for visualization tools
            if "plot" in tool.lower() or "heatmap" in tool.lower() or "visualiz" in tool.lower():
                print(f"[DEBUG] Extracting plot from tool: {tool}")
                print(f"[DEBUG]   result keys: {list(result.keys())}")
                print(f"[DEBUG]   nested_result keys: {list(nested_result.keys()) if isinstance(nested_result, dict) else 'not a dict'}")
                print(f"[DEBUG]   output_path in nested_result: {'output_path' in nested_result if isinstance(nested_result, dict) else False}")
                if isinstance(nested_result, dict) and "output_path" in nested_result:
                    print(f"[DEBUG]   output_path value: {nested_result['output_path']}")
            
            # === EXTRACT MODEL METRICS ===
            if tool == "train_baseline_models":
                if "models" in nested_result:
                    models_data = nested_result["models"]
                    if models_data:
                        # Find best model (best_model is a dict with 'name', 'score', 'model_path')
                        best_model_info = nested_result.get("best_model", {})
                        if isinstance(best_model_info, dict):
                            best_model_name = best_model_info.get("name", "")
                        else:
                            best_model_name = str(best_model_info) if best_model_info else ""
                        
                        best_model_data = models_data.get(best_model_name, {})
                        # Metrics are nested inside test_metrics
                        test_metrics = best_model_data.get("test_metrics", {})
                        
                        metrics["best_model"] = {
                            "name": best_model_name,
                            "r2_score": test_metrics.get("r2", 0),
                            "rmse": test_metrics.get("rmse", 0),
                            "mae": test_metrics.get("mae", 0)
                        }
                        
                        # All models comparison - extract test_metrics for each
                        metrics["all_models"] = {}
                        for name, data in models_data.items():
                            if isinstance(data, dict) and "test_metrics" in data:
                                metrics["all_models"][name] = {
                                    "r2": data["test_metrics"].get("r2", 0),
                                    "rmse": data["test_metrics"].get("rmse", 0),
                                    "mae": data["test_metrics"].get("mae", 0)
                                }
                
                # Extract model artifacts
                if "model_path" in nested_result:
                    artifacts["models"].append({
                        "name": nested_result.get("best_model", "model"),
                        "path": nested_result["model_path"],
                        "url": f"/outputs/models/{nested_result['model_path'].split('/')[-1]}"
                    })
                
                # Extract performance plots
                if "performance_plots" in nested_result:
                    for plot_path in nested_result["performance_plots"]:
                        plots.append({
                            "title": plot_path.split("/")[-1].replace("_", " ").replace(".png", "").title(),
                            "path": plot_path,
                            "url": f"/outputs/{plot_path.replace('./outputs/', '')}"
                        })
                
                if "feature_importance_plot" in nested_result:
                    plot_path = nested_result["feature_importance_plot"]
                    plots.append({
                        "title": "Feature Importance",
                        "path": plot_path,
                        "url": f"/outputs/{plot_path.replace('./outputs/', '')}"
                    })
            
            # === HYPERPARAMETER TUNING METRICS ===
            elif tool == "hyperparameter_tuning":
                if "best_score" in nested_result:
                    metrics["tuned_model"] = {
                        "best_score": nested_result["best_score"],
                        "best_params": nested_result.get("best_params", {}),
                        "model_type": nested_result.get("model_type", "unknown")
                    }
                
                if "model_path" in nested_result:
                    artifacts["models"].append({
                        "name": f"{nested_result.get('model_type', 'model')}_tuned",
                        "path": nested_result["model_path"],
                        "url": f"/outputs/models/{nested_result['model_path'].split('/')[-1]}"
                    })
            
            # === CROSS-VALIDATION METRICS ===
            elif tool == "perform_cross_validation":
                if "mean_score" in nested_result:
                    metrics["cross_validation"] = {
                        "mean_score": nested_result["mean_score"],
                        "std_score": nested_result.get("std_score", 0),
                        "scores": nested_result.get("scores", [])
                    }
            
            # === COLLECT REPORT FILES ===
            elif "report" in tool.lower() or "dashboard" in tool.lower():
                print(f"[DEBUG] Report tool detected: {tool}")
                print(f"[DEBUG] nested_result keys: {list(nested_result.keys())}")
                # Check for both 'output_path' and 'report_path' keys
                report_path = nested_result.get("output_path") or nested_result.get("report_path")
                if report_path:
                    print(f"[DEBUG] Report path found: {report_path}")
                    # Clean path for URL — handle both ./outputs and /tmp paths
                    if report_path.startswith('./outputs/'):
                        url_path = report_path.replace('./outputs/', '')
                    elif report_path.startswith('/tmp/data_science_agent/outputs/'):
                        url_path = report_path.replace('/tmp/data_science_agent/outputs/', '')
                    elif report_path.startswith('/tmp/data_science_agent/'):
                        url_path = report_path.replace('/tmp/data_science_agent/', '')
                    else:
                        url_path = report_path.split('/')[-1]
                    artifacts["reports"].append({
                        "name": tool.replace("_", " ").title(),
                        "path": report_path,
                        "url": f"/outputs/{url_path}"
                    })
                    print(f"[DEBUG] Added to artifacts[reports], total reports: {len(artifacts['reports'])}")
                
                # 🔥 FIX: Extract individual plots from dashboard's 'plots' array
                # generate_plotly_dashboard returns {"plots": [{"output_path": ..., "status": "success"}, ...]}
                if "plots" in nested_result and isinstance(nested_result["plots"], list):
                    dashboard_output_dir = nested_result.get("output_dir", "./outputs/plots/interactive")
                    for sub_plot in nested_result["plots"]:
                        if isinstance(sub_plot, dict) and sub_plot.get("status") == "success":
                            sub_path = sub_plot.get("output_path", "")
                            if sub_path:
                                # Clean path for URL
                                if sub_path.startswith('./outputs/'):
                                    url_path = sub_path.replace('./outputs/', '')
                                elif sub_path.startswith('/tmp/data_science_agent/'):
                                    url_path = sub_path.replace('/tmp/data_science_agent/', '')
                                else:
                                    url_path = sub_path.split('/')[-1]
                                
                                plot_title = sub_path.split('/')[-1].replace('_', ' ').replace('.html', '').replace('.png', '').title()
                                plots.append({
                                    "title": plot_title,
                                    "path": sub_path,
                                    "url": f"/outputs/{url_path}",
                                    "type": "html" if sub_path.endswith(".html") else "image"
                                })
                                print(f"[DEBUG] Added dashboard sub-plot: {plot_title} -> /outputs/{url_path}")
                    
                    print(f"[DEBUG] Extracted {len(nested_result['plots'])} plots from dashboard")
                elif not report_path:
                    print(f"[DEBUG] No output_path, report_path, or plots array in nested_result for report tool")
            
            # === COLLECT VISUALIZATION FILES (interactive plots, charts, etc.) ===
            elif "plot" in tool.lower() or "visualiz" in tool.lower() or "chart" in tool.lower() or "heatmap" in tool.lower() or "scatter" in tool.lower() or "histogram" in tool.lower():
                if "output_path" in nested_result:
                    plot_path = nested_result["output_path"]
                    # Extract plot title from tool name or filename
                    plot_title = tool.replace("generate_", "").replace("interactive_", "").replace("_", " ").title()
                    if not plot_title or plot_title == "Output Path":
                        plot_title = plot_path.split("/")[-1].replace("_", " ").replace(".html", "").replace(".png", "").title()
                    
                    # Clean path for URL - handle both ./outputs and /tmp paths
                    if plot_path.startswith('./outputs/'):
                        url_path = plot_path.replace('./outputs/', '')
                    elif plot_path.startswith('/tmp/data_science_agent/outputs/'):
                        url_path = plot_path.replace('/tmp/data_science_agent/outputs/', '')
                    elif plot_path.startswith('/tmp/data_science_agent/'):
                        url_path = plot_path.replace('/tmp/data_science_agent/', '')
                    else:
                        # Just use filename for other paths
                        url_path = plot_path.split('/')[-1]
                    
                    plots.append({
                        "title": plot_title,
                        "path": plot_path,
                        "url": f"/outputs/{url_path}",
                        "type": "html" if plot_path.endswith(".html") else "image"
                    })
                    print(f"[DEBUG] Added plot to array:")
                    print(f"[DEBUG]   title: {plot_title}")
                    print(f"[DEBUG]   url: /outputs/{url_path}")
                    print(f"[DEBUG]   type: {'html' if plot_path.endswith('.html') else 'image'}")
            
            # === COLLECT PLOT FILES (from plot_paths key) ===
            if "plot_paths" in nested_result:
                for plot_path in nested_result["plot_paths"]:
                    # Clean path for URL
                    if plot_path.startswith('./outputs/'):
                        url_path = plot_path.replace('./outputs/', '')
                    elif plot_path.startswith('/tmp/data_science_agent/outputs/'):
                        url_path = plot_path.replace('/tmp/data_science_agent/outputs/', '')
                    elif plot_path.startswith('/tmp/data_science_agent/'):
                        url_path = plot_path.replace('/tmp/data_science_agent/', '')
                    else:
                        url_path = plot_path.split('/')[-1]
                    
                    plots.append({
                        "title": plot_path.split("/")[-1].replace("_", " ").replace(".png", "").replace(".html", "").title(),
                        "path": plot_path,
                        "url": f"/outputs/{url_path}",
                        "type": "html" if plot_path.endswith(".html") else "image"
                    })
            
            # === COLLECT DATA FILES ===
            if "output_path" in nested_result and nested_result["output_path"].endswith(".csv"):
                data_path = nested_result["output_path"]
                # Clean path for URL
                if data_path.startswith('./outputs/'):
                    url_path = data_path.replace('./outputs/', '')
                elif data_path.startswith('/tmp/data_science_agent/outputs/'):
                    url_path = data_path.replace('/tmp/data_science_agent/outputs/', '')
                elif data_path.startswith('/tmp/data_science_agent/'):
                    url_path = data_path.replace('/tmp/data_science_agent/', '')
                else:
                    url_path = data_path.split('/')[-1]
                
                artifacts["data_files"].append({
                    "name": data_path.split("/")[-1],
                    "path": data_path,
                    "url": f"/outputs/{url_path}"
                })
            
            # === SCAN execute_python_code OUTPUT FOR HTML FILES ===
            # When LLM uses execute_python_code to create visualizations, the HTML paths
            # are not in output_path - we need to scan the output/stdout for .html paths
            if tool == "execute_python_code":
                # Get raw output from code execution
                raw_output = str(nested_result.get("output", "")) + str(nested_result.get("stdout", "")) + str(result.get("output", ""))
                
                # Also scan the code itself for write_html() calls
                code_str = str(step.get("arguments", {}).get("code", ""))
                
                # Regex to find .html file paths in output or code
                html_paths = set()
                
                # Pattern 1: Paths in write_html() calls
                write_html_pattern = r"write_html\s*\(\s*['\"]([^'\"]+\.html)['\"]"
                html_paths.update(re.findall(write_html_pattern, code_str))
                
                # Pattern 2: Paths like /tmp/data_science_agent/*.html in output
                output_pattern = r"(/tmp/data_science_agent/[^\s'\"]+\.html)"
                html_paths.update(re.findall(output_pattern, raw_output))
                html_paths.update(re.findall(output_pattern, code_str))
                
                # Pattern 3: visualizations_created list in output (common pattern)
                viz_list_pattern = r"visualizations_created['\"]?\s*:\s*\[([^\]]+)\]"
                viz_match = re.search(viz_list_pattern, raw_output)
                if viz_match:
                    viz_paths = re.findall(r"['\"]([^'\"]+\.html)['\"]", viz_match.group(1))
                    html_paths.update(viz_paths)
                
                print(f"[DEBUG] execute_python_code artifact scanner found {len(html_paths)} HTML files: {html_paths}")
                
                # Register each found HTML as a plot
                for html_path in html_paths:
                    # Extract title from filename
                    filename = html_path.split("/")[-1]
                    plot_title = filename.replace("_", " ").replace(".html", "").title()
                    
                    # Clean path for URL
                    if html_path.startswith('/tmp/data_science_agent/'):
                        url_path = html_path.replace('/tmp/data_science_agent/', '')
                    else:
                        url_path = filename
                    
                    # Avoid duplicates
                    existing_urls = [p.get("url", "") for p in plots]
                    new_url = f"/outputs/{url_path}"
                    if new_url not in existing_urls:
                        plots.append({
                            "title": plot_title,
                            "path": html_path,
                            "url": new_url,
                            "type": "html"
                        })
                        print(f"[DEBUG] Registered plot from execute_python_code:")
                        print(f"[DEBUG]   title: {plot_title}")
                        print(f"[DEBUG]   url: {new_url}")
        
        # Build COMPREHENSIVE response template following user's format
        summary_lines = []
        
        # Start with the LLM's actual reasoning/summary
        if llm_summary and llm_summary.strip() and llm_summary != "Analysis completed":
            summary_lines.extend([
                llm_summary.strip(),
                "",
                "---",
                ""
            ])
        
        # Header
        summary_lines.extend([
            "## 📋 Workflow Summary:",
            ""
        ])
        
        # Extract task type and dataset info from workflow
        task_type = None
        n_features = 0
        n_samples = 0
        train_size = 0
        test_size = 0
        
        for step in workflow_history:
            if step.get("tool") == "train_baseline_models":
                result = step.get("result", {}).get("result", {})
                task_type = result.get("task_type", "regression")
                n_features = result.get("n_features", 0)
                train_size = result.get("train_size", 0)
                test_size = result.get("test_size", 0)
                n_samples = train_size + test_size
                break
        
        # SECTION 1: Dataset Profiling and Quality
        summary_lines.extend([
            "### 📊 Dataset Profiling and Quality:",
            ""
        ])
        
        if n_samples > 0:
            summary_lines.append(f"- The dataset contains **{n_samples:,} rows** and **{n_features} features**.")
        
        # Add workflow-specific insights
        profiling_done = any(s.get("tool") == "profile_dataset" for s in workflow_history)
        quality_checked = any(s.get("tool") == "detect_data_quality_issues" for s in workflow_history)
        
        if profiling_done:
            summary_lines.append("- Dataset profiling completed with comprehensive statistics.")
        if quality_checked:
            summary_lines.append("- Data quality issues were detected and analyzed.")
        
        summary_lines.extend(["", ""])
        
        # SECTION 2: Data Preprocessing
        summary_lines.extend([
            "### 🔧 Data Preprocessing:",
            ""
        ])
        
        preprocessing_steps = []
        for step in workflow_history:
            tool = step.get("tool", "")
            if tool == "clean_missing_values":
                preprocessing_steps.append("- Missing values were handled using automated strategies.")
            elif tool == "handle_outliers":
                preprocessing_steps.append("- Outliers were detected and handled appropriately.")
            elif tool == "encode_categorical":
                preprocessing_steps.append("- Categorical variables were encoded for ML compatibility.")
            elif tool == "feature_engineering" or tool == "enhanced_feature_engineering":
                preprocessing_steps.append("- Advanced feature engineering was performed to create predictive features.")
        
        if preprocessing_steps:
            summary_lines.extend(preprocessing_steps)
        else:
            summary_lines.append("- Data preprocessing steps were applied as needed.")
        
        summary_lines.extend(["", ""])
        
        # SECTION 3: Exploratory Data Analysis
        eda_done = any("eda" in s.get("tool", "").lower() or "plot" in s.get("tool", "").lower() 
                       for s in workflow_history)
        if eda_done:
            summary_lines.extend([
                "### 📈 Exploratory Data Analysis (EDA):",
                "",
                "- Comprehensive EDA visualizations were generated.",
                "- Correlation analysis, distribution plots, and feature relationships were examined.",
                f"- All visualizations are available in the **Visualization Gallery** below.",
                "",
                ""
            ])
        
        # SECTION 4: Model Training Results (ENHANCED - Following Template)
        if "all_models" in metrics and metrics["all_models"]:
            # Determine if classification or regression
            is_classification = task_type == "classification"
            metric_key = "f1" if is_classification else "r2"
            
            # Sort models by primary metric (descending)
            sorted_models = sorted(
                metrics["all_models"].items(),
                key=lambda x: x[1].get(metric_key, 0),
                reverse=True
            )
            
            best_model_name = sorted_models[0][0] if sorted_models else None
            best_model_score = sorted_models[0][1].get(metric_key, 0) if sorted_models else 0
            
            summary_lines.extend([
                "## 🎯 Model Training Results",
                "",
                f"**Task Type**: {task_type.title()}",
                f"**Features**: {n_features}",
                f"**Training Samples**: {train_size:,}",
                f"**Test Samples**: {test_size:,}",
                "",
                "### 📊 All Models Tested:",
                ""
            ])
            
            # Create detailed model performance table
            for model_name, model_metrics in sorted_models:
                is_best = (model_name == best_model_name)
                prefix = "🏆 " if is_best else "📊 "
                
                model_display_name = model_name.replace('_', ' ').title()
                
                if is_classification:
                    accuracy = model_metrics.get("accuracy", 0)
                    precision = model_metrics.get("precision", 0)
                    recall = model_metrics.get("recall", 0)
                    f1 = model_metrics.get("f1", 0)
                    
                    summary_lines.extend([
                        f"{prefix}**{model_display_name}**:",
                        "",
                        f"- Accuracy: **{accuracy:.4f}**",
                        f"- Precision: **{precision:.4f}**",
                        f"- Recall: **{recall:.4f}**",
                        f"- F1 Score: **{f1:.4f}**",
                        ""
                    ])
                else:  # regression
                    r2 = model_metrics.get("r2", 0)
                    rmse = model_metrics.get("rmse", 0)
                    mae = model_metrics.get("mae", 0)
                    
                    summary_lines.extend([
                        f"{prefix}**{model_display_name}**:",
                        "",
                        f"- R² Score: **{r2:.4f}**",
                        f"- RMSE: **{rmse:.4f}**",
                        f"- MAE: **{mae:.4f}**",
                        ""
                    ])
            
            # Best model highlight
            summary_lines.extend([
                f"### 🏆 Best Model: **{best_model_name.replace('_', ' ').title()}**",
                f"**Score**: {best_model_score:.4f}",
                "",
                ""
            ])
        
        # SECTION 5: Tuning Results (if hyperparameter tuning was done)
        if "tuned_model" in metrics:
            tuned = metrics["tuned_model"]
            summary_lines.extend([
                "### ⚙️ Hyperparameter Tuning:",
                "",
                f"- Model optimized: **{tuned.get('model_type', 'Unknown').replace('_', ' ').title()}**",
                f"- Best cross-validation score: **{tuned.get('best_score', 0):.4f}**",
                "- Hyperparameters were optimized using Bayesian optimization.",
                "",
                ""
            ])
        
        # SECTION 6: Cross-Validation (if performed)
        if "cross_validation" in metrics:
            cv = metrics["cross_validation"]
            summary_lines.extend([
                "### ✅ Cross-Validation:",
                "",
                f"- Mean Score: **{cv['mean_score']:.4f} ± {cv['std_score']:.4f}**",
                f"- Validated across multiple folds for robust performance estimation.",
                "",
                ""
            ])
        
        # SECTION 7: Workflow Steps Checklist
        summary_lines.extend([
            "## 🔧 Workflow Steps:",
            ""
        ])
        
        completed_steps = []
        for step in workflow_history:
            if step.get("result", {}).get("success", True):
                tool_name = step.get("tool", "")
                # Format tool name nicely
                display_name = tool_name.replace("_", " ").replace("generate ", "").title()
                completed_steps.append(f"✅ {display_name}")
        
        # Remove duplicates while preserving order
        seen = set()
        unique_steps = []
        for step in completed_steps:
            if step not in seen:
                seen.add(step)
                unique_steps.append(step)
        
        summary_lines.extend(unique_steps)
        summary_lines.extend(["", ""])
        
        # SECTION 8: Generated Visualizations
        if plots:
            summary_lines.extend([
                f"## 📊 Generated Visualizations ({len(plots)} plots)",
                "",
                "✅ **Plots are displayed in the Visualization Gallery below!**",
                "",
                "Available visualizations include:",
                ""
            ])
            
            for plot in plots[:10]:  # Show up to 10 plots
                plot_title = plot.get('title', 'Visualization')
                summary_lines.append(f"- 📈 {plot_title}")
            
            if len(plots) > 10:
                summary_lines.append(f"- ... and {len(plots) - 10} more visualizations")
            
            summary_lines.extend(["", ""])
        
        # SECTION 9: Execution Summary
        total_time = sum(s.get("duration", 0) for s in workflow_history)
        summary_lines.extend([
            "## ⏱️ Execution Summary:",
            "",
            f"- **Tools Executed**: {len(completed_steps)}",
            f"- **Iterations**: {len(workflow_history)}",
            f"- **Time**: {total_time:.1f}s",
            ""
        ])
        
        # SECTION 10: Artifacts (if any)
        if artifacts["models"]:
            summary_lines.extend([
                "### 💾 Trained Models:",
                ""
            ])
            for model in artifacts["models"]:
                summary_lines.append(f"- {model['name']}")
            summary_lines.append("")
        
        if artifacts["reports"]:
            summary_lines.extend([
                "### 📄 Generated Reports:",
                ""
            ])
            for report in artifacts["reports"]:
                summary_lines.append(f"- {report['name']}")
            summary_lines.append("")
        
        # 🔥 MERGE REPORTS INTO PLOTS ARRAY FOR FRONTEND DISPLAY
        # Frontend expects everything viewable in result.plots array
        print(f"[DEBUG] Merging {len(artifacts['reports'])} reports into plots array")
        for report in artifacts["reports"]:
            plots.append({
                "title": report["name"],
                "url": report["url"],
                "type": "html"  # Reports are typically HTML
            })
            print(f"[DEBUG] Added report to plots array: title='{report['name']}', url='{report['url']}'")
        
        print(f"[DEBUG] Final plots array length: {len(plots)}")
        
        return {
            "text": "\n".join(summary_lines),
            "metrics": metrics,
            "artifacts": artifacts,
            "plots": plots
        }
    
    @retry_with_fallback(tool_name=None)  # 🛡️ ERROR RECOVERY: Auto-retry with fallback
    def _execute_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute a single tool function.
        
        Args:
            tool_name: Name of the tool
            arguments: Tool arguments
            
        Returns:
            Tool execution result
        """
        if tool_name not in self.tool_functions:
            return {
                "error": f"Tool '{tool_name}' not found",
                "available_tools": get_all_tool_names()
            }
        
        # Validate file_path arguments are within allowed directories
        ALLOWED_ROOTS = [
            Path("/tmp/data_science_agent").resolve(),
            Path("./outputs").resolve(),
            Path("./data").resolve(),
            Path("./cache_db").resolve(),
            Path("./checkpoints").resolve(),
        ]
        for key in ("file_path", "input_path", "train_data_path", "test_data_path"):
            if key in arguments and arguments[key]:
                try:
                    resolved = Path(arguments[key]).resolve()
                    if not any(self._is_safe_path(resolved, root) for root in ALLOWED_ROOTS):
                        return {
                            "success": False, 
                            "error": f"Path '{arguments[key]}' is outside allowed directories",
                            "error_type": "SecurityError"
                        }
                except (ValueError, OSError):
                    pass  # Let the tool handle invalid paths
        
        try:
            # Report progress before executing
            if self.progress_callback:
                self.progress_callback(tool_name, "running")
            
            tool_func = self.tool_functions[tool_name]
            
            # CRITICAL: Validate column names for modeling tools (prevent hallucinations)
            if tool_name in ["train_baseline_models", "hyperparameter_tuning", "train_ensemble_models"]:
                if "target_col" in arguments and arguments["target_col"]:
                    target_col = arguments["target_col"]
                    file_path = arguments.get("file_path", "")
                    
                    # Validate target column exists in dataset
                    try:
                        import polars as pl
                        df = pl.read_csv(file_path) if file_path.endswith('.csv') else pl.read_parquet(file_path)
                        actual_columns = df.columns
                        
                        if target_col not in actual_columns:
                            print(f"⚠️  HALLUCINATED TARGET COLUMN: '{target_col}'")
                            print(f"   Actual columns: {actual_columns}")
                            
                            # 🧠 Try semantic matching first (better than fuzzy)
                            corrected_col = None
                            if self.semantic_layer.enabled:
                                try:
                                    match = self.semantic_layer.semantic_column_match(target_col, actual_columns, threshold=0.6)
                                    if match:
                                        corrected_col, confidence = match
                                        print(f"   🧠 Semantic match: {corrected_col} (confidence: {confidence:.2f})")
                                except Exception as e:
                                    print(f"   ⚠️ Semantic matching failed: {e}")
                            
                            # Fallback to fuzzy matching if semantic didn't work
                            if not corrected_col:
                                close_matches = get_close_matches(target_col, actual_columns, n=1, cutoff=0.6)
                                if close_matches:
                                    corrected_col = close_matches[0]
                                    print(f"   ✓ Fuzzy match: {corrected_col}")
                            
                            if corrected_col:
                                arguments["target_col"] = corrected_col
                            else:
                                return {
                                    "success": False,
                                    "tool": tool_name,
                                    "arguments": arguments,
                                    "error": f"Target column '{target_col}' does not exist. Available columns: {actual_columns}",
                                    "error_type": "ColumnNotFoundError",
                                    "hint": "Please specify the correct target column name from the dataset."
                                }
                    except Exception as validation_error:
                        print(f"⚠️  Could not validate target column: {validation_error}")
            
            # Fix common parameter mismatches from LLM hallucinations
            if tool_name == "generate_ydata_profiling_report":
                # LLM often calls with 'output_dir' instead of 'output_path'
                if "output_dir" in arguments and "output_path" not in arguments:
                    output_dir = arguments.pop("output_dir")
                    # Convert directory to full file path
                    arguments["output_path"] = f"{output_dir}/ydata_profile.html"
            
            # Fix target_column → target_col (common LLM mistake)
            if "target_column" in arguments and "target_col" not in arguments:
                arguments["target_col"] = arguments.pop("target_column")
                print(f"   ✓ Parameter remapped: target_column → target_col")
            
            # Fix tool-specific parameter mismatches from LLM hallucinations
            if tool_name == "train_baseline_models":
                # LLM often adds 'models' parameter that doesn't exist
                if "models" in arguments:
                    models_val = arguments.pop("models")
                    print(f"   ✓ Stripped invalid parameter 'models': {models_val}")
                    print(f"   ℹ️ train_baseline_models trains all baseline models automatically")
                # LLM often adds 'feature_columns' parameter that doesn't exist
                if "feature_columns" in arguments:
                    feature_cols = arguments.pop("feature_columns")
                    print(f"   ✓ Stripped invalid parameter 'feature_columns': {feature_cols}")
                    print(f"   ℹ️ train_baseline_models uses all numeric columns automatically")
            
            if tool_name == "generate_model_report":
                # LLM uses 'file_path' instead of 'test_data_path'
                if "file_path" in arguments and "test_data_path" not in arguments:
                    arguments["test_data_path"] = arguments.pop("file_path")
                    print(f"   ✓ Parameter remapped: file_path → test_data_path")
            
            if tool_name == "detect_model_issues":
                # LLM adds invalid split parameters
                for invalid_param in ["train_target_path", "test_target_path"]:
                    if invalid_param in arguments:
                        val = arguments.pop(invalid_param)
                        print(f"   ✓ Stripped invalid parameter '{invalid_param}': {val}")
                # Ensure train_data_path is provided
                if "train_data_path" not in arguments:
                    print(f"   ⚠️ WARNING: detect_model_issues requires 'train_data_path' parameter")
            
            if tool_name == "create_statistical_features":
                # LLM confuses this with geospatial features and adds lat_col/lon_col
                for invalid_param in ["lat_col", "lon_col", "latitude", "longitude"]:
                    if invalid_param in arguments:
                        val = arguments.pop(invalid_param)
                        print(f"   ✓ Stripped invalid parameter '{invalid_param}': {val}")
                        print(f"   ℹ️ create_statistical_features creates row-wise stats (mean, std, min, max)")

            # General parameter corrections for common LLM hallucinations
            # IMPORTANT: Do this BEFORE generic invalid-arg stripping.
            if "output" in arguments and "output_path" not in arguments:
                arguments["output_path"] = arguments.pop("output")
                print(f"   ✓ Parameter remapped: output → output_path")

            # Common file path aliases used by LLM plans/prompts
            for alias in ["data_path", "input_file", "input", "path", "latest"]:
                if alias in arguments and "file_path" not in arguments:
                    arguments["file_path"] = arguments.pop(alias)
                    print(f"   ✓ Parameter remapped: {alias} → file_path")
                    break

            # create_time_features is frequently called with alias column names
            if tool_name == "create_time_features":
                for alias in ["date_column", "datetime_column", "datetime_col", "time_col", "column", "col"]:
                    if alias in arguments and "date_col" not in arguments:
                        arguments["date_col"] = arguments.pop(alias)
                        print(f"   ✓ Parameter remapped: {alias} → date_col")
                        break

                # Auto-fill output path if omitted
                if "output_path" not in arguments:
                    arguments["output_path"] = str(self.output_base / "data" / "time_features.csv")
                    print(f"   ✓ Parameter defaulted: output_path → {arguments['output_path']}")

                # Auto-detect datetime column if date_col is missing
                if "date_col" not in arguments and arguments.get("file_path"):
                    try:
                        import polars as pl
                        fp = arguments["file_path"]
                        df = pl.read_csv(fp) if str(fp).endswith(".csv") else pl.read_parquet(fp)

                        preferred_names = [
                            "pickup_time", "pickup_datetime", "dropoff_time", "dropoff_datetime",
                            "timestamp", "datetime", "date", "time"
                        ]
                        matched = next((c for c in preferred_names if c in df.columns), None)

                        if not matched:
                            # Prefer true datetime/date dtypes first
                            dt_cols = [
                                c for c in df.columns
                                if df[c].dtype in [pl.Date, pl.Datetime]
                            ]
                            if dt_cols:
                                matched = dt_cols[0]

                        if not matched:
                            # Fallback heuristic by name
                            name_hint_cols = [
                                c for c in df.columns
                                if any(k in c.lower() for k in ["date", "time", "timestamp"])
                            ]
                            if name_hint_cols:
                                matched = name_hint_cols[0]

                        if matched:
                            arguments["date_col"] = matched
                            print(f"   ✓ Auto-detected date_col: {matched}")
                    except Exception as infer_err:
                        print(f"   ⚠️ Could not auto-detect date_col: {infer_err}")
            
            # 🔧 FIX: analyze_autogluon_model path resolution
            # The Reasoner hallucinates model paths — resolve to actual saved path
            if tool_name == "analyze_autogluon_model":
                model_path = arguments.get("model_path", "")
                if model_path and not Path(model_path).exists():
                    # Try the default AutoGluon output dir
                    fallback_paths = [
                        "./outputs/autogluon_model",
                        "outputs/autogluon_model",
                        "/tmp/data_science_agent/outputs/autogluon_model",
                    ]
                    for fallback in fallback_paths:
                        if Path(fallback).exists():
                            print(f"   ✓ Fixed model_path: '{model_path}' → '{fallback}'")
                            arguments["model_path"] = fallback
                            break
                    else:
                        print(f"   ⚠️ Model path '{model_path}' not found, no fallback available")
            
            # 🔧 FIX: predict_with_autogluon path resolution (same issue)
            if tool_name == "predict_with_autogluon":
                model_path = arguments.get("model_path", "")
                if model_path and not Path(model_path).exists():
                    fallback_paths = [
                        "./outputs/autogluon_model",
                        "outputs/autogluon_model",
                        "/tmp/data_science_agent/outputs/autogluon_model",
                    ]
                    for fallback in fallback_paths:
                        if Path(fallback).exists():
                            print(f"   ✓ Fixed model_path: '{model_path}' → '{fallback}'")
                            arguments["model_path"] = fallback
                            break
            
            # 🔥 FIX: Generic parameter sanitization - strip any unknown kwargs
            # This prevents "got an unexpected keyword argument" errors from LLM hallucinations
            import inspect
            try:
                sig = inspect.signature(tool_func)
                valid_params = set(sig.parameters.keys())
                invalid_args = [k for k in arguments.keys() if k not in valid_params]
                # Only strip if the function doesn't accept **kwargs
                has_var_keyword = any(
                    p.kind == inspect.Parameter.VAR_KEYWORD 
                    for p in sig.parameters.values()
                )
                if invalid_args and not has_var_keyword:
                    for invalid_param in invalid_args:
                        val = arguments.pop(invalid_param)
                        print(f"   ✓ Stripped hallucinated parameter '{invalid_param}': {val}")
                    print(f"   ℹ️ Valid parameters for {tool_name}: {list(valid_params)}")
            except (ValueError, TypeError):
                pass  # Can't inspect, skip validation
            
            # Fix "None" string being passed as actual None
            for key, value in list(arguments.items()):
                if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
                    arguments[key] = None
            
            # Log final parameters before execution
            print(f"   📋 Final parameters: {list(arguments.keys())}")
            
            result = tool_func(**arguments)
            
            # Check if tool itself returned an error (some tools return dict with 'status': 'error')
            if isinstance(result, dict) and result.get("status") == "error":
                tool_result = {
                    "success": False,
                    "tool": tool_name,
                    "arguments": arguments,
                    "error": result.get("message", result.get("error", "Tool returned error status")),
                    "error_type": "ToolError"
                }
                # Report failure
                if self.progress_callback:
                    self.progress_callback(tool_name, "failed")
            else:
                tool_result = {
                    "success": True,
                    "tool": tool_name,
                    "arguments": arguments,
                    "result": result
                }
                # Report success
                if self.progress_callback:
                    self.progress_callback(tool_name, "completed")
            
            # 🧠 Update session memory with tool execution
            if self.session:
                self.session.add_workflow_step(tool_name, tool_result)
            
            return tool_result
        
        except Exception as e:
            tool_result = {
                "success": False,
                "tool": tool_name,
                "arguments": arguments,
                "error": str(e),
                "error_type": type(e).__name__
            }
            
            # Still track failed tools in session
            if self.session:
                self.session.add_workflow_step(tool_name, tool_result)
            
            return tool_result
    
    def _make_json_serializable(self, obj: Any) -> Any:
        """
        Convert objects to JSON-serializable format.
        Handles matplotlib Figures, numpy arrays, infinity values, and other non-serializable types.
        """
        try:
            import numpy as np
        except ImportError:
            np = None
        
        try:
            from matplotlib.figure import Figure
        except ImportError:
            Figure = None
        
        # Handle dictionaries recursively
        if isinstance(obj, dict):
            return {k: self._make_json_serializable(v) for k, v in obj.items()}
        
        # Handle lists recursively
        elif isinstance(obj, (list, tuple)):
            return [self._make_json_serializable(item) for item in obj]
        
        # Handle infinity and NaN values (not JSON compliant)
        elif isinstance(obj, float):
            import math
            if math.isinf(obj):
                return "Infinity" if obj > 0 else "-Infinity"
            elif math.isnan(obj):
                return "NaN"
            return obj
        
        # Handle matplotlib Figure objects
        elif Figure and isinstance(obj, Figure):
            return f"<Matplotlib Figure: {id(obj)}>"
        
        # Handle numpy arrays
        elif np and isinstance(obj, np.ndarray):
            return f"<NumPy array: shape={obj.shape}>"
        
        # Handle numpy scalar types
        elif hasattr(obj, 'item') and callable(obj.item):
            try:
                return obj.item()
            except:
                return str(obj)
        
        # Handle other non-serializable objects
        elif hasattr(obj, '__dict__') and not isinstance(obj, (str, int, float, bool, type(None))):
            return f"<{obj.__class__.__name__} object>"
        
        # Already serializable
        return obj
    
    def _summarize_tool_result(self, tool_result: Dict[str, Any]) -> str:
        """
        Summarize tool result for LLM consumption.
        Extracts only essential info to avoid token bloat from large dataset outputs.
        """
        if not tool_result.get("success"):
            # Always return errors in full
            return json.dumps({
                "error": tool_result.get("error"),
                "error_type": tool_result.get("error_type")
            }, indent=2)
        
        result = tool_result.get("result", {})
        tool_name = tool_result.get("tool", "")
        
        # Create concise summary based on tool type
        summary = {"status": "success"}
        
        # Profile dataset - extract key stats only
        if tool_name == "profile_dataset":
            summary.update({
                "rows": result.get("basic_info", {}).get("num_rows"),
                "cols": result.get("basic_info", {}).get("num_columns"),
                "numeric_cols": len(result.get("numeric_columns", [])),
                "categorical_cols": len(result.get("categorical_columns", [])),
                "datetime_cols": len(result.get("datetime_columns", [])),
                "memory_mb": result.get("basic_info", {}).get("memory_usage_mb"),
                "missing_values": result.get("basic_info", {}).get("missing_values", 0)
            })
        
        # Data quality - extract issue counts
        elif tool_name == "detect_data_quality_issues":
            issues = result.get("issues", {})
            summary.update({
                "missing_values": len(issues.get("missing_values", [])),
                "duplicate_rows": result.get("duplicate_count", 0),
                "high_cardinality": len(issues.get("high_cardinality", [])),
                "constant_cols": len(issues.get("constant_columns", [])),
                "outliers": len(issues.get("outliers", [])),
                "total_issues": sum([
                    len(issues.get("missing_values", [])),
                    result.get("duplicate_count", 0),
                    len(issues.get("high_cardinality", [])),
                    len(issues.get("constant_columns", [])),
                    len(issues.get("outliers", []))
                ])
            })
        
        # File operations - just confirm path
        elif tool_name in ["clean_missing_values", "handle_outliers", "fix_data_types", 
                           "force_numeric_conversion", "encode_categorical", "smart_type_inference"]:
            summary.update({
                "output_path": result.get("output_path"),
                "message": result.get("message", ""),
                "rows_affected": result.get("rows_removed", result.get("rows_affected", 0))
            })
        
        # Training - extract model performance only
        elif tool_name == "train_baseline_models":
            models = result.get("models", {})
            best = result.get("best_model", {})
            best_model_name = best.get("name") if isinstance(best, dict) else best
            summary.update({
                "best_model": best_model_name,
                "models_trained": list(models.keys()),
                "best_score": best.get("score") if isinstance(best, dict) else None,
                "task_type": result.get("task_type")
            })
        
        # Report generation
        elif tool_name == "generate_model_report":
            summary.update({
                "report_path": result.get("report_path"),
                "message": "Report generated successfully"
            })
        
        # Default: extract message and status
        else:
            summary.update({
                "message": result.get("message", str(result)[:200]),  # Max 200 chars
                "output_path": result.get("output_path")
            })
        
        return json.dumps(summary, indent=2)
    
    def _format_tool_result(self, tool_result: Dict[str, Any]) -> str:
        """Format tool result for LLM consumption (alias for summarize)."""
        return self._summarize_tool_result(tool_result)
    
    def _compress_tools_registry(self, agent_name: str = None) -> List[Dict]:
        """
        Create compressed version of tools registry.
        Optionally filter to only include tools relevant to a specific agent.
        
        Args:
            agent_name: If provided, only include tools relevant to this agent
        
        Returns:
            Compressed and optionally filtered tools list
        """
        # If agent specified, filter tools first
        if agent_name:
            tool_names = get_tools_for_agent(agent_name)
            tools_to_compress = filter_tools_by_names(self.tools_registry, tool_names)
            print(f"🎯 Agent-specific tools: {len(tools_to_compress)} tools for {agent_name}")
        else:
            tools_to_compress = self.tools_registry
        
        compressed = []
        
        for tool in tools_to_compress:
            # Compress parameters by removing descriptions
            params = tool["function"]["parameters"]
            compressed_params = {
                "type": params["type"],
                "properties": {},
                "required": list(params.get("required", []))  # Create new list, not reference
            }
            
            # Keep only type info for properties, remove descriptions
            for prop_name, prop_value in params.get("properties", {}).items():
                compressed_prop = {}
                
                # Handle oneOf (like clean_missing_values strategy parameter)
                if "oneOf" in prop_value:
                    # Deep copy to avoid reference issues
                    compressed_prop["oneOf"] = json.loads(json.dumps(prop_value["oneOf"]))
                else:
                    compressed_prop["type"] = prop_value.get("type", "string")
                
                # Keep enum if present (important for validation)
                if "enum" in prop_value:
                    compressed_prop["enum"] = list(prop_value["enum"])  # Create new list
                
                # Keep array items type - handle both "array" and ["string", "array"]
                prop_type = prop_value.get("type")
                is_array_type = False
                
                if isinstance(prop_type, list):
                    is_array_type = "array" in prop_type
                elif prop_type == "array":
                    is_array_type = True
                
                if is_array_type and "items" in prop_value:
                    compressed_prop["items"] = {"type": prop_value["items"].get("type", "string")}
                
                compressed_params["properties"][prop_name] = compressed_prop
            
            compressed_tool = {
                "type": tool["type"],
                "function": {
                    "name": tool["function"]["name"],
                    "description": tool["function"]["description"][:100],  # Short description
                    "parameters": compressed_params
                }
            }
            compressed.append(compressed_tool)
        
        return compressed
    
    def _compress_tool_result(self, tool_name: str, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Compress tool results for small context models (production-grade approach).
        
        Keep only:
        - Status (success/failure)
        - Key metrics (5-10 most important numbers)
        - File paths created  
        - Next action hints
        
        Full results stored in workflow_history and session memory.
        LLM doesn't need verbose output - only decision-making info.
        
        Args:
            tool_name: Name of the tool executed
            result: Full tool result dict
            
        Returns:
            Compressed result dict (typically 100-500 tokens vs 5K-10K)
        """
        try:
            if not result.get("success", True):
                # Keep full error info (critical for debugging)
                return result
            
            compressed = {
                "success": True,
                "tool": tool_name
            }
            
            # Tool-specific compression rules
            if tool_name == "profile_dataset":
                # Compressed but preserves actual data values to prevent hallucination
                r = result.get("result", {})
                shape = r.get("shape", {})
                mem = r.get("memory_usage", {})
                col_types = r.get("column_types", {})
                columns_info = r.get("columns", {})
                
                # Build per-column stats summary (min/max/mean/median for numeric)
                column_stats = {}
                for col_name, col_info in columns_info.items():
                    stats = {"dtype": col_info.get("dtype", "unknown")}
                    if col_info.get("mean") is not None:
                        stats["min"] = col_info.get("min")
                        stats["max"] = col_info.get("max")
                        stats["mean"] = round(col_info["mean"], 4) if col_info["mean"] is not None else None
                        stats["median"] = round(col_info["median"], 4) if col_info.get("median") is not None else None
                    stats["null_pct"] = col_info.get("null_percentage", 0)
                    stats["unique"] = col_info.get("unique_count", 0)
                    if "top_values" in col_info:
                        stats["top_values"] = col_info["top_values"][:3]
                    column_stats[col_name] = stats
                
                compressed["summary"] = {
                    "rows": shape.get("rows"),
                    "cols": shape.get("columns"),
                    "missing_pct": r.get("overall_stats", {}).get("null_percentage", 0),
                    "duplicate_rows": r.get("overall_stats", {}).get("duplicate_rows", 0),
                    "numeric_cols": col_types.get("numeric", []),
                    "categorical_cols": col_types.get("categorical", []),
                    "file_size_mb": mem.get("total_mb", 0),
                    "column_stats": column_stats
                }
                compressed["next_steps"] = ["clean_missing_values", "detect_data_quality_issues"]
                
            elif tool_name == "detect_data_quality_issues":
                r = result.get("result", {})
                summary_data = r.get("summary", {})
                # Preserve actual issue details so LLM can cite real numbers
                critical_issues = r.get("critical", [])
                warning_issues = r.get("warning", [])[:10]  # Cap at 10
                info_issues = r.get("info", [])[:10]
                
                compressed["summary"] = {
                    "total_issues": summary_data.get("total_issues", 0),
                    "critical_count": summary_data.get("critical_count", 0),
                    "warning_count": summary_data.get("warning_count", 0),
                    "info_count": summary_data.get("info_count", 0),
                    "critical_issues": [{"type": i.get("type"), "column": i.get("column"), "message": i.get("message")} for i in critical_issues],
                    "warning_issues": [{"type": i.get("type"), "column": i.get("column"), "message": i.get("message"), "bounds": i.get("bounds")} for i in warning_issues],
                    "info_issues": [{"type": i.get("type"), "column": i.get("column"), "message": i.get("message")} for i in info_issues]
                }
                compressed["next_steps"] = ["clean_missing_values", "handle_outliers"]
                
            elif tool_name in ["clean_missing_values", "handle_outliers", "encode_categorical"]:
                r = result.get("result", {})
                compressed["summary"] = {
                    "output_file": r.get("output_file", r.get("output_path")),
                    "rows_processed": r.get("rows_after", r.get("num_rows")),
                    "changes_made": bool(r.get("changes", {}) or r.get("imputed_columns"))
                }
                compressed["next_steps"] = ["Use this file for next step"]
                
            elif tool_name == "train_baseline_models":
                r = result.get("result", {})
                models = r.get("models", [])
                if models and isinstance(models, list) and len(models) > 0:
                    # Filter to only dict entries (defensive)
                    valid_models = [m for m in models if isinstance(m, dict) and "test_score" in m]
                    if valid_models:
                        best = max(valid_models, key=lambda m: m.get("test_score", 0))
                        compressed["summary"] = {
                            "best_model": best.get("model"),
                            "test_score": round(best.get("test_score", 0), 4),
                            "train_score": round(best.get("train_score", 0), 4),
                            "task_type": r.get("task_type"),
                            "models_trained": len(valid_models)
                        }
                    else:
                        # Fallback if no valid models
                        compressed["summary"] = {
                            "task_type": r.get("task_type"),
                            "status": "No valid models trained"
                        }
                else:
                    compressed["summary"] = {"status": "No models found"}
                compressed["next_steps"] = ["hyperparameter_tuning", "generate_combined_eda_report"]
                
            elif tool_name in ["generate_plotly_dashboard", "generate_ydata_profiling_report", "generate_combined_eda_report"]:
                r = result.get("result", {})
                compressed["summary"] = {
                    "report_path": r.get("report_path", r.get("output_path")),
                    "report_type": tool_name,
                    "success": True
                }
                compressed["next_steps"] = ["Report ready for viewing"]
                
            elif tool_name == "hyperparameter_tuning":
                r = result.get("result", {})
                compressed["summary"] = {
                    "best_params": r.get("best_params", {}),
                    "best_score": round(r.get("best_score", 0), 4),
                    "model_type": r.get("model_type"),
                    "trials_completed": r.get("n_trials")
                }
                compressed["next_steps"] = ["perform_cross_validation", "generate_model_performance_plots"]
            
            # ── Feature importance / selection tools ──
            elif tool_name == "auto_feature_selection":
                r = result.get("result", {})
                # Preserve the actual feature scores — this IS the answer for "feature importance" queries
                feature_scores = r.get("feature_scores", r.get("feature_rankings", {}))
                # Keep top 15 features max
                if isinstance(feature_scores, dict):
                    sorted_feats = sorted(feature_scores.items(), key=lambda x: abs(float(x[1])) if x[1] is not None else 0, reverse=True)[:15]
                    feature_scores = {k: round(float(v), 4) if v is not None else 0 for k, v in sorted_feats}
                compressed["summary"] = {
                    "n_features_original": r.get("n_features_original"),
                    "n_features_selected": r.get("n_features_selected"),
                    "selected_features": r.get("selected_features", [])[:15],
                    "feature_scores": feature_scores,
                    "selection_method": r.get("selection_method"),
                    "task_type": r.get("task_type"),
                    "output_path": r.get("output_path")
                }
                compressed["next_steps"] = ["analyze_correlations", "generate_eda_plots"]
            
            elif tool_name == "analyze_correlations":
                r = result.get("result", {})
                # Preserve high correlations and target correlations — key analytical data
                high_corrs = r.get("high_correlations", [])[:10]  # Top 10 pairs
                target_corrs = r.get("target_correlations", {})
                if isinstance(target_corrs, dict) and "top_features" in target_corrs:
                    target_corrs = {
                        "target": target_corrs.get("target"),
                        "top_features": target_corrs["top_features"][:10]
                    }
                compressed["summary"] = {
                    "numeric_columns_count": len(r.get("numeric_columns", [])),
                    "high_correlations": high_corrs,
                    "target_correlations": target_corrs,
                }
                compressed["next_steps"] = ["auto_feature_selection", "generate_eda_plots"]
            
            elif tool_name in ["train_with_autogluon", "analyze_autogluon_model"]:
                r = result.get("result", {})
                # Preserve model metrics AND feature importance
                feature_importance = r.get("feature_importance", [])
                if isinstance(feature_importance, list):
                    feature_importance = feature_importance[:10]  # Top 10 features
                compressed["summary"] = {
                    "task_type": r.get("task_type"),
                    "best_model": r.get("best_model"),
                    "best_score": r.get("best_score"),
                    "eval_metric": r.get("eval_metric"),
                    "n_models_trained": r.get("n_models_trained"),
                    "feature_importance": feature_importance,
                    "model_path": r.get("model_path", r.get("output_path")),
                    "training_time_seconds": r.get("training_time_seconds")
                }
                compressed["next_steps"] = ["predict_with_autogluon", "generate_model_report"]
                
            else:
                # Generic compression: Keep only key fields
                r = result.get("result", {})
                if isinstance(r, dict):
                    # Extract key fields (common patterns)
                    key_fields = {}
                    for key in ["output_path", "output_file", "status", "message", "success"]:
                        if key in r:
                            key_fields[key] = r[key]
                    compressed["summary"] = key_fields or {"result": "completed"}
                else:
                    compressed["summary"] = {"result": str(r)[:200] if r else "completed"}
                compressed["next_steps"] = ["Continue workflow"]
            
            return compressed
            
        except Exception as e:
            # If compression fails, return minimal safe result
            print(f"⚠️  Compression failed for {tool_name}: {str(e)}")
            return {
                "success": result.get("success", True),
                "tool": tool_name,
                "summary": {"status": "completed (compression failed)"},
                "result": result.get("result", {}) if isinstance(result.get("result"), dict) else {}
            }


    def _parse_text_tool_calls(self, text_response: str) -> List[Dict[str, Any]]:
        """
        Parse tool calls from text-based LLM response (ReAct pattern).
        Supports multiple formats:
        - JSON: {"tool": "tool_name", "arguments": {...}}
        - Function: tool_name(arg1="value", arg2="value")
        - Markdown: ```json {...} ```
        """
        import re
        tool_calls = []
        
        # Pattern 1: JSON blocks (most reliable)
        json_pattern = r'```(?:json)?\s*(\{[^\`]+\})\s*```'
        json_matches = re.findall(json_pattern, text_response, re.DOTALL)
        
        for match in json_matches:
            try:
                tool_data = json.loads(match)
                if "tool" in tool_data or "function" in tool_data:
                    tool_name = tool_data.get("tool") or tool_data.get("function")
                    arguments = tool_data.get("arguments") or tool_data.get("args") or {}
                    tool_calls.append({
                        "id": f"call_{len(tool_calls)}",
                        "function": {
                            "name": tool_name,
                            "arguments": json.dumps(arguments)
                        }
                    })
            except json.JSONDecodeError:
                continue
        
        # Pattern 2: Function call format - tool_name(arg1="value", arg2=123)
        if not tool_calls:
            func_pattern = r'(\w+)\s*\((.*?)\)'
            for match in re.finditer(func_pattern, text_response):
                tool_name = match.group(1)
                args_str = match.group(2)
                
                # Check if this looks like a known tool
                if any(tool_name in tool["function"]["name"] for tool in self._compress_tools_registry()):
                    # Parse arguments
                    arguments = {}
                    arg_pattern = r'(\w+)\s*=\s*(["\']?)([^,\)]+)\2'
                    for arg_match in re.finditer(arg_pattern, args_str):
                        key = arg_match.group(1)
                        value = arg_match.group(3)
                        # Try to parse as number/bool
                        if value.lower() == "true":
                            arguments[key] = True
                        elif value.lower() == "false":
                            arguments[key] = False
                        elif value.isdigit():
                            arguments[key] = int(value)
                        else:
                            arguments[key] = value
                    
                    tool_calls.append({
                        "id": f"call_{len(tool_calls)}",
                        "function": {
                            "name": tool_name,
                            "arguments": json.dumps(arguments)
                        }
                    })
        
        return tool_calls
    
    def _convert_to_gemini_tools(self, groq_tools: List[Dict]) -> List[Dict]:
        """
        Convert Groq/OpenAI format tools to Gemini format.
        
        Groq format: {"type": "function", "function": {...}}
        Gemini format: {"name": "...", "description": "...", "parameters": {...}}
        
        Gemini requires:
        - Property types as UPPERCASE (STRING, NUMBER, BOOLEAN, ARRAY, OBJECT)
        - No "type": "object" at root parameters level
        """
        gemini_tools = []
        
        def convert_type(json_type: str) -> str:
            """Convert JSON Schema type to Gemini type."""
            type_map = {
                "string": "STRING",
                "number": "NUMBER",
                "integer": "INTEGER",
                "boolean": "BOOLEAN",
                "array": "ARRAY",
                "object": "OBJECT"
            }
            
            # Handle list of types (e.g., ["string", "array"])
            if isinstance(json_type, list):
                # Use the first type in the list, or ARRAY if array is in the list
                if "array" in json_type:
                    return "ARRAY"
                elif len(json_type) > 0:
                    return type_map.get(json_type[0], "STRING")
                else:
                    return "STRING"
            
            return type_map.get(json_type, "STRING")
        
        def convert_properties(properties: Dict) -> Dict:
            """Convert property definitions to Gemini format."""
            converted = {}
            for prop_name, prop_def in properties.items():
                new_def = {}
                
                # Handle oneOf (like clean_missing_values strategy)
                if "oneOf" in prop_def:
                    # For oneOf, just pick the first option or simplify
                    if isinstance(prop_def["oneOf"], list) and len(prop_def["oneOf"]) > 0:
                        first_option = prop_def["oneOf"][0]
                        if "type" in first_option:
                            new_def["type"] = convert_type(first_option["type"])
                        if "enum" in first_option:
                            new_def["enum"] = first_option["enum"]
                    else:
                        new_def["type"] = "STRING"
                elif "type" in prop_def:
                    prop_type = prop_def["type"]
                    
                    # Handle list of types (e.g., ["string", "array"])
                    if isinstance(prop_type, list):
                        converted_type = convert_type(prop_type)
                        new_def["type"] = converted_type
                        
                        # If it's an array type, we MUST provide items for Gemini
                        if converted_type == "ARRAY":
                            if "items" in prop_def:
                                items_type = prop_def["items"].get("type", "string")
                                new_def["items"] = {"type": convert_type(items_type)}
                            else:
                                # Default to STRING items if not specified
                                new_def["items"] = {"type": "STRING"}
                    else:
                        new_def["type"] = convert_type(prop_type)
                        
                        # Handle arrays
                        if prop_type == "array" and "items" in prop_def:
                            items_type = prop_def["items"].get("type", "string")
                            new_def["items"] = {"type": convert_type(items_type)}
                        elif prop_type == "array":
                            # Array without items specification - default to STRING
                            new_def["items"] = {"type": "STRING"}
                    
                    # Keep enum
                    if "enum" in prop_def:
                        new_def["enum"] = prop_def["enum"]
                else:
                    new_def["type"] = "STRING"
                
                # Keep description if present
                if "description" in prop_def:
                    new_def["description"] = prop_def["description"]
                
                converted[prop_name] = new_def
            
            return converted
        
        for tool in groq_tools:
            func = tool["function"]
            params = func.get("parameters", {})
            
            # Convert parameters to Gemini format
            gemini_params = {
                "type": "OBJECT",  # Gemini uses UPPERCASE
                "properties": convert_properties(params.get("properties", {})),
                "required": params.get("required", [])
            }
            
            gemini_tool = {
                "name": func["name"],
                "description": func["description"],
                "parameters": gemini_params
            }
            gemini_tools.append(gemini_tool)
        
        return gemini_tools
    
    def _update_workflow_state(self, tool_name: str, tool_result: Dict[str, Any]):
        """
        Update workflow state based on tool execution.
        This reduces the need to keep full tool results in LLM context.
        """
        if not tool_result.get("success", True):
            return  # Don't update state on failures
        
        result_data = tool_result.get("result", {})
        
        # Profile dataset
        if tool_name == "profile_dataset":
            shape = result_data.get("shape", {})
            col_types = result_data.get("column_types", {})
            overall = result_data.get("overall_stats", {})
            columns_info = result_data.get("columns", {})
            
            # Extract actual per-column stats for grounding
            column_ranges = {}
            for col_name, col_info in columns_info.items():
                if col_info.get("mean") is not None:
                    column_ranges[col_name] = {
                        "min": col_info.get("min"),
                        "max": col_info.get("max"),
                        "mean": round(col_info["mean"], 4) if col_info["mean"] is not None else None,
                        "median": round(col_info["median"], 4) if col_info.get("median") is not None else None,
                    }
            
            self.workflow_state.update_profiling({
                "num_rows": shape.get("rows"),
                "num_columns": shape.get("columns"),
                "missing_percentage": overall.get("null_percentage", 0),
                "duplicate_rows": overall.get("duplicate_rows", 0),
                "numeric_columns": col_types.get("numeric", []),
                "categorical_columns": col_types.get("categorical", []),
                "column_ranges": column_ranges
            })
        
        # Quality check
        elif tool_name == "detect_data_quality_issues":
            self.workflow_state.update_quality({
                "total_issues": result_data.get("total_issues", 0),
                "has_missing": result_data.get("has_missing", False),
                "has_outliers": result_data.get("has_outliers", False),
                "has_duplicates": result_data.get("has_duplicates", False)
            })
        
        # Cleaning tools
        elif tool_name in ["clean_missing_values", "handle_outliers", "encode_categorical"]:
            self.workflow_state.update_cleaning({
                "output_file": result_data.get("output_file") or result_data.get("output_path"),
                "rows_processed": result_data.get("rows_after") or result_data.get("num_rows"),
                "tool": tool_name
            })
        
        # Feature engineering
        elif tool_name in ["create_time_features", "create_interaction_features", "auto_feature_engineering"]:
            self.workflow_state.update_features({
                "output_file": result_data.get("output_file") or result_data.get("output_path"),
                "new_features": result_data.get("new_columns", []),
                "tool": tool_name
            })
        
        # Model training
        elif tool_name == "train_baseline_models":
            models = result_data.get("models", [])
            best_model = None
            if models and isinstance(models, list):
                valid_models = [m for m in models if isinstance(m, dict) and "test_score" in m]
                if valid_models:
                    best_model = max(valid_models, key=lambda m: m.get("test_score", 0))
            
            self.workflow_state.update_modeling({
                "best_model": best_model.get("model") if best_model else None,
                "best_score": best_model.get("test_score") if best_model else None,
                "models_trained": len(valid_models) if best_model else 0,
                "task_type": result_data.get("task_type")
            })
    
    # ═══════════════════════════════════════════════════════════════════════════
    # REASONING LOOP INFRASTRUCTURE
    # Three new methods that power the hypothesis-driven analysis mode:
    #   _llm_text_call       → Provider-agnostic text LLM call (no tool schemas)
    #   _get_tools_description → Lightweight text description of available tools
    #   _run_reasoning_loop   → The core Reason → Act → Evaluate → Loop/Stop cycle
    # ═══════════════════════════════════════════════════════════════════════════

    def _llm_text_call(self, system_prompt: str, user_prompt: str, max_tokens: int = 2048) -> str:
        """
        Simple text-only LLM call (no tool schemas).
        
        Used by Reasoner, Evaluator, and Synthesizer for lightweight
        reasoning calls. Much cheaper than full tool-calling API calls.
        
        Args:
            system_prompt: System prompt for the LLM
            user_prompt: User prompt for the LLM
            max_tokens: Maximum response tokens
            
        Returns:
            Plain text response from the LLM
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        # Rate limiting
        if self.min_api_call_interval > 0:
            time_since_last_call = time.time() - self.last_api_call_time
            if time_since_last_call < self.min_api_call_interval:
                wait_time = self.min_api_call_interval - time_since_last_call
                time.sleep(wait_time)
        
        try:
            if self.provider == "mistral":
                if hasattr(self.mistral_client, 'chat') and hasattr(self.mistral_client.chat, 'complete'):
                    response = self.mistral_client.chat.complete(
                        model=self.model,
                        messages=messages,
                        temperature=0.1,
                        max_tokens=max_tokens
                    )
                else:
                    response = self.mistral_client.chat(
                        model=self.model,
                        messages=messages,
                        temperature=0.1,
                        max_tokens=max_tokens
                    )
                self.api_calls_made += 1
                self.last_api_call_time = time.time()
                
                if hasattr(response, 'usage') and response.usage:
                    self.tokens_this_minute += response.usage.total_tokens
                
                return self._extract_content_text(response.choices[0].message.content)
            
            elif self.provider == "groq":
                response = self.groq_client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=0.1,
                    max_tokens=max_tokens
                )
                self.api_calls_made += 1
                self.last_api_call_time = time.time()
                
                if hasattr(response, 'usage') and response.usage:
                    self.tokens_this_minute += response.usage.total_tokens
                
                return self._extract_content_text(response.choices[0].message.content)
            
            elif self.provider == "gemini":
                full_prompt = f"{system_prompt}\n\n{user_prompt}"
                response = self.gemini_model.generate_content(
                    full_prompt,
                    generation_config={
                        "temperature": 0.1,
                        "max_output_tokens": max_tokens
                    }
                )
                self.api_calls_made += 1
                self.last_api_call_time = time.time()
                return response.text
            
            else:
                raise ValueError(f"Unsupported provider: {self.provider}")
                
        except Exception as e:
            error_str = str(e)
            # Handle rate limits
            if "429" in error_str or "rate_limit" in error_str.lower():
                print(f"⏳ Rate limit in reasoning call, waiting 10s...")
                time.sleep(10)
                return self._llm_text_call(system_prompt, user_prompt, max_tokens)
            raise

    def _get_tools_description(self, tool_names: Optional[List[str]] = None) -> str:
        """
        Build a lightweight text description of available tools.
        
        Used in Reasoner prompts instead of sending full JSON tool schemas.
        This is much more token-efficient than the OpenAI tools format.
        
        Args:
            tool_names: Optional list of tool names to include (None = all tools)
            
        Returns:
            Formatted text like:
                - profile_dataset(file_path): Profile a dataset to understand structure
                - analyze_correlations(file_path, target_col): Analyze column correlations
                ...
        """
        import inspect
        
        lines = []
        tool_map = self.tool_functions
        
        # Filter to specific tools if requested
        if tool_names:
            tool_map = {k: v for k, v in tool_map.items() if k in tool_names}
        
        for name, func in sorted(tool_map.items()):
            # Get function signature
            try:
                sig = inspect.signature(func)
                params = []
                for param_name, param in sig.parameters.items():
                    if param_name in ("kwargs", "args"):
                        continue
                    if param.default is inspect.Parameter.empty:
                        params.append(param_name)
                    else:
                        params.append(f"{param_name}=...")
                params_str = ", ".join(params[:5])  # Max 5 params shown
                if len(sig.parameters) > 5:
                    params_str += ", ..."
            except (ValueError, TypeError):
                params_str = "..."
            
            # Get first line of docstring
            doc = (func.__doc__ or "").strip().split("\n")[0][:100]
            
            lines.append(f"- {name}({params_str}): {doc}")
        
        return "\n".join(lines)

    def _get_relevant_tools_sbert(
        self,
        query: str,
        candidate_tools: Optional[set] = None,
        top_k: int = 20,
        threshold: float = 0.15
    ) -> set:
        """
        Use SBERT semantic similarity to rank tools by relevance to the query.
        
        Encodes the query and each tool's (name + docstring) into embeddings,
        then keeps only tools whose cosine similarity exceeds the threshold.
        Tool embeddings are lazily computed and cached for the lifetime of the
        orchestrator instance.
        
        Args:
            query: User's natural language question
            candidate_tools: Tools to score (default: all tool_functions)
            top_k: Max number of tools to return
            threshold: Minimum cosine similarity to include a tool (0.0-1.0)
            
        Returns:
            Set of tool names that are semantically relevant to the query.
            Falls back to candidate_tools unchanged if SBERT is unavailable.
        """
        if not self.semantic_layer.enabled:
            return candidate_tools or set(self.tool_functions.keys())
        
        try:
            from sklearn.metrics.pairwise import cosine_similarity as cos_sim
            import numpy as np
        except ImportError:
            return candidate_tools or set(self.tool_functions.keys())
        
        candidates = candidate_tools or set(self.tool_functions.keys())
        
        # ── Lazily build & cache tool embeddings ──
        if not hasattr(self, '_tool_embeddings_cache'):
            self._tool_embeddings_cache = {}
        
        # Compute embeddings for any tools not yet cached
        tools_needing_embed = [t for t in candidates if t not in self._tool_embeddings_cache]
        if tools_needing_embed:
            texts = []
            for name in tools_needing_embed:
                func = self.tool_functions.get(name)
                doc = (func.__doc__ or "").strip().split("\n")[0][:150] if func else ""
                texts.append(f"{name}: {doc}")
            
            try:
                embeddings = self.semantic_layer.model.encode(
                    texts, convert_to_numpy=True, show_progress_bar=False, batch_size=32
                )
                for name, emb in zip(tools_needing_embed, embeddings):
                    self._tool_embeddings_cache[name] = emb
            except Exception as e:
                print(f"⚠️ SBERT tool encoding failed: {e}, returning all candidates")
                return candidates
        
        # ── Encode the query ──
        try:
            query_emb = self.semantic_layer.model.encode(
                query, convert_to_numpy=True, show_progress_bar=False
            ).reshape(1, -1)
        except Exception as e:
            print(f"⚠️ SBERT query encoding failed: {e}")
            return candidates
        
        # ── Score each candidate tool ──
        scored = []
        for name in candidates:
            emb = self._tool_embeddings_cache.get(name)
            if emb is None:
                continue
            sim = float(cos_sim(query_emb, emb.reshape(1, -1))[0][0])
            scored.append((name, sim))
        
        # Sort descending by similarity
        scored.sort(key=lambda x: x[1], reverse=True)
        
        # Keep tools above threshold, up to top_k
        selected = {name for name, sim in scored[:top_k] if sim >= threshold}
        
        # ── Always include universally-useful core tools ──
        CORE_TOOLS = {
            "profile_dataset", "analyze_correlations", "auto_feature_selection",
            "generate_eda_plots", "clean_missing_values",
            "execute_python_code",
        }
        selected |= (CORE_TOOLS & candidates)
        
        if selected:
            # Log what SBERT chose
            top5 = scored[:5]
            print(f"   🧠 SBERT tool routing: {len(selected)}/{len(candidates)} tools selected")
            print(f"      Top-5 by similarity: {[(n, f'{s:.3f}') for n, s in top5]}")
        else:
            # Safety: if nothing passed threshold, return all candidates
            print(f"   ⚠️ SBERT: no tools above threshold {threshold}, using all {len(candidates)} candidates")
            selected = candidates
        
        return selected

    def _run_reasoning_loop(
        self,
        question: str,
        file_path: str,
        dataset_info: Dict[str, Any],
        target_col: Optional[str] = None,
        mode: str = "investigative",
        max_iterations: int = 7,
        tool_names: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Run the Reasoning Loop: Reason → Act → Evaluate → Loop/Stop → Synthesize.
        
        This is the core of the hypothesis-driven analysis mode.
        Instead of a pipeline, the agent:
        1. REASONS about what to investigate next
        2. ACTS (executes one tool)
        3. EVALUATES the result
        4. Decides to LOOP (investigate more) or STOP
        5. SYNTHESIZES all findings into a coherent answer
        
        Args:
            question: User's question or "Analyze this data"
            file_path: Path to the dataset
            dataset_info: Schema info from local extraction
            target_col: Optional target column
            mode: "investigative" or "exploratory"
            max_iterations: Max reasoning iterations (default 7)
            tool_names: Optional subset of tools to use
            
        Returns:
            Dict with status, summary, findings, workflow_history, etc.
        """
        start_time = time.time()
        
        # Initialize reasoning components (pass our LLM caller)
        reasoner = Reasoner(llm_caller=self._llm_text_call)
        evaluator = Evaluator(llm_caller=self._llm_text_call)
        synthesizer = Synthesizer(llm_caller=self._llm_text_call)
        findings = FindingsAccumulator(question=question, mode=mode)
        
        # ── Intelligent tool filtering for the reasoning loop ──
        # Step 1: Hard-exclude tools that can never work in the reasoning loop
        EXCLUDED_FROM_REASONING = {
            "generate_feature_importance_plot",  # needs Dict[str, float] — Reasoner can't supply
        }
        TRAINING_TOOLS = {
            "train_with_autogluon", "train_baseline_models", "train_model",
            "hyperparameter_tuning", "predict_with_autogluon",
            "analyze_autogluon_model", "advanced_model_training",
            "neural_architecture_search"
        }
        
        # Build initial candidate pool
        effective_tool_names = set(tool_names) if tool_names else set(self.tool_functions.keys())
        effective_tool_names -= EXCLUDED_FROM_REASONING
        
        # Step 2: SBERT semantic routing — score tools against the query
        # This replaces the old keyword-only approach with real semantic understanding
        if self.semantic_layer.enabled:
            print(f"   🧠 Using SBERT semantic routing for tool selection...")
            effective_tool_names = self._get_relevant_tools_sbert(
                query=question,
                candidate_tools=effective_tool_names,
                top_k=20,
                threshold=0.15
            )
        
        # Step 3: Hard safety rail — even if SBERT scores a training tool highly,
        # block it for pure EDA queries (training wastes 120-180s for no benefit)
        question_lower = question.lower()
        explicitly_wants_training = any(kw in question_lower for kw in [
            "train", "predict", "build a model", "classification", "regression",
            "classify", "forecast", "deploy model", "autogluon"
        ])
        if not explicitly_wants_training:
            EDA_KEYWORDS = [
                "feature importance", "important features", "most important",
                "correlations", "correlation", "explore", "explain",
                "understand", "patterns", "insights", "eda", "profiling",
                "distribution", "outliers", "summary", "describe", "overview",
                "what drives", "what affects", "key factors", "top features",
                "feature ranking", "data quality", "missing values"
            ]
            is_eda_query = any(kw in question_lower for kw in EDA_KEYWORDS)
            if is_eda_query:
                removed = effective_tool_names & TRAINING_TOOLS
                if removed:
                    print(f"   🚫 EDA safety rail — removing training tools: {removed}")
                effective_tool_names -= TRAINING_TOOLS
        
        # Get tools description for the reasoner (filtered)
        tools_desc = self._get_tools_description(list(effective_tool_names))
        print(f"   📋 Reasoning loop will see {len(effective_tool_names)} tools (of {len(self.tool_functions)})")
        
        # Track for API response
        workflow_history = []
        original_data_file = file_path  # NEVER changes — always the uploaded dataset
        current_file = file_path        # Tracks the latest DATA file (csv/parquet only)
        
        # Emit mode info for UI
        if hasattr(self, 'session') and self.session:
            progress_manager.emit(self.session.session_id, {
                'type': 'reasoning_mode',
                'mode': mode,
                'message': f"🧠 Reasoning Loop activated ({mode} mode)",
                'question': question
            })
        
        print(f"\n{'='*60}")
        print(f"🧠 REASONING LOOP ({mode.upper()} mode)")
        print(f"   Question: {question}")
        print(f"   Max iterations: {max_iterations}")
        print(f"{'='*60}")
        
        # ── EXPLORATORY MODE: Generate hypotheses first ──
        if mode == "exploratory":
            print(f"\n🔬 Generating hypotheses from data profile...")
            
            # Profile the dataset first if not already done
            profile_result = self._execute_tool("profile_dataset", {"file_path": file_path})
            profile_summary = ""
            if profile_result.get("success", True):
                profile_summary = json.dumps(
                    self._compress_tool_result("profile_dataset", 
                        self._make_json_serializable(profile_result)),
                    default=str
                )[:2000]
                
                workflow_history.append({
                    "iteration": 0,
                    "tool": "profile_dataset",
                    "arguments": {"file_path": file_path},
                    "result": profile_result
                })
                self._update_workflow_state("profile_dataset", profile_result)
            
            # Generate hypotheses
            hypotheses = reasoner.generate_hypotheses(
                dataset_info=dataset_info,
                file_path=file_path,
                target_col=target_col,
                profile_summary=profile_summary
            )
            
            print(f"   Generated {len(hypotheses)} hypotheses:")
            for i, h in enumerate(hypotheses):
                text = h.get("text", str(h))
                priority = h.get("priority", 0.5)
                findings.add_hypothesis(text, priority=priority, source_iteration=0)
                print(f"   {i+1}. [{priority:.1f}] {text}")
            
            # Emit hypothesis info
            if hasattr(self, 'session') and self.session:
                progress_manager.emit(self.session.session_id, {
                    'type': 'hypotheses_generated',
                    'hypotheses': [h.get("text", str(h)) for h in hypotheses],
                    'count': len(hypotheses)
                })
        
        # ── MAIN REASONING LOOP ──
        for iteration in range(1, max_iterations + 1):
            print(f"\n── Iteration {iteration}/{max_iterations} ──")
            
            # STEP 1: REASON - What should we investigate next?
            print(f"🤔 REASON: Deciding next action...")
            
            reasoning_output = reasoner.reason(
                question=question,
                dataset_info=dataset_info,
                findings=findings,
                available_tools=tools_desc,
                file_path=current_file,
                target_col=target_col
            )
            
            print(f"   Status: {reasoning_output.status}")
            print(f"   Reasoning: {reasoning_output.reasoning}")
            
            # Check if done
            if reasoning_output.status == "done":
                print(f"✅ Reasoner says: DONE (confidence: {reasoning_output.confidence:.0%})")
                print(f"   Reason: {reasoning_output.reasoning}")
                break
            
            tool_name = reasoning_output.tool_name
            tool_args = reasoning_output.arguments
            hypothesis = reasoning_output.hypothesis
            
            if not tool_name or tool_name not in self.tool_functions:
                print(f"⚠️  Invalid tool: {tool_name}, skipping iteration")
                continue
            
            print(f"   Tool: {tool_name}")
            print(f"   Hypothesis: {hypothesis}")
            
            # Emit reasoning step for UI
            if hasattr(self, 'session') and self.session:
                progress_manager.emit(self.session.session_id, {
                    'type': 'reasoning_step',
                    'iteration': iteration,
                    'tool': tool_name,
                    'hypothesis': hypothesis,
                    'reasoning': reasoning_output.reasoning
                })
            
            # STEP 2: ACT - Execute the tool
            print(f"⚡ ACT: Executing {tool_name}...")
            
            # Emit tool execution event
            if hasattr(self, 'session') and self.session:
                progress_manager.emit(self.session.session_id, {
                    'type': 'tool_executing',
                    'tool': tool_name,
                    'message': f"🔧 Executing: {tool_name}",
                    'arguments': tool_args
                })
            
            tool_result = self._execute_tool(tool_name, tool_args)
            
            # Determine success/failure
            tool_success = tool_result.get("success", True)
            tool_error = ""
            
            # Track output file for next iteration — ONLY update for data files
            if tool_success:
                result_data = tool_result.get("result", {})
                if isinstance(result_data, dict):
                    new_file = result_data.get("output_file") or result_data.get("output_path")
                    if new_file:
                        # Only update current_file for actual data files (CSV, parquet, etc.)
                        # NOT for visualizations (HTML, PNG, JPG) or reports
                        data_extensions = ('.csv', '.parquet', '.xlsx', '.xls', '.json', '.tsv')
                        if new_file.lower().endswith(data_extensions):
                            current_file = new_file
                            print(f"   📂 Updated current data file: {new_file}")
                        else:
                            print(f"   📊 Output artifact (not updating data file): {new_file}")
                
                # Emit success
                if hasattr(self, 'session') and self.session:
                    progress_manager.emit(self.session.session_id, {
                        'type': 'tool_completed',
                        'tool': tool_name,
                        'message': f"✓ Completed: {tool_name}"
                    })
                print(f"   ✓ Tool completed successfully")
            else:
                error_msg = tool_result.get("error", "Unknown error")
                tool_error = str(error_msg)[:300]
                print(f"   ❌ Tool failed: {error_msg}")
                # Record failure so Reasoner won't retry this tool
                findings.add_failed_tool(tool_name, tool_error)
                if hasattr(self, 'session') and self.session:
                    progress_manager.emit(self.session.session_id, {
                        'type': 'tool_failed',
                        'tool': tool_name,
                        'message': f"❌ FAILED: {tool_name}",
                        'error': error_msg
                    })
            
            # Track in workflow history
            workflow_history.append({
                "iteration": iteration,
                "tool": tool_name,
                "arguments": tool_args,
                "result": tool_result
            })
            
            # Update workflow state
            self._update_workflow_state(tool_name, tool_result)
            
            # Checkpoint
            if tool_success:
                session_id = self.http_session_key or "default"
                self.recovery_manager.checkpoint_manager.save_checkpoint(
                    session_id=session_id,
                    workflow_state={
                        'iteration': iteration,
                        'workflow_history': workflow_history,
                        'current_file': file_path,
                        'task_description': question,
                        'target_col': target_col
                    },
                    last_tool=tool_name,
                    iteration=iteration
                )
            
            # STEP 3: EVALUATE - What did we learn?
            print(f"📊 EVALUATE: Interpreting results...")
            
            evaluation = evaluator.evaluate(
                question=question,
                tool_name=tool_name,
                arguments=tool_args,
                result=tool_result,
                findings=findings,
                result_compressor=lambda tn, r: self._compress_tool_result(
                    tn, self._make_json_serializable(r)
                )
            )
            
            print(f"   Interpretation: {evaluation.interpretation}")
            print(f"   Answered: {evaluation.answered} (confidence: {evaluation.confidence:.0%})")
            print(f"   Should stop: {evaluation.should_stop}")
            if evaluation.next_questions:
                print(f"   Next questions: {evaluation.next_questions}")
            
            # Build finding and add to accumulator
            compressed_result = json.dumps(
                self._compress_tool_result(tool_name, self._make_json_serializable(tool_result)),
                default=str
            )
            
            finding = evaluator.build_finding(
                iteration=iteration,
                hypothesis=hypothesis,
                tool_name=tool_name,
                arguments=tool_args,
                result_summary=compressed_result,
                evaluation=evaluation,
                success=tool_success,
                error_message=tool_error
            )
            findings.add_finding(finding)
            
            # Update hypothesis status based on evaluation results
            if hypothesis:
                if tool_success and evaluation.confidence >= 0.6:
                    findings.update_hypothesis(
                        hypothesis, "supported", evaluation.interpretation, is_supporting=True
                    )
                elif tool_success and evaluation.confidence >= 0.3:
                    findings.update_hypothesis(
                        hypothesis, "inconclusive", evaluation.interpretation, is_supporting=True
                    )
                elif not tool_success:
                    findings.update_hypothesis(
                        hypothesis, "inconclusive", f"Tool failed: {tool_error}", is_supporting=False
                    )
            
            # Emit finding for UI
            if hasattr(self, 'session') and self.session:
                progress_manager.emit(self.session.session_id, {
                    'type': 'finding_discovered',
                    'iteration': iteration,
                    'interpretation': evaluation.interpretation,
                    'confidence': evaluation.confidence,
                    'answered': evaluation.answered
                })
            
            # Check if we should stop
            if evaluation.should_stop:
                print(f"\n✅ Evaluator says: STOP (confidence: {evaluation.confidence:.0%})")
                break
        
        # ── STEP 4: SYNTHESIZE - Build the final answer ──
        print(f"\n{'='*60}")
        print(f"📝 SYNTHESIZE: Building final answer from {len(findings.findings)} findings...")
        print(f"{'='*60}")
        
        # Guard: If ALL findings failed, return honest error instead of hallucinated synthesis
        successful_findings = findings.get_successful_findings()
        if findings.findings and not successful_findings:
            failed_tools = ", ".join(findings.failed_tools.keys()) if findings.failed_tools else "unknown"
            summary_text = (
                "## Analysis Could Not Be Completed\n\n"
                f"All {len(findings.findings)} investigation steps failed. "
                f"**Failed tools**: {failed_tools}\n\n"
                "**Possible causes:**\n"
                "- The dataset file may be corrupted or in an unsupported format\n"
                "- Column names in the query may not match the actual dataset\n"
                "- Required dependencies may be missing\n\n"
                "**Recommended next steps:**\n"
                "1. Re-upload the dataset and try again\n"
                "2. Check that column names are correct\n"
                "3. Try a simpler query first (e.g., 'profile this dataset')"
            )
            print(f"⚠️  All tools failed — returning honest error instead of synthesis")
        else:
            # Collect artifacts from workflow history
            artifacts = self._collect_artifacts(workflow_history)
        
            # Generate synthesis
            if mode == "exploratory":
                summary_text = synthesizer.synthesize_exploratory(
                    findings=findings,
                    artifacts=artifacts
                )
            else:
                summary_text = synthesizer.synthesize(
                    findings=findings,
                    artifacts=artifacts
                )
        
        # Also generate enhanced summary for plots/metrics extraction
        try:
            enhanced = self._generate_enhanced_summary(
                workflow_history, summary_text, question
            )
            plots_data = enhanced.get("plots", [])
            metrics_data = enhanced.get("metrics", {})
            artifacts_data = enhanced.get("artifacts", {})
        except Exception as e:
            print(f"⚠️  Enhanced summary generation failed: {e}")
            plots_data = []
            metrics_data = {}
            artifacts_data = {}
        
        # Save to session
        if self.session:
            self.session.add_conversation(question, summary_text)
            self.session_store.save(self.session)
        
        result = {
            "status": "success",
            "summary": summary_text,
            "metrics": metrics_data,
            "artifacts": artifacts_data,
            "plots": plots_data,
            "workflow_history": workflow_history,
            "findings": findings.to_dict(),
            "reasoning_trace": self.reasoning_trace.get_trace(),
            "reasoning_summary": self.reasoning_trace.get_trace_summary(),
            "execution_mode": mode,
            "iterations": findings.iteration_count,
            "api_calls": self.api_calls_made,
            "execution_time": round(time.time() - start_time, 2)
        }
        
        print(f"\n✅ Reasoning loop completed in {result['execution_time']}s")
        print(f"   Iterations: {findings.iteration_count}")
        print(f"   Tools used: {', '.join(findings.tools_used)}")
        print(f"   API calls: {self.api_calls_made}")
        
        return result

    def _collect_artifacts(self, workflow_history: List[Dict]) -> Dict[str, Any]:
        """Collect plots, files, and other artifacts from workflow history."""
        plots = []
        files = []
        
        for step in workflow_history:
            result = step.get("result", {})
            if not isinstance(result, dict):
                continue
            
            result_data = result.get("result", result)
            if isinstance(result_data, dict):
                # Collect output files
                for key in ["output_file", "output_path", "report_path"]:
                    if key in result_data and result_data[key]:
                        files.append(result_data[key])
                
                # Collect plots
                if "plots" in result_data:
                    for plot in result_data["plots"]:
                        if isinstance(plot, dict):
                            plots.append(plot)
                        elif isinstance(plot, str):
                            plots.append({"path": plot, "title": step.get("tool", "Plot")})
                
                # Check for HTML files (interactive plots)
                for key in ["html_path", "dashboard_path"]:
                    if key in result_data and result_data[key]:
                        plots.append({
                            "path": result_data[key],
                            "title": step.get("tool", "Interactive Plot"),
                            "type": "html"
                        })
        
        return {"plots": plots, "files": files}

    def analyze(self, file_path: str, task_description: str, 
               target_col: Optional[str] = None, 
               use_cache: bool = True,
               stream: bool = True,
               max_iterations: int = 20) -> Dict[str, Any]:
        """
        Main entry point for data science analysis.
        
        Args:
            file_path: Path to dataset file
            task_description: Natural language description of the task
            target_col: Optional target column name
            use_cache: Whether to use cached results
            stream: Whether to stream LLM responses
            max_iterations: Maximum number of tool execution iterations
            
        Returns:
            Analysis results including summary and tool outputs
        """
        # 🛡️ SAFETY: Ensure max_iterations is never None (prevent NoneType comparison errors)
        if max_iterations is None:
            max_iterations = 20
            print(f"⚠️  max_iterations was None, defaulting to 20")
        
        start_time = time.time()
        
        # 🧹 CLEAR OLD CHECKPOINTS: Start fresh for each new workflow
        # This prevents stale checkpoint resumption when user starts a new query
        session_id = self.http_session_key or "default"
        if self.recovery_manager.checkpoint_manager.can_resume(session_id):
            print(f"🗑️  Clearing old checkpoint to start fresh workflow")
            self.recovery_manager.checkpoint_manager.clear_checkpoint(session_id)
        
        # 🧠 RESOLVE AMBIGUITY USING SESSION MEMORY (BEFORE SCHEMA EXTRACTION)
        # This ensures follow-up requests can find the file before we try to extract schema
        original_file_path = file_path
        original_target_col = target_col
        
        if self.session:
            # Check if request has ambiguous references
            resolved_params = self.session.resolve_ambiguity(task_description)
            print(f"[DEBUG] Orchestrator received resolved_params: {resolved_params}")
            print(f"[DEBUG] Current file_path: '{file_path}', target_col: '{target_col}'")
            
            # 🔥 FIX: Only use resolved file_path if user did NOT provide a new file
            # If file_path is already set (user uploaded a new file), DON'T override it
            if not file_path or file_path == "":
                if resolved_params.get("file_path"):
                    file_path = resolved_params["file_path"]
                    print(f"📝 Using dataset from session: {file_path}")
                else:
                    print(f"[DEBUG] No file_path in resolved_params")
            else:
                print(f"📝 User provided new file: {file_path} (ignoring session file: {resolved_params.get('file_path', 'none')})")
            
            if not target_col:
                if resolved_params.get("target_col"):
                    target_col = resolved_params["target_col"]
                    print(f"📝 Using target column from session: {target_col}")

            
            # Show session context if available (but show CURRENT file, not old one)
            if self.session.last_dataset or self.session.last_model:
                # 🔥 FIX: Update session's last_dataset to current file BEFORE showing context
                # This prevents stale session context from misleading the LLM
                if file_path and file_path != self.session.last_dataset:
                    print(f"📝 Updating session dataset: {self.session.last_dataset} → {file_path}")
                    self.session.last_dataset = file_path
                context_summary = self.session.get_context_summary()
                print(f"\n{context_summary}\n")
        
        # 🚀 LOCAL SCHEMA EXTRACTION (NO LLM) - Extract metadata before any LLM calls
        # Now that file_path is resolved from session if needed
        
        # 🛡️ VALIDATION: Ensure we have a valid file path
        if not file_path or file_path == "":
            error_msg = "No dataset file provided. Please upload a CSV, Excel, or Parquet file."
            print(f"❌ {error_msg}")
            return {
                "status": "error",
                "error": error_msg,
                "summary": "Cannot proceed without a dataset file.",
                "workflow_history": [],
                "execution_time": 0.0
            }
        
        print("🔍 Extracting dataset schema locally (no LLM)...")
        schema_info = extract_schema_local(file_path, sample_rows=3)
        
        if 'error' not in schema_info:
            # Guard: Reject empty datasets immediately instead of wasting reasoning iterations
            if schema_info.get('num_rows', 0) == 0:
                return {
                    "status": "error",
                    "error": "Dataset is empty (0 rows)",
                    "summary": "The uploaded dataset contains no data rows. Please upload a dataset with at least one row of data.",
                    "workflow_history": [],
                    "execution_time": time.time() - start_time
                }
            
            # 🧠 SEMANTIC LAYER: Enrich dataset info with column embeddings
            if self.semantic_layer.enabled:
                try:
                    schema_info = self.semantic_layer.enrich_dataset_info(schema_info, file_path, sample_size=100)
                    print(f"🧠 Semantic layer enriched {len(schema_info.get('column_embeddings', {}))} columns")
                except Exception as e:
                    print(f"⚠️ Semantic enrichment failed: {e}")
            
            # Update workflow state with schema
            self.workflow_state.update_dataset_info(schema_info)
            print(f"✅ Schema extracted: {schema_info['num_rows']} rows × {schema_info['num_columns']} cols")
            print(f"   File size: {schema_info['file_size_mb']} MB")
            
            # 🧠 SEMANTIC LAYER: Infer target column if not provided
            if not target_col and self.semantic_layer.enabled:
                try:
                    inferred = self.semantic_layer.infer_target_column(
                        schema_info.get('column_embeddings', {}),
                        task_description
                    )
                    if inferred:
                        target_col, confidence = inferred
                        print(f"💡 Inferred target column: {target_col} (confidence: {confidence:.2f})")
                except Exception as e:
                    print(f"⚠️ Target inference failed: {e}")
            
            # Infer task type if target column provided
            if target_col and target_col in schema_info['columns']:
                inferred_task = infer_task_type(target_col, schema_info)
                if inferred_task:
                    self.workflow_state.task_type = inferred_task
                    self.workflow_state.target_column = target_col
                    print(f"   Task type inferred: {inferred_task}")
        else:
            print(f"⚠️  Schema extraction failed: {schema_info.get('error')}")
        
        # Check cache
        if use_cache:
            cache_key = self._generate_cache_key(file_path, task_description, target_col)
            cached = self.cache.get(cache_key)
            if cached:
                print("✓ Using cached results")
                return cached
        
        # ═══════════════════════════════════════════════════════════════════════
        # 🧠 INTENT CLASSIFICATION → MODE SELECTION
        # Classify the user's request into one of three execution modes:
        #   DIRECT:        "Make a scatter plot"      → existing pipeline
        #   INVESTIGATIVE: "Why are customers churning?" → reasoning loop  
        #   EXPLORATORY:   "Analyze this data"        → hypothesis-driven loop
        # ═══════════════════════════════════════════════════════════════════════
        intent_classifier = IntentClassifier(semantic_layer=self.semantic_layer)
        intent_result = intent_classifier.classify(
            query=task_description,
            dataset_info=schema_info if 'error' not in schema_info else None,
            has_target_col=bool(target_col)
        )
        
        print(f"\n🎯 Intent Classification:")
        print(f"   Mode: {intent_result.mode.upper()}")
        print(f"   Confidence: {intent_result.confidence:.0%}")
        print(f"   Reasoning: {intent_result.reasoning}")
        print(f"   Sub-intent: {intent_result.sub_intent}")
        
        # Emit intent info for UI
        if hasattr(self, 'session') and self.session:
            progress_manager.emit(self.session.session_id, {
                'type': 'intent_classified',
                'mode': intent_result.mode,
                'confidence': intent_result.confidence,
                'reasoning': intent_result.reasoning,
                'sub_intent': intent_result.sub_intent
            })
        
        # 📝 Record intent classification in reasoning trace
        self.reasoning_trace.trace_history.append({
            "type": "intent_classification",
            "query": task_description,
            "mode": intent_result.mode,
            "confidence": intent_result.confidence,
            "reasoning": intent_result.reasoning,
            "sub_intent": intent_result.sub_intent
        })
        
        # ═══════════════════════════════════════════════════════════════════════
        # 🧠 REASONING LOOP PATH (Investigative / Exploratory modes)
        # ═══════════════════════════════════════════════════════════════════════
        if intent_result.mode in ("investigative", "exploratory"):
            print(f"\n🧠 Routing to REASONING LOOP ({intent_result.mode} mode)")
            
            # Determine iteration count based on mode and reasoning effort
            if intent_result.mode == "exploratory":
                loop_max = min(max_iterations, 8)  # Exploratory gets more iterations
            else:
                loop_max = min(max_iterations, 6)  # Investigative is more focused
            
            reasoning_result = self._run_reasoning_loop(
                question=task_description,
                file_path=file_path,
                dataset_info=schema_info if 'error' not in schema_info else {},
                target_col=target_col,
                mode=intent_result.mode,
                max_iterations=loop_max
            )
            
            # Cache the result
            if use_cache and reasoning_result.get("status") == "success":
                self.cache.set(cache_key, reasoning_result, metadata={
                    "file_path": file_path,
                    "task": task_description,
                    "mode": intent_result.mode
                })
            
            return reasoning_result
        
        # ═══════════════════════════════════════════════════════════════════════
        # 📋 DIRECT MODE PATH (existing pipeline - below is unchanged)
        # ═══════════════════════════════════════════════════════════════════════
        print(f"\n📋 Routing to DIRECT pipeline mode")
        
        # Build initial messages
        # Use dynamic prompts for small context models
        if self.use_compact_prompts:
            from .dynamic_prompts import build_compact_system_prompt
            system_prompt = build_compact_system_prompt(user_query=task_description)
            print("🔧 Using compact prompt for small context window")
        else:
            # 🤖 MULTI-AGENT ARCHITECTURE: Route to specialist agent
            selected_agent = self._select_specialist_agent(task_description)
            self.active_agent = selected_agent
            current_agent = selected_agent  # Track for dynamic tool loading
            
            # 📝 Record agent selection in reasoning trace
            if self.semantic_layer.enabled:
                # Get confidence from semantic routing
                agent_descriptions = {name: config["description"] for name, config in self.specialist_agents.items()}
                _, confidence = self.semantic_layer.route_to_agent(task_description, agent_descriptions)
                self.reasoning_trace.record_agent_selection(
                    task=task_description,
                    selected_agent=selected_agent,
                    confidence=confidence,
                    alternatives=agent_descriptions
                )
            
            agent_config = self.specialist_agents[selected_agent]
            print(f"\n{agent_config['emoji']} Delegating to: {agent_config['name']}")
            print(f"   Specialization: {agent_config['description']}")
            
            # 🎯 DYNAMIC TOOL LOADING: Load only tools relevant to this agent
            tools_to_use = self._compress_tools_registry(agent_name=selected_agent)
            print(f"   📦 Loaded {len(tools_to_use)} agent-specific tools")
            
            # Use specialist's system prompt
            system_prompt = agent_config["system_prompt"]
            
            # Emit agent info for UI display
            if self.progress_callback:
                self.progress_callback({
                    "type": "agent_assigned",
                    "agent": agent_config['name'],
                    "emoji": agent_config['emoji'],
                    "description": agent_config['description'],
                    "tools_count": len(tools_to_use)
                })
        
        
        # 🎯 PROACTIVE INTENT DETECTION - Tell LLM which tools to use BEFORE it tries wrong ones
        task_lower = task_description.lower()
        
        # Detect user intent
        wants_viz = any(kw in task_lower for kw in ["plot", "graph", "visualiz", "dashboard", "chart", "show", "display", "create", "generate"])
        wants_clean = any(kw in task_lower for kw in ["clean", "missing", "impute"])
        wants_features = any(kw in task_lower for kw in ["feature", "engineer", "time-based", "extract features"])
        wants_train = any(kw in task_lower for kw in ["train", "model", "predict", "best model", "classify", "regression", "forecast", "build model"])
        
        # 🔍 CRITICAL: Detect exploratory/relationship questions (should NOT trigger ML training)
        wants_relationship = any(kw in task_lower for kw in [
            "how does", "how do", "relationship", "relate", "correlation", "correlate",
            "affect", "effect", "impact", "influence", "change with", "vary with",
            "compare", "difference between", "distribution", "pattern"
        ])
        
        # 🎯 AUTO-ENABLE TRAINING: Only if explicitly asking for predictions AND not asking about relationships
        # Don't auto-enable for exploratory questions even if target exists
        if target_col and not wants_viz and not wants_clean and not wants_relationship and self.workflow_state.task_type in ["regression", "classification"]:
            # Additional check: only auto-enable if question implies prediction
            if wants_train or any(kw in task_lower for kw in ["predict", "forecast", "estimate"]):
                print(f"   🎯 Auto-enabling ML training (detected {self.workflow_state.task_type} task with target='{target_col}')")
                wants_train = True
        elif wants_relationship:
            # Override: Relationship questions should NOT train models
            print(f"   🔍 Exploratory analysis detected - disabling auto-ML (question asks about relationships, not predictions)")
            wants_train = False
        
        # 📊 DETECT SPECIFIC PLOT TYPE - Match user's exact visualization request
        plot_type_guidance = ""
        if wants_viz:
            if "histogram" in task_lower or "distribution" in task_lower or "freq" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Histogram\n✅ Use: generate_interactive_histogram\n❌ Do NOT use: generate_interactive_scatter (that's for scatter plots!)"
            elif "scatter" in task_lower or "relationship" in task_lower or "correlation" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Scatter Plot\n✅ Use: generate_interactive_scatter\n❌ Do NOT use: generate_interactive_histogram (that's for distributions!)"
            elif "box plot" in task_lower or "boxplot" in task_lower or "outlier" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Box Plot\n✅ Use: generate_interactive_box_plots"
            elif "time series" in task_lower or "trend" in task_lower or "over time" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Time Series\n✅ Use: generate_interactive_time_series"
            elif "heatmap" in task_lower or "correlation" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Heatmap\n✅ Use: generate_interactive_correlation_heatmap"
            elif "dashboard" in task_lower or "all plot" in task_lower:
                plot_type_guidance = "\n\n📊 **PLOT TYPE DETECTED**: Dashboard/Multiple Plots\n✅ Use: generate_plotly_dashboard OR generate_all_plots"
            else:
                # Generic visualization - let LLM decide based on data
                plot_type_guidance = "\n\n📊 **PLOT TYPE**: Generic visualization\n✅ Choose appropriate tool based on:\n- Histogram: Single numeric variable distribution\n- Scatter: Relationship between 2 numeric variables\n- Box Plot: Compare distributions across categories\n- Time Series: Data with datetime column"
        
        # Build specific guidance based on intent
        workflow_guidance = ""
        
        if wants_train:
            # Full ML pipeline - ALWAYS run complete workflow for model training
            target_info = f"\n🎯 **TARGET COLUMN**: '{target_col}' (Task: {self.workflow_state.task_type or 'auto'})\n" if target_col else "\n⚠️ **TARGET COLUMN**: Not specified - analyze correlations to find best candidate\n"
            workflow_guidance = (
                "\n\n🎯 **WORKFLOW**: Full ML Pipeline (Training Requested)"
                f"{target_info}"
                "Execute ALL steps for best model performance:\n"
                "1. Profile dataset (understand data)\n"
                "2. Clean missing values (data quality)\n"
                "3. Handle outliers (prevent bias)\n"
                "4. Create features (time features, interactions)\n"
                "5. Encode categorical (prepare for ML)\n"
                "6. Train models (baseline + optimization)\n"
                "7. Generate visualizations (feature importance, residuals, performance)\n"
                "8. Create reports (comprehensive analysis)\n\n"
                "⚠️ ALL tools allowed - cleaning, feature engineering, visualization, and training!"
            )
        elif wants_clean and wants_viz and not wants_train:
            # Multi-intent: Clean + Visualize
            workflow_guidance = (
                "\n\n🎯 **WORKFLOW**: Multi-Intent (Clean + Visualize)\n"
                "Steps:\n"
                "1. clean_missing_values\n"
                "2. handle_outliers\n"
                "3. generate_interactive_scatter OR generate_plotly_dashboard\n"
                "4. STOP (no training!)"
            )
        elif wants_viz and not wants_train and not wants_clean:
            # Visualization only
            workflow_guidance = (
                f"\n\n🎯 **WORKFLOW**: Visualization ONLY{plot_type_guidance}\n"
                "⚠️ DO NOT run profiling or cleaning tools!\n"
                "✅ YOUR FIRST CALL: Use the EXACT plot type mentioned above\n"
                "✅ Then STOP immediately (no training, no cleaning needed!)"
            )
        elif wants_features and not wants_train:
            # Feature engineering only
            workflow_guidance = (
                "\n\n🎯 **WORKFLOW**: Feature Engineering ONLY\n"
                "Steps:\n"
                "1. (Optional) profile_dataset if you need column names\n"
                "2. create_time_features OR encode_categorical OR create_interaction_features\n"
                "3. STOP (no training!)"
            )
        elif wants_clean and not wants_train and not wants_viz:
            # Cleaning only
            workflow_guidance = (
                "\n\n🎯 **WORKFLOW**: Data Cleaning ONLY\n"
                "Steps:\n"
                "1. (Optional) profile_dataset to see issues\n"
                "2. clean_missing_values\n"
                "3. handle_outliers\n"
                "4. STOP (no training, no feature engineering!)"
            )
        else:
            # Default full workflow
            workflow_guidance = "\n\n🎯 **WORKFLOW**: Complete Analysis\nExecute: profile → clean → encode → train → report"
        
        # Build user message with workflow state context (minimal, not full history)
        state_context = ""
        if self.workflow_state.dataset_info:
            # Include schema summary instead of raw data
            info = self.workflow_state.dataset_info
            # Create explicit column list for validation
            all_columns = ', '.join([f"'{col}'" for col in list(info['columns'].keys())[:15]])
            if len(info['columns']) > 15:
                all_columns += f"... ({len(info['columns'])} total)"
            
            state_context = f"""
**Dataset Schema** (extracted locally):
- Rows: {info['num_rows']:,} | Columns: {info['num_columns']}
- Size: {info['file_size_mb']} MB
- Numeric columns ({len(info['numeric_columns'])}): {', '.join([f"'{c}'" for c in info['numeric_columns'][:10]])}{'...' if len(info['numeric_columns']) > 10 else ''}
- Categorical columns ({len(info['categorical_columns'])}): {', '.join([f"'{c}'" for c in info['categorical_columns'][:10]])}{'...' if len(info['categorical_columns']) > 10 else ''}

**IMPORTANT - Exact Column Names:**
{all_columns}

⚠️ When calling modeling tools, use EXACT column names from above.
⚠️ DO NOT hallucinate column names like "Target", "Label", "Occupation" unless they appear above.
⚠️ If unsure about target column, use profile_dataset first to inspect data.
"""
        
        user_message = f"""Please analyze the dataset and complete the following task:

**Dataset**: {file_path}
**Task**: {task_description}
**Target Column**: {target_col if target_col else 'Not specified - please infer from data'}{state_context}{workflow_guidance}"""

        #🧠 Store file path in session memory for follow-up requests
        if self.session and file_path:
            self.session.update(last_dataset=file_path)
            if target_col:
                self.session.update(last_target_col=target_col)
            print(f"💾 Saved to session: dataset={file_path}, target={target_col}")
        
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        
        # Track workflow
        workflow_history = []
        iteration = 0
        tool_call_counter = {}  # Track how many times each tool has been called
        
        # current_agent and tools_to_use are set above in agent selection
        # If compact prompts used, prepare general tools here
        if self.use_compact_prompts:
            current_agent = None
            tools_to_use = self._compress_tools_registry(agent_name="general_agent")
        
        # For Gemini, use the existing model without tools (text-only mode)
        # Gemini tool schema is incompatible with OpenAI/Groq format
        # Tool execution is handled by our orchestrator, not by Gemini itself
        gemini_chat = None
        if self.provider == "gemini":
            gemini_chat = self.gemini_model.start_chat(history=[])
        
        while iteration < max_iterations:
            iteration += 1
            
            try:
                # 🚀 SMART CONVERSATION PRUNING (Mistral-compatible)
                # Keep only: system + user + last 12 exchanges (24 messages) - INCREASED FOR BETTER CONTEXT
                # CRITICAL: Maintain valid message ordering for Mistral API
                
                # Helper function to get role from message (handles dict or ChatMessage object)
                def get_role(msg):
                    if isinstance(msg, dict):
                        return msg.get('role', '')
                    return getattr(msg, 'role', '')
                
                # Helper to check if message has tool_calls
                def has_tool_calls(msg):
                    if isinstance(msg, dict):
                        return bool(msg.get('tool_calls'))
                    return bool(getattr(msg, 'tool_calls', None))
                
                if len(messages) > 26:
                    # Keep: system prompt [0], user query [1], last valid exchanges
                    system_msg = messages[0]
                    user_msg = messages[1]
                    recent_msgs = messages[-8:]
                    
                    # CRITICAL: Keep complete tool call/response groups together
                    # Mistral requires: assistant (with tool_calls) → tool responses → assistant → user
                    cleaned_recent = []
                    i = 0
                    while i < len(recent_msgs):
                        msg = recent_msgs[i]
                        role = get_role(msg)
                        
                        if role == 'assistant' and has_tool_calls(msg):
                            # This assistant has tool calls - must keep it AND all following tool responses
                            cleaned_recent.append(msg)
                            i += 1
                            # Collect all consecutive tool responses
                            while i < len(recent_msgs) and get_role(recent_msgs[i]) == 'tool':
                                cleaned_recent.append(recent_msgs[i])
                                i += 1
                        elif role == 'tool':
                            # Orphaned tool message (no preceding assistant with tool_calls) - skip it
                            i += 1
                        else:
                            # Regular message (assistant without tool_calls, user, system)
                            cleaned_recent.append(msg)
                            i += 1
                    
                    # 🔥 CRITICAL FIX: Remove orphaned tool messages at the start of cleaned_recent
                    # Mistral NEVER allows 'tool' role immediately after 'user' role
                    while cleaned_recent and get_role(cleaned_recent[0]) == 'tool':
                        print(f"⚠️  Removed orphaned tool message at start of pruned history")
                        cleaned_recent.pop(0)
                    
                    messages = [system_msg, user_msg] + cleaned_recent
                    print(f"✂️  Pruned conversation (keeping last 12 exchanges for better context preservation)")
                    
                    # 🎯 INJECT CONTEXT REMINDER after pruning (prevent LLM from forgetting)
                    context_parts = []
                    if target_col and self.workflow_state.task_type:
                        context_parts.append(f"📌 Target column: '{target_col}' (Task: {self.workflow_state.task_type})")
                    
                    # Inject profiling/quality context that would have been pruned
                    if self.workflow_state.profiling_summary:
                        ps = self.workflow_state.profiling_summary
                        context_parts.append(f"📊 Dataset: {ps.get('num_rows', '?')} rows × {ps.get('num_columns', '?')} cols")
                        if ps.get('column_ranges'):
                            ranges = ps['column_ranges']
                            range_lines = [f"  {col}: min={v.get('min')}, max={v.get('max')}, mean={v.get('mean')}" 
                                          for col, v in list(ranges.items())[:8]]
                            context_parts.append("Column ranges:\n" + "\n".join(range_lines))
                    
                    if self.workflow_state.quality_issues:
                        qi = self.workflow_state.quality_issues
                        if qi.get('total_issues', 0) > 0:
                            context_parts.append(f"⚠️ Quality: {qi.get('total_issues', 0)} issues found")
                    
                    if context_parts:
                        reminder = {
                            "role": "user",
                            "content": "REMINDER (original profiling context — preserved after pruning):\n" + "\n".join(context_parts)
                        }
                        messages.insert(2, reminder)  # Insert after system + user query
                
                # 🔍 Token estimation and warning
                estimated_tokens = sum(
                    len(str(m.get('content', '') if isinstance(m, dict) else getattr(m, 'content', ''))) // 4 
                    for m in messages
                )
                if estimated_tokens > 15000:
                    # Emergency pruning - keep only last 8 exchanges
                    system_msg = messages[0]
                    user_msg = messages[1]
                    recent_msgs = messages[-16:]
                    
                    # CRITICAL: Keep complete tool call/response groups together
                    cleaned_recent = []
                    i = 0
                    while i < len(recent_msgs):
                        msg = recent_msgs[i]
                        role = get_role(msg)
                        
                        if role == 'assistant' and has_tool_calls(msg):
                            # Keep assistant with tool calls AND all its tool responses
                            cleaned_recent.append(msg)
                            i += 1
                            while i < len(recent_msgs) and get_role(recent_msgs[i]) == 'tool':
                                cleaned_recent.append(recent_msgs[i])
                                i += 1
                        elif role == 'tool':
                            # Skip orphaned tool message
                            i += 1
                        else:
                            cleaned_recent.append(msg)
                            i += 1
                    
                    # 🔥 CRITICAL FIX: Remove orphaned tool messages at the start of cleaned_recent
                    # Mistral NEVER allows 'tool' role immediately after 'user' role
                    while cleaned_recent and get_role(cleaned_recent[0]) == 'tool':
                        print(f"⚠️  Removed orphaned tool message at start of emergency pruned history")
                        cleaned_recent.pop(0)
                    
                    messages = [system_msg, user_msg] + cleaned_recent
                    print(f"⚠️  Emergency pruning (conversation > 15K tokens, keeping last 8 exchanges)")
                
                # 💰 Token budget management (TPM limit)
                if self.provider in ["mistral", "groq"]:
                    # Reset minute counter if needed
                    elapsed = time.time() - self.minute_start_time
                    if elapsed > 60:
                        print(f"🔄 Token budget reset (was {self.tokens_this_minute}/{self.tpm_limit})")
                        self.tokens_this_minute = 0
                        self.minute_start_time = time.time()
                    
                    # Check if we're close to TPM limit (use 70% threshold to be safe)
                    if self.tokens_this_minute + estimated_tokens > self.tpm_limit * 0.7:
                        wait_time = 60 - elapsed
                        if wait_time > 0:
                            print(f"⏸️  Token budget: {self.tokens_this_minute}/{self.tpm_limit} used ({(self.tokens_this_minute/self.tpm_limit)*100:.0f}%)")
                            print(f"   Next request would use ~{estimated_tokens} tokens → exceeds safe limit")
                            print(f"   Waiting {wait_time:.0f}s for budget reset...")
                            time.sleep(wait_time)
                            self.tokens_this_minute = 0
                            self.minute_start_time = time.time()
                            print(f"✅ Token budget reset complete")
                    else:
                        print(f"💰 Token budget: {self.tokens_this_minute}/{self.tpm_limit} ({(self.tokens_this_minute/self.tpm_limit)*100:.0f}%)")

                
                # Rate limiting - wait if needed
                if self.min_api_call_interval > 0:
                    time_since_last_call = time.time() - self.last_api_call_time
                    if time_since_last_call < self.min_api_call_interval:
                        wait_time = self.min_api_call_interval - time_since_last_call
                        print(f"⏳ Rate limiting: waiting {wait_time:.1f}s...")
                        time.sleep(wait_time)
                
                # Initialize variables before try block to avoid UnboundLocalError
                tool_calls = None
                final_content = None
                response_message = None
                
                # 💰 TOKEN BUDGET: Enforce context window limits before LLM call
                messages, token_count = self.token_manager.enforce_budget(
                    messages=messages,
                    system_prompt=system_prompt
                )
                print(f"💰 Token budget: {token_count}/{self.token_manager.max_tokens} ({(token_count/self.token_manager.max_tokens*100):.1f}%)")
                
                # 🔥 CRITICAL: Validate message order for Mistral API compliance
                # Mistral requires: system → user → assistant → tool (only after assistant with tool_calls) → assistant → user...
                # NEVER: user → tool (this causes "Unexpected role 'tool' after role 'user'" error)
                if self.provider in ["mistral", "groq"]:
                    validated_messages = []
                    for i, msg in enumerate(messages):
                        role = get_role(msg)
                        
                        # Check if this is a tool message after a user message
                        if role == 'tool' and validated_messages:
                            prev_role = get_role(validated_messages[-1])
                            if prev_role == 'user':
                                # Invalid! Skip this tool message
                                print(f"⚠️  WARNING: Skipped orphaned tool message at position {i} (after user message)")
                                continue
                        
                        validated_messages.append(msg)
                    
                    messages = validated_messages
                    print(f"✅ Message order validation complete: {len(messages)} messages")
                
                # Call LLM with function calling (provider-specific)
                if self.provider == "mistral":
                    try:
                        # Support both new SDK (v1.x) and old SDK (v0.x)
                        if hasattr(self.mistral_client, 'chat') and hasattr(self.mistral_client.chat, 'complete'):
                            # New SDK (v1.x)
                            response = self.mistral_client.chat.complete(
                                model=self.model,
                                messages=messages,
                                tools=tools_to_use,
                                tool_choice="auto",
                                temperature=0.1,
                                max_tokens=4096
                            )
                        else:
                            # Old SDK (v0.x)
                            response = self.mistral_client.chat(
                                model=self.model,
                                messages=messages,
                                tools=tools_to_use,
                                tool_choice="auto",
                                temperature=0.1,
                                max_tokens=4096
                            )
                        
                        self.api_calls_made += 1
                        self.last_api_call_time = time.time()
                        
                        # Track tokens used (for TPM budget management)
                        if hasattr(response, 'usage') and response.usage:
                            tokens_used = response.usage.total_tokens
                            self.tokens_this_minute += tokens_used
                            print(f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute")
                            
                            # Emit token update for SSE streaming using session UUID
                            if hasattr(self, 'session') and self.session:
                                progress_manager.emit(self.session.session_id, {
                                    'type': 'token_update',
                                    'message': f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute",
                                    'tokens_used': tokens_used,
                                    'tokens_this_minute': self.tokens_this_minute,
                                    'tpm_limit': self.tpm_limit
                                })
                        
                        response_message = response.choices[0].message
                        tool_calls = response_message.tool_calls
                        final_content = self._extract_content_text(response_message.content)
                        
                    except Exception as mistral_error:
                        error_str = str(mistral_error)
                        print(f"❌ MISTRAL ERROR: {error_str[:300]}")
                        raise
                
                elif self.provider == "groq":
                    try:
                        response = self.groq_client.chat.completions.create(
                            model=self.model,
                            messages=messages,
                            tools=tools_to_use,
                            tool_choice="auto",
                            parallel_tool_calls=False,  # Disable parallel calls to prevent XML format errors
                            temperature=0.1,  # Low temperature for consistent outputs
                            max_tokens=4096
                        )
                        
                        self.api_calls_made += 1
                        self.last_api_call_time = time.time()
                        
                        # Track tokens used (for TPM budget management)
                        if hasattr(response, 'usage') and response.usage:
                            tokens_used = response.usage.total_tokens
                            self.tokens_this_minute += tokens_used
                            print(f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute")
                            
                            # Emit token update for SSE streaming using session UUID
                            if hasattr(self, 'session') and self.session:
                                progress_manager.emit(self.session.session_id, {
                                    'type': 'token_update',
                                    'message': f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute",
                                    'tokens_used': tokens_used,
                                    'tokens_this_minute': self.tokens_this_minute,
                                    'tpm_limit': self.tpm_limit
                                })
                        
                        response_message = response.choices[0].message
                        tool_calls = response_message.tool_calls
                        final_content = self._extract_content_text(response_message.content)
                        
                    except Exception as groq_error:
                        # Check if it's a rate limit error (429)
                        error_str = str(groq_error)
                        if "rate_limit" in error_str.lower() or "429" in error_str:
                            # Parse retry delay from error message if available
                            retry_delay = 60  # Default to 60s for TPM limit
                            
                            # Try to extract retry delay from error
                            import re
                            delay_match = re.search(r'retry.*?(\d+).*?second', error_str, re.IGNORECASE)
                            if delay_match:
                                retry_delay = int(delay_match.group(1))
                            elif "tokens per minute" in error_str or "TPM" in error_str:
                                retry_delay = 60
                            elif "tokens per day" in error_str or "TPD" in error_str:
                                # Daily limit - give up immediately
                                print(f"❌ GROQ DAILY TOKEN LIMIT EXHAUSTED (100K tokens/day)")
                                print(f"   Your daily quota resets at UTC midnight")
                                print(f"   Error: {error_str[:400]}")
                                raise ValueError(f"Groq daily quota exhausted. Please wait for reset.\n{error_str[:500]}")
                            
                            # TPM limit - wait and retry
                            print(f"⚠️  GROQ TPM RATE LIMIT (rolling 60s window)")
                            print(f"   Groq uses account-wide rolling window - previous requests still count")
                            print(f"   Waiting {retry_delay}s and retrying...")
                            print(f"   Error: {error_str[:300]}")
                            
                            time.sleep(retry_delay)
                            
                            # Retry the request
                            print(f"🔄 Retrying after {retry_delay}s delay...")
                            response = self.groq_client.chat.completions.create(
                                model=self.model,
                                messages=messages,
                                tools=tools_to_use,
                                tool_choice="auto",
                                parallel_tool_calls=False,
                                temperature=0.1,
                                max_tokens=4096
                            )
                            
                            self.api_calls_made += 1
                            self.last_api_call_time = time.time()
                            
                            # Track tokens used
                            if hasattr(response, 'usage') and response.usage:
                                tokens_used = response.usage.total_tokens
                                self.tokens_this_minute += tokens_used
                                print(f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute")
                                
                                # Emit token update for SSE streaming using session UUID
                                if hasattr(self, 'session') and self.session:
                                    progress_manager.emit(self.session.session_id, {
                                        'type': 'token_update',
                                        'message': f"📊 Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute",
                                        'tokens_used': tokens_used,
                                        'tokens_this_minute': self.tokens_this_minute,
                                        'tpm_limit': self.tpm_limit
                                    })
                            
                            response_message = response.choices[0].message
                            tool_calls = response_message.tool_calls
                            final_content = self._extract_content_text(response_message.content)
                        else:
                            # Not a rate limit error, re-raise
                            raise
                
                # Check if done (no tool calls)
                if not tool_calls:
                    # Final response
                    final_summary = final_content or "Analysis completed"
                    
                    # 🎯 ENHANCED SUMMARY: Extract metrics and artifacts from workflow (with error handling)
                    try:
                        enhanced_summary = self._generate_enhanced_summary(
                            workflow_history, 
                            final_summary, 
                            task_description
                        )
                        summary_text = enhanced_summary["text"]
                        
                        # 🧹 POST-PROCESS: Light cleanup only
                        import re
                        
                        # Clean excessive whitespace only
                        summary_text = re.sub(r'\n\n\n+', '\n\n', summary_text)
                        summary_text = summary_text.strip()
                        
                        metrics_data = enhanced_summary.get("metrics", {})
                        artifacts_data = enhanced_summary.get("artifacts", {})
                        artifacts_data = enhanced_summary.get("artifacts", {})
                        plots_data = enhanced_summary.get("plots", [])
                        print(f"✅ Enhanced summary generated with {len(plots_data)} plots, {len(metrics_data)} metrics")
                        
                        # DEBUG: Log plots array details
                        if plots_data:
                            print(f"[DEBUG] Plots array contains {len(plots_data)} items:")
                            for idx, plot in enumerate(plots_data):
                                print(f"[DEBUG]   Plot {idx+1}: title='{plot.get('title')}', url='{plot.get('url')}', type='{plot.get('type')}'")
                    except Exception as e:
                        print(f"⚠️  Enhanced summary generation failed: {e}")
                        import traceback
                        traceback.print_exc()
                        # Fallback: use basic summary
                        summary_text = final_summary
                        metrics_data = {}
                        artifacts_data = {}
                        plots_data = []
                    
                    # 🧠 Save conversation to session memory
                    if self.session:
                        self.session.add_conversation(task_description, summary_text)
                        self.session_store.save(self.session)
                        print(f"\n✅ Session saved: {self.session.session_id}")
                    
                    result = {
                        "status": "success",
                        "summary": summary_text,
                        "metrics": metrics_data,
                        "artifacts": artifacts_data,
                        "plots": plots_data,
                        "workflow_history": workflow_history,
                        "reasoning_trace": self.reasoning_trace.get_trace(),
                        "reasoning_summary": self.reasoning_trace.get_trace_summary(),
                        "iterations": iteration,
                        "api_calls": self.api_calls_made,
                        "execution_time": round(time.time() - start_time, 2)
                    }
                    
                    # Cache result
                    if use_cache:
                        self.cache.set(cache_key, result, metadata={
                            "file_path": file_path,
                            "task": task_description
                        })
                    
                    return result
                
                # Execute tool calls (provider-specific format)
                if self.provider in ["groq", "mistral"]:
                    messages.append(response_message)
                
                # 🚀 PARALLEL EXECUTION: Detect multiple independent tool calls
                # ⚠️ DISABLED FOR STABILITY - Parallel execution causes race conditions and OOM errors
                # Re-enable only after implementing proper request isolation per user
                if len(tool_calls) > 1 and False:  # Disabled with "and False"
                    print(f"🚀 Detected {len(tool_calls)} tool calls - attempting parallel execution")
                    
                    # Extract tool executions with proper weight classification
                    tool_executions = []
                    heavy_tools = []
                    for idx, tc in enumerate(tool_calls):
                        if self.provider in ["groq", "mistral"]:
                            tool_name = tc.function.name
                            tool_args_raw = tc.function.arguments
                            # Sanitize tool name
                            import re
                            tool_name = re.sub(r'[^\x00-\x7F]+', '', str(tool_name))
                            match = re.search(r'([a-z_][a-z0-9_]*)', tool_name, re.IGNORECASE)
                            if match:
                                tool_name = match.group(1)
                            
                            if tool_name in self.tool_functions:
                                tool_args = json.loads(tool_args_raw)
                                weight = TOOL_WEIGHTS.get(tool_name, ToolWeight.MEDIUM)
                                
                                # Track heavy tools
                                if weight == ToolWeight.HEAVY:
                                    heavy_tools.append(tool_name)
                                
                                tool_executions.append(ToolExecution(
                                    tool_name=tool_name,
                                    arguments=tool_args,
                                    weight=weight,
                                    dependencies=set(),
                                    execution_id=f"{tool_name}_{idx}"
                                ))
                        elif self.provider == "gemini":
                            tool_name = tc.name
                            tool_args = {key: value for key, value in tc.args.items()}
                            if tool_name in self.tool_functions:
                                weight = TOOL_WEIGHTS.get(tool_name, ToolWeight.MEDIUM)
                                
                                # Track heavy tools
                                if weight == ToolWeight.HEAVY:
                                    heavy_tools.append(tool_name)
                                
                                tool_executions.append(ToolExecution(
                                    tool_name=tool_name,
                                    arguments=tool_args,
                                    weight=weight,
                                    dependencies=set(),
                                    execution_id=f"{tool_name}_{idx}"
                                ))
                    
                    # ⚠️ CRITICAL: Prevent multiple heavy tools from running in parallel
                    if len(heavy_tools) > 1:
                        print(f"⚠️ Multiple HEAVY tools detected: {heavy_tools}")
                        print(f"   These will run SEQUENTIALLY to prevent resource exhaustion")
                        print(f"   Heavy tools: {', '.join(heavy_tools)}")
                        # Fall through to sequential execution
                    elif len(tool_executions) > 1 and len(heavy_tools) <= 1 and self.parallel_executor is not None:
                        try:
                            results = asyncio.run(self.parallel_executor.execute_all(
                                tool_executions=tool_executions,
                                tool_executor=self._execute_tool_sync,
                                progress_callback=self._async_progress_callback
                            ))
                            
                            print(f"✓ Parallel execution completed: {len(results)} tools")
                            
                            # Add results to messages and workflow history
                            for tool_exec, tool_result in zip(tool_executions, results):
                                tool_name = tool_exec.tool_name
                                tool_args = tool_exec.arguments
                                tool_call_id = tool_exec.execution_id
                                
                                # Save checkpoint
                                if tool_result.get("success", True):
                                    session_id = self.http_session_key or "default"
                                    self.recovery_manager.checkpoint_manager.save_checkpoint(
                                        session_id=session_id,
                                        workflow_state={
                                            'iteration': iteration,
                                            'workflow_history': workflow_history,
                                            'current_file': file_path,
                                            'task_description': task_description,
                                            'target_col': target_col
                                        },
                                        last_tool=tool_name,
                                        iteration=iteration
                                    )
                                
                                # Track in workflow
                                workflow_history.append({
                                    "iteration": iteration,
                                    "tool": tool_name,
                                    "arguments": tool_args,
                                    "result": tool_result
                                })
                                
                                # Update workflow state
                                self._update_workflow_state(tool_name, tool_result)
                                
                                # Add to messages with compression
                                clean_tool_result = self._make_json_serializable(tool_result)
                                compressed_result = self._compress_tool_result(tool_name, clean_tool_result)
                                
                                if self.provider in ["mistral", "groq"]:
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": tool_name,
                                        "content": json.dumps(compressed_result)
                                    })
                                elif self.provider == "gemini":
                                    messages.append({
                                        "role": "tool",
                                        "name": tool_name,
                                        "content": json.dumps(compressed_result)
                                    })
                            
                            # Skip sequential execution
                            continue
                            
                        except Exception as e:
                            print(f"⚠️ Parallel execution failed: {e}")
                            print("   Falling back to sequential execution")
                
                # Sequential execution (fallback or single tool)
                for tool_call in tool_calls:
                    # Extract tool name and args (provider-specific)
                    if self.provider in ["groq", "mistral"]:
                        tool_name = tool_call.function.name
                        tool_args = json.loads(tool_call.function.arguments)
                        tool_call_id = tool_call.id
                        
                        # CRITICAL FIX 1: Sanitize tool_name (remove any non-ASCII or prefix garbage)
                        import re
                        # Remove any non-ASCII characters and leading garbage
                        tool_name_cleaned = re.sub(r'[^\x00-\x7F]+', '', str(tool_name))
                        # Extract just the alphanumeric_underscore pattern
                        match = re.search(r'([a-z_][a-z0-9_]*)', tool_name_cleaned, re.IGNORECASE)
                        if match:
                            tool_name = match.group(1)
                        
                        # CRITICAL FIX 2: Validate tool exists before execution
                        if tool_name not in self.tool_functions:
                            print(f"⚠️  INVALID TOOL NAME: '{tool_name}' (original: {tool_call.function.name})")
                            print(f"   Available tools: {', '.join(list(self.tool_functions.keys())[:10])}...")
                            
                            # Explicit mappings for common LLM hallucinations
                            tool_name_mappings = {
                                "drop_columns": "execute_python_code",  # No drop_columns tool, use code
                                "select_columns": "execute_python_code",  # No select_columns tool, use code
                                "rename_columns": "execute_python_code",  # No rename_columns tool, use code
                                "create_geospatial_features": "create_interaction_features",  # No geospatial tool, use interaction features
                                "encode_categorical_variables": "encode_categorical",
                                "train_model": "train_baseline_models",
                                "train_models": "train_baseline_models",
                                "baseline_models": "train_baseline_models",
                                "tune_hyperparameters": "hyperparameter_tuning",
                                "hyperparameter_search": "hyperparameter_tuning",
                            }
                            
                            if tool_name in tool_name_mappings:
                                mapped_tool = tool_name_mappings[tool_name]
                                if mapped_tool == "execute_python_code":
                                    print(f"   ✓ Tool '{tool_name}' not available - LLM should use execute_python_code instead")
                                    # Skip and let LLM handle with code
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": tool_name,
                                        "content": json.dumps({
                                            "error": f"Tool '{tool_name}' does not exist",
                                            "hint": "Use execute_python_code with pandas to perform this operation. Example: df.drop(columns=['col1', 'col2'])"
                                        })
                                    })
                                    continue
                                else:
                                    tool_name = mapped_tool
                                    print(f"   ✓ Mapped to: {tool_name}")
                            else:
                                # Try fuzzy matching to recover
                                from difflib import get_close_matches
                                close_matches = get_close_matches(tool_name, self.tool_functions.keys(), n=1, cutoff=0.6)
                                if close_matches:
                                    tool_name = close_matches[0]
                                    print(f"   ✓ Recovered using fuzzy match: {tool_name}")
                                else:
                                    print(f"   ❌ Cannot recover tool name, skipping")
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": "invalid_tool",
                                        "content": json.dumps({
                                            "error": f"Invalid tool: {tool_call.function.name}",
                                            "message": "Tool does not exist in registry. Available tools can be found in the tools list.",
                                            "hint": "Check spelling and use exact tool names from the tools registry."
                                        })
                                    })
                                    continue
                        
                        # CRITICAL FIX 3: Check for corrupted tool names (length check)
                        if len(str(tool_call.function.name)) > 100:
                            print(f"⚠️  CORRUPTED TOOL NAME DETECTED: {str(tool_name)[:200]}")
                            # Try to extract actual tool name from garbage
                            import re
                            # Look for valid tool name pattern at the end
                            match = re.search(r'([a-z_]+)[\"\']?\s*$', str(tool_name), re.IGNORECASE)
                            if match:
                                recovered_name = match.group(1)
                                # Validate recovered tool name exists in registry
                                if recovered_name in self.tool_functions:
                                    tool_name = recovered_name
                                    print(f"✓ Recovered tool name: {tool_name}")
                                else:
                                    print(f"❌ Recovered '{recovered_name}' but it's not a valid tool")
                                    print(f"❌ Cannot recover tool name, skipping this tool call")
                                    # CRITICAL: Add tool response to maintain conversation state for Mistral API
                                    # Mistral requires messages to alternate: user -> assistant -> tool -> assistant
                                    # Skipping without adding response breaks this pattern
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": "invalid_tool",
                                        "content": json.dumps({
                                            "error": "Corrupted tool name detected",
                                            "message": "The LLM returned invalid text instead of a tool call. Please try again with a valid tool.",
                                            "hint": "Use the session context to continue from where you left off."
                                        })
                                    })
                                    continue
                            else:
                                print(f"❌ Cannot recover tool name, skipping this tool call")
                                # CRITICAL: Add tool response to maintain conversation state for Mistral API
                                # Mistral requires messages to alternate: user -> assistant -> tool -> assistant
                                # Skipping without adding response breaks this pattern
                                messages.append({
                                    "role": "tool",
                                    "tool_call_id": tool_call_id,
                                    "name": "invalid_tool",
                                    "content": json.dumps({
                                        "error": "Corrupted tool name detected",
                                        "message": "The LLM returned invalid text instead of a tool call. Please try again with a valid tool.",
                                        "hint": "Use the session context to continue from where you left off."
                                    })
                                })
                                continue
                        
                    elif self.provider == "gemini":
                        tool_name = tool_call.name
                        # Convert protobuf args to Python dict
                        tool_args = {}
                        for key, value in tool_call.args.items():
                            # Handle different protobuf value types
                            if isinstance(value, (str, int, float, bool)):
                                tool_args[key] = value
                            elif hasattr(value, '__iter__') and not isinstance(value, str):
                                # Convert lists/repeated fields
                                tool_args[key] = list(value)
                            else:
                                # Fallback: try to convert to string
                                tool_args[key] = str(value)
                        tool_call_id = f"gemini_{iteration}_{tool_name}"
                    
                    # ⚠️ WORKFLOW STATE TRACKING: Block redundant operations
                    completed_tools = [step["tool"] for step in workflow_history]
                    
                    # 🎯 COMPREHENSIVE INTENT DETECTION SYSTEM
                    # Detect user's actual intent to prevent running full pipeline for partial tasks
                    
                    task_lower = task_description.lower()
                    
                    # Define intent keywords
                    visualization_keywords = ["plot", "graph", "visualiz", "dashboard", "chart", "show", "display", "create", "generate"]
                    cleaning_keywords = ["clean", "remove missing", "handle missing", "fill missing", "impute"]
                    feature_eng_keywords = ["feature", "engineer", "create features", "add features", "extract features", "time-based"]
                    profiling_keywords = ["profile", "explore", "understand", "summarize", "describe", "report", "analysis", "overview", "insights"]
                    ml_training_keywords = ["train", "model", "predict", "forecast", "classification", "regression", "tune", "optimize", "best model"]
                    
                    # Detect what user wants (can be multiple intents)
                    wants_visualization = any(kw in task_lower for kw in visualization_keywords)
                    wants_cleaning = any(kw in task_lower for kw in cleaning_keywords)
                    wants_feature_eng = any(kw in task_lower for kw in feature_eng_keywords)
                    wants_profiling = any(kw in task_lower for kw in profiling_keywords)
                    wants_ml_training = any(kw in task_lower for kw in ml_training_keywords)
                    
                    # Negation detection - "without", "no", "don't", "skip"
                    has_negation = any(neg in task_lower for neg in ["without", "no train", "don't train", "skip train", "no model"])
                    
                    # Count how many intents detected
                    intent_count = sum([wants_visualization, wants_cleaning, wants_feature_eng, wants_profiling, wants_ml_training])
                    
                    # Multi-intent detection: "Train model + feature engineering + graphs"
                    is_multi_intent = intent_count > 1
                    
                    # Determine intent type and allowed tools
                    # 🔥 CRITICAL: ML training ALWAYS needs full pipeline + visualization
                    if wants_ml_training and not has_negation:
                        # Full ML pipeline - training requires EVERYTHING
                        user_intent = "FULL_ML_PIPELINE"
                        allowed_tool_categories = ["all"]  # Allow all tools (cleaning, features, viz, training, reports)
                        
                    elif is_multi_intent and not wants_ml_training:
                        # Multi-intent WITHOUT training (e.g., "clean and visualize")
                        user_intent = "MULTI_INTENT"
                        allowed_tool_categories = []
                        
                        # Add categories based on detected intents
                        if wants_profiling:
                            allowed_tool_categories.append("profiling")
                        if wants_cleaning:
                            # Cleaning may need profiling to identify issues
                            allowed_tool_categories.extend(["profiling", "cleaning"])
                        if wants_feature_eng:
                            # Feature engineering may need profiling for column info
                            allowed_tool_categories.extend(["profiling", "cleaning", "feature_engineering"])
                        if wants_visualization:
                            allowed_tool_categories.append("visualization")
                        
                        # Remove duplicates
                        allowed_tool_categories = list(set(allowed_tool_categories))
                        
                    elif wants_visualization and not wants_ml_training:
                        # Visualization ONLY
                        user_intent = "VISUALIZATION_ONLY"
                        allowed_tool_categories = ["visualization"]
                        
                    elif wants_cleaning and not wants_ml_training:
                        # Data cleaning ONLY
                        user_intent = "CLEANING_ONLY"
                        allowed_tool_categories = ["profiling", "cleaning"]
                        
                    elif wants_feature_eng and not wants_ml_training:
                        # Feature engineering ONLY (may need cleaning first)
                        user_intent = "FEATURE_ENGINEERING_ONLY"
                        allowed_tool_categories = ["profiling", "cleaning", "feature_engineering"]
                        
                    elif wants_profiling and not wants_ml_training:
                        # Exploratory analysis ONLY
                        user_intent = "EXPLORATORY_ANALYSIS"
                        allowed_tool_categories = ["profiling", "visualization"]
                        
                    else:
                        # Default: Full pipeline if unclear
                        user_intent = "FULL_ML_PIPELINE"
                        allowed_tool_categories = ["all"]
                    
                    # Categorize tools
                    tool_categories = {
                        "profiling": ["profile_dataset", "detect_data_quality_issues", "analyze_correlations", "get_smart_summary"],
                        "cleaning": ["clean_missing_values", "handle_outliers", "fix_data_types", "force_numeric_conversion", "smart_type_inference"],
                        "feature_engineering": ["create_time_features", "encode_categorical", "create_interaction_features", 
                                               "create_aggregation_features", "auto_feature_engineering", "create_ratio_features",
                                               "create_statistical_features", "create_log_features", "create_binned_features"],
                        "ml_training": ["train_baseline_models", "hyperparameter_tuning", "perform_cross_validation", 
                                       "auto_ml_pipeline", "train_ensemble_models"],
                        "visualization": ["generate_interactive_scatter", "generate_interactive_histogram",
                                        "generate_interactive_correlation_heatmap", "generate_interactive_box_plots",
                                        "generate_interactive_time_series", "generate_plotly_dashboard",
                                        "generate_eda_plots", "generate_all_plots", "generate_data_quality_plots"]
                    }
                    
                    # Determine if tool should be blocked
                    should_block_tool = False
                    block_reason = ""
                    
                    if "all" not in allowed_tool_categories:
                        # Find which category this tool belongs to
                        tool_category = None
                        for category, tools in tool_categories.items():
                            if tool_name in tools:
                                tool_category = category
                                break
                        
                        # Block if tool category not in allowed categories
                        if tool_category and tool_category not in allowed_tool_categories:
                            should_block_tool = True
                            block_reason = f"User intent: {user_intent} (only allows: {', '.join(allowed_tool_categories)})"
                    
                    # 🚫 BLOCK tool if it doesn't match user intent
                    if should_block_tool:
                        print(f"\n🚫 BLOCKED: {tool_name}")
                        print(f"   Task: '{task_description}'")
                        print(f"   User Intent: {user_intent}")
                        print(f"   Reason: {block_reason}")
                        print(f"   Allowed categories: {', '.join(allowed_tool_categories)}")
                        
                        # Check if user's requested task is already complete
                        task_complete = False
                        completion_summary = ""
                        
                        if user_intent == "VISUALIZATION_ONLY":
                            viz_tools_used = [t for t in completed_tools if t in tool_categories["visualization"]]
                            if viz_tools_used:
                                task_complete = True
                                completion_summary = f"✅ Visualization completed: {', '.join(viz_tools_used)}"
                        
                        elif user_intent == "CLEANING_ONLY":
                            cleaning_tools_used = [t for t in completed_tools if t in tool_categories["cleaning"]]
                            if cleaning_tools_used:
                                task_complete = True
                                completion_summary = f"✅ Data cleaning completed: {', '.join(cleaning_tools_used)}"
                        
                        elif user_intent == "FEATURE_ENGINEERING_ONLY":
                            fe_tools_used = [t for t in completed_tools if t in tool_categories["feature_engineering"]]
                            if fe_tools_used:
                                task_complete = True
                                completion_summary = f"✅ Feature engineering completed: {', '.join(fe_tools_used)}"
                        
                        elif user_intent == "EXPLORATORY_ANALYSIS":
                            analysis_tools_used = [t for t in completed_tools if t in tool_categories["profiling"] or t in tool_categories["visualization"]]
                            if analysis_tools_used:
                                task_complete = True
                                completion_summary = f"✅ Exploratory analysis completed: {', '.join(analysis_tools_used)}"
                        
                        if task_complete:
                            print(f"   {completion_summary}")
                            
                            final_summary = (
                                f"{completion_summary}\n\n"
                                f"Task: {task_description}\n"
                                f"Intent: {user_intent}\n\n"
                                f"Tools executed:\n"
                                f"{chr(10).join(['- ' + tool for tool in completed_tools])}\n\n"
                                f"Check ./outputs/ for results."
                            )
                            
                            return {
                                "status": "completed",
                                "summary": final_summary,
                                "workflow_history": workflow_history,
                                "iterations": iteration,
                                "api_calls": self.api_calls_made,
                                "execution_time": round(time.time() - start_time, 2)
                            }
                        
                        # Build guidance for LLM based on intent
                        if user_intent == "VISUALIZATION_ONLY":
                            next_step_guidance = (
                                f"✅ YOUR NEXT CALL MUST BE a visualization tool:\n"
                                f"   - generate_interactive_scatter\n"
                                f"   - generate_plotly_dashboard\n"
                                f"   - generate_eda_plots\n"
                            )
                        elif user_intent == "CLEANING_ONLY":
                            next_step_guidance = (
                                f"✅ YOUR NEXT CALL should be a cleaning tool:\n"
                                f"   - clean_missing_values\n"
                                f"   - handle_outliers\n"
                                f"   - fix_data_types\n"
                                f"Then STOP (no training!)"
                            )
                        elif user_intent == "FEATURE_ENGINEERING_ONLY":
                            next_step_guidance = (
                                f"✅ YOUR NEXT CALL should be a feature engineering tool:\n"
                                f"   - create_time_features\n"
                                f"   - encode_categorical\n"
                                f"   - create_interaction_features\n"
                                f"Then STOP (no training!)"
                            )
                        elif user_intent == "EXPLORATORY_ANALYSIS":
                            next_step_guidance = (
                                f"✅ YOUR NEXT CALL should be profiling or visualization:\n"
                                f"   - profile_dataset\n"
                                f"   - generate_eda_plots\n"
                                f"   - analyze_correlations\n"
                                f"Then STOP (no training!)"
                            )
                        else:
                            next_step_guidance = "Continue with appropriate tools for the task."
                        
                        # Send blocking message to LLM
                        block_warning = {
                            "role": "user",
                            "content": (
                                f"🚫 BLOCKED: '{tool_name}' does not match user intent!\n\n"
                                f"Task: '{task_description}'\n"
                                f"Detected Intent: {user_intent}\n"
                                f"Allowed: {', '.join(allowed_tool_categories)}\n"
                                f"Blocked: {tool_name} (category: {tool_category if 'tool_category' in locals() else 'unknown'})\n\n"
                                f"{next_step_guidance}\n\n"
                                f"DO NOT call blocked tools. Proceed with allowed tools only!"
                            )
                        }
                        
                        # Track blocking
                        workflow_history.append({
                            "step": len(workflow_history) + 1,
                            "tool": "BLOCKED",
                            "blocked_tool": tool_name,
                            "reason": block_reason,
                            "user_intent": user_intent
                        })
                        
                        # CRITICAL: Add mock tool response to maintain message balance
                        if self.provider in ["mistral", "groq"]:
                            messages.append({
                                "role": "tool",
                                "tool_call_id": tool_call_id,
                                "name": tool_name,
                                "content": json.dumps({"blocked": True, "reason": block_reason})
                            })
                        elif self.provider == "gemini":
                            messages.append({
                                "role": "tool",
                                "name": tool_name,
                                "content": json.dumps({"blocked": True, "reason": block_reason})
                            })
                        
                        messages.append(block_warning)
                        continue
                    
                    # CRITICAL: Block execute_python_code if it's doing encoding/time features
                    if tool_name == "execute_python_code":
                        code = tool_args.get("code", "")
                        
                        # ✅ ALLOW: Data cleanup (dropping columns, fixing types, etc.)
                        is_cleanup = any(pattern in code.lower() for pattern in [
                            "drop(columns=", "drop_duplicates", "fillna", "dropna",
                            "select_dtypes", ".drop(", "errors='ignore'"
                        ])
                        
                        # Block if trying to do encoding (pd.get_dummies, one-hot, etc.) - UNLESS it's cleanup
                        if any(pattern in code.lower() for pattern in ["get_dummies", "onehot", "one-hot", "one_hot"]):
                            if "encode_categorical" in completed_tools and not is_cleanup:
                                print(f"\n🚫 BLOCKED: execute_python_code attempting to re-encode!")
                                print(f"   encode_categorical already completed. Skipping this call.")
                                print(f"   Using existing file: {self._get_last_successful_file(workflow_history)}")
                                
                                block_warning = {
                                    "role": "user",
                                    "content": (
                                        f"🚫 BLOCKED: You tried to use execute_python_code for encoding, but encode_categorical ALREADY completed!\n\n"
                                        f"Encoding is DONE. The file exists: {self._get_last_successful_file(workflow_history)}\n\n"
                                        f"MOVE TO NEXT STEP: generate_eda_plots OR train_baseline_models\n\n"
                                        f"DO NOT:\n"
                                        f"- Call execute_python_code for encoding\n"
                                        f"- Call encode_categorical again\n"
                                        f"- Repeat any completed step\n\n"
                                        f"PROCEED to the next workflow step immediately!"
                                    )
                                }
                                # CRITICAL: Add mock tool response
                                if self.provider in ["mistral", "groq"]:
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": tool_name,
                                        "content": json.dumps({"blocked": True, "reason": "Encoding already done"})
                                    })
                                elif self.provider == "gemini":
                                    messages.append({
                                        "role": "tool",
                                        "name": tool_name,
                                        "content": json.dumps({"blocked": True, "reason": "Encoding already done"})
                                    })
                                messages.append(block_warning)
                                continue
                        
                        # Block if trying to do time feature extraction - UNLESS it's cleanup
                        if any(pattern in code.lower() for pattern in ["dt.year", "dt.month", "dt.day", "dt.hour", "strptime", "to_datetime"]):
                            if "create_time_features" in completed_tools and not is_cleanup:
                                print(f"\n🚫 BLOCKED: execute_python_code attempting time feature extraction!")
                                print(f"   create_time_features already completed. Skipping this call.")
                                
                                block_warning = {
                                    "role": "user",
                                    "content": (
                                        f"🚫 BLOCKED: You tried to use execute_python_code for time features, but create_time_features ALREADY completed!\n\n"
                                        f"Time features are DONE. Use the existing file: {self._get_last_successful_file(workflow_history)}\n\n"
                                        f"MOVE TO NEXT STEP: encode_categorical\n\n"
                                        f"DO NOT call execute_python_code for time feature extraction!"
                                    )
                                }
                                # CRITICAL: Add mock tool response
                                if self.provider in ["mistral", "groq"]:
                                    messages.append({
                                        "role": "tool",
                                        "tool_call_id": tool_call_id,
                                        "name": tool_name,
                                        "content": json.dumps({"blocked": True, "reason": "Time features already extracted"})
                                    })
                                elif self.provider == "gemini":
                                    messages.append({
                                        "role": "tool",
                                        "name": tool_name,
                                        "content": json.dumps({"blocked": True, "reason": "Time features already extracted"})
                                    })
                                messages.append(block_warning)
                                continue
                    
                    # CRITICAL: Block create_time_features if already called for both datetime columns
                    if tool_name == "create_time_features":
                        time_feature_calls = [step for step in workflow_history if step["tool"] == "create_time_features"]
                        if len(time_feature_calls) >= 2:  # Already called for 'time' and 'updated'
                            print(f"\n🚫 BLOCKED: create_time_features already called {len(time_feature_calls)} times!")
                            print(f"   Time features extracted for all datetime columns. Skipping.")
                            
                            block_warning = {
                                "role": "user",
                                "content": (
                                    f"🚫 BLOCKED: create_time_features already called {len(time_feature_calls)} times!\n\n"
                                    f"Time features extraction is COMPLETE for all datetime columns ('time' and 'updated').\n\n"
                                    f"MOVE TO NEXT STEP: encode_categorical\n\n"
                                    f"DO NOT call create_time_features again!"
                                )
                            }
                            # CRITICAL: Add mock tool response
                            if self.provider in ["mistral", "groq"]:
                                messages.append({
                                    "role": "tool",
                                    "tool_call_id": tool_call_id,
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Time features already extracted"})
                                })
                            elif self.provider == "gemini":
                                messages.append({
                                    "role": "tool",
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Time features already extracted"})
                                })
                            messages.append(block_warning)
                            continue
                    
                    # CRITICAL: Block encode_categorical if already completed
                    if tool_name == "encode_categorical":
                        if "encode_categorical" in completed_tools:
                            print(f"\n🚫 BLOCKED: encode_categorical already completed!")
                            print(f"   Categorical encoding is DONE. Skipping.")
                            
                            block_warning = {
                                "role": "user",
                                "content": (
                                    f"🚫 BLOCKED: encode_categorical ALREADY completed!\n\n"
                                    f"Encoding is DONE. Use file: {self._get_last_successful_file(workflow_history)}\n\n"
                                    f"MOVE TO NEXT STEP: generate_eda_plots\n\n"
                                    f"DO NOT call encode_categorical again!"
                                )
                            }
                            # CRITICAL: Add mock tool response
                            if self.provider in ["mistral", "groq"]:
                                messages.append({
                                    "role": "tool",
                                    "tool_call_id": tool_call_id,
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Categorical encoding already done"})
                                })
                            elif self.provider == "gemini":
                                messages.append({
                                    "role": "tool",
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Categorical encoding already done"})
                                })
                            messages.append(block_warning)
                            continue
                    
                    # CRITICAL: Block smart_type_inference after encoding (data is ready!)
                    if tool_name == "smart_type_inference":
                        if "encode_categorical" in completed_tools or "execute_python_code" in completed_tools:
                            print(f"\n🚫 BLOCKED: smart_type_inference after encoding!")
                            print(f"   Data is already encoded and ready. Skipping type inference.")
                            
                            block_warning = {
                                "role": "user",
                                "content": (
                                    f"🚫 BLOCKED: smart_type_inference is NOT needed after encoding!\n\n"
                                    f"The data is already encoded and ready for modeling.\n\n"
                                    f"MOVE TO NEXT STEP: generate_eda_plots OR train_baseline_models\n\n"
                                    f"DO NOT call smart_type_inference after encoding!"
                                )
                            }
                            # CRITICAL: Add mock tool response
                            if self.provider in ["mistral", "groq"]:
                                messages.append({
                                    "role": "tool",
                                    "tool_call_id": tool_call_id,
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Type inference not needed after encoding"})
                                })
                            elif self.provider == "gemini":
                                messages.append({
                                    "role": "tool",
                                    "name": tool_name,
                                    "content": json.dumps({"blocked": True, "reason": "Type inference not needed after encoding"})
                                })
                            messages.append(block_warning)
                            continue
                    
                    # ⚠️ LOOP DETECTION: Prevent calling the same tool multiple times in a row
                    # EXCEPTION: Don't apply loop detection for execute_python_code in code-only tasks
                    tool_call_counter[tool_name] = tool_call_counter.get(tool_name, 0) + 1
                    
                    # Detect if this is a code-only task (no ML workflow tools used)
                    ml_tools = ["profile_dataset", "detect_data_quality_issues", "clean_missing_values", 
                               "encode_categorical", "train_baseline_models"]
                    is_code_only_task = not any(tool in completed_tools for tool in ml_tools)
                    
                    # Skip loop detection for execute_python_code in code-only tasks
                    should_check_loops = not (is_code_only_task and tool_name == "execute_python_code")
                    
                    # AGGRESSIVE: For execute_python_code with same args, detect after 1 retry
                    loop_threshold = 2
                    if tool_name == "execute_python_code":
                        # Check if same code being executed repeatedly
                        if workflow_history:
                            last_exec_steps = [s for s in workflow_history if s["tool"] == "execute_python_code"]
                            if len(last_exec_steps) >= 1:
                                last_code = last_exec_steps[-1].get("arguments", {}).get("code", "")
                                current_code = tool_args.get("code", "")
                                # If same/similar code, be more aggressive
                                if last_code and current_code and len(set(last_code.split()) & set(current_code.split())) > len(current_code.split()) * 0.7:
                                    loop_threshold = 1  # Stop after first retry with similar code
                                    print(f"⚠️  Detected repeated similar code execution")
                    
                    # 🔥 FIX: Check if arguments are DIFFERENT from last call
                    # If the same tool is called with different arguments, it's NOT a loop
                    # (e.g., generating multiple different plots is legitimate)
                    is_same_args = False
                    if workflow_history and workflow_history[-1]["tool"] == tool_name:
                        last_args = workflow_history[-1].get("arguments", {})
                        # Compare key arguments (ignore output paths which may differ)
                        ignore_keys = {"output_path", "output_dir"}
                        last_key_args = {k: v for k, v in last_args.items() if k not in ignore_keys}
                        current_key_args = {k: v for k, v in tool_args.items() if k not in ignore_keys}
                        is_same_args = (last_key_args == current_key_args)
                    
                    # Check for loops (same tool called threshold+ times consecutively WITH SAME ARGS)
                    if should_check_loops and tool_call_counter[tool_name] >= loop_threshold:
                        # Only flag as loop if last call was same tool WITH same arguments
                        if workflow_history and workflow_history[-1]["tool"] == tool_name and is_same_args:
                            print(f"\n⚠️  LOOP DETECTED: {tool_name} called {tool_call_counter[tool_name]} times consecutively!")
                            print(f"   This indicates the workflow is stuck. Skipping and forcing progression.")
                            print(f"   Last successful file: {self._get_last_successful_file(workflow_history)}")
                            
                            # Check if we've completed the main workflow (reports generated)
                            completed_tools = [step["tool"] for step in workflow_history]
                            reports_generated = any(tool in completed_tools for tool in [
                                "generate_combined_eda_report", 
                                "generate_plotly_dashboard",
                                "generate_ydata_profiling_report"
                            ])
                            training_done = "train_baseline_models" in completed_tools
                            
                            # If reports done and we're looping, mark as complete
                            if reports_generated and training_done:
                                print(f"   ✅ Main workflow complete. Marking as DONE.")
                                final_summary = (
                                    f"Analysis completed successfully! Main steps finished:\n"
                                    f"- Data profiling and cleaning\n"
                                    f"- Model training ({completed_tools.count('train_baseline_models')} models trained)\n"
                                    f"- {'Hyperparameter tuning' if 'hyperparameter_tuning' in completed_tools else 'Baseline models'}\n"
                                    f"- Comprehensive reports generated\n"
                                    f"- Interactive visualizations created\n\n"
                                    f"Check ./outputs/ for all results."
                                )
                                
                                return {
                                    "status": "completed",
                                    "summary": final_summary,
                                    "workflow_history": workflow_history,
                                    "iterations": iteration,
                                    "api_calls": self.api_calls_made,
                                    "execution_time": round(time.time() - start_time, 2)
                                }
                            
                            # Otherwise, force LLM to move on with VERY STRONG warning
                            next_step = self._determine_next_step(tool_name, completed_tools)
                            
                            # 🎯 If data prep is done but no training yet, push toward modeling
                            prep_done = any(t in completed_tools for t in ["encode_categorical", "create_time_features", "clean_missing_values"])
                            no_training = "train_baseline_models" not in completed_tools
                            if prep_done and no_training and target_col:
                                next_step = f"train_baseline_models (target_col='{target_col}') - Data preparation complete, proceed to modeling!"
                            
                            # CRITICAL: Add mock tool response to maintain message balance
                            # (Mistral API requires: every tool call must have a matching tool response)
                            if self.provider in ["mistral", "groq"]:
                                messages.append({
                                    "role": "tool",
                                    "tool_call_id": tool_call_id,
                                    "name": tool_name,
                                    "content": json.dumps({
                                        "blocked": True,
                                        "reason": f"Loop detected: {tool_name} called {tool_call_counter[tool_name]} times consecutively",
                                        "last_successful_file": self._get_last_successful_file(workflow_history)
                                    })
                                })
                            elif self.provider == "gemini":
                                messages.append({
                                    "role": "tool",
                                    "name": tool_name,
                                    "content": json.dumps({
                                        "blocked": True,
                                        "reason": f"Loop detected: {tool_name} called {tool_call_counter[tool_name]} times consecutively"
                                    })
                                })
                            
                            loop_warning = {
                                "role": "user",
                                "content": (
                                    f"🚨 CRITICAL ERROR: You are STUCK IN A LOOP! 🚨\n\n"
                                    f"You called '{tool_name}' {tool_call_counter[tool_name]} times consecutively.\n"
                                    f"This step is ALREADY COMPLETE (✓ Completed shown above).\n\n"
                                    f"**DO NOT call {tool_name} again!**\n"
                                    f"**DO NOT call execute_python_code for the same task!**\n\n"
                                    f"NEXT STEP: {next_step}\n\n"
                                    f"Last successful output file: {self._get_last_successful_file(workflow_history)}\n"
                                    f"Use this file and proceed to the NEXT step immediately.\n\n"
                                    f"Remember:\n"
                                    f"- If a tool succeeds (✓ Completed) → NEVER call it again\n"
                                    f"- Do NOT use execute_python_code for tasks that have dedicated tools\n"
                                    f"- Follow the workflow: Steps 1→2→3→...→15 (ONE TIME EACH)"
                                )
                            }
                            messages.append(loop_warning)
                            continue  # Skip this tool call
                    
                    print(f"\n🔧 Executing: {tool_name}")
                    try:
                        print(f"   Arguments: {json.dumps(tool_args, indent=2)}")
                    except:
                        print(f"   Arguments: {tool_args}")
                    
                    # Emit progress event for SSE streaming using session UUID
                    if hasattr(self, 'session') and self.session:
                        session_id = self.session.session_id
                        print(f"[SSE] EMIT tool_executing: session={session_id}, tool={tool_name}")
                        progress_manager.emit(session_id, {
                            'type': 'tool_executing',
                            'tool': tool_name,
                            'message': f"🔧 Executing: {tool_name}",
                            'arguments': tool_args
                        })
                    
                    # Execute tool
                    tool_result = self._execute_tool(tool_name, tool_args)
                    
                    # 📂 CHECKPOINT: Save progress after successful tool execution
                    if tool_result.get("success", True):
                        session_id = self.http_session_key or "default"
                        self.recovery_manager.checkpoint_manager.save_checkpoint(
                            session_id=session_id,
                            workflow_state={
                                'iteration': iteration,
                                'workflow_history': workflow_history,
                                'current_file': file_path,
                                'task_description': task_description,
                                'target_col': target_col
                            },
                            last_tool=tool_name,
                            iteration=iteration
                        )
                    
                    # Check for errors and display them prominently
                    if not tool_result.get("success", True):
                        error_msg = tool_result.get("error", "Unknown error")
                        error_type = tool_result.get("error_type", "Error")
                        print(f"   ❌ FAILED: {tool_name}")
                        print(f"   ⚠️  Error Type: {error_type}")
                        print(f"   ⚠️  Error Message: {error_msg}")
                        
                        # Emit failure event for SSE streaming
                        if hasattr(self, 'session') and self.session:
                            progress_manager.emit(self.session.session_id, {
                                'type': 'tool_failed',
                                'tool': tool_name,
                                'message': f"❌ FAILED: {tool_name}",
                                'error': error_msg,
                                'error_type': error_type
                            })
                        
                        # Add recovery guidance with last successful file
                        last_successful_file = self._get_last_successful_file(workflow_history)
                        if last_successful_file:
                            tool_result["recovery_guidance"] = (
                                f"This tool failed. Use the last successful file for next steps: {last_successful_file}\n"
                                f"Do NOT try to use the failed tool's output file."
                            )
                            print(f"   🔄 Recovery: Use {last_successful_file} for next step")
                        
                        # Special handling for execute_python_code errors
                        if tool_name == "execute_python_code":
                            stderr = tool_result.get("stderr", "")
                            hints = tool_result.get("hints", [])
                            
                            if stderr:
                                print(f"   📄 Code Error Details:")
                                # Show last 10 lines of stderr (most relevant)
                                stderr_lines = stderr.split('\n')[-10:]
                                for line in stderr_lines:
                                    if line.strip():
                                        print(f"      {line}")
                            
                            if hints:
                                print(f"   💡 Suggestions:")
                                for hint in hints:
                                    print(f"      {hint}")
                            
                            # Add suggestion to use specialized tools instead
                            if error_type in ["PermissionError", "FileNotFoundError", "KeyError"]:
                                tool_result["suggestion"] = (
                                    f"Consider using specialized tools instead of execute_python_code:\n"
                                    f"- For file operations: use clean_missing_values(), encode_categorical(), etc.\n"
                                    f"- For data transformations: use create_ratio_features(), create_statistical_features(), etc.\n"
                                    f"- Specialized tools are more robust and handle edge cases better!"
                                )
                        
                        # Extract helpful info from common errors and add to result
                        if "Column" in error_msg and "not found" in error_msg and "Available columns:" in error_msg:
                            # Extract the column that was searched for and available columns
                            import re
                            searched = re.search(r"Column '([^']+)' not found", error_msg)
                            available = re.search(r"Available columns: (.+?)(?:\n|$)", error_msg)
                            if searched and available:
                                searched_col = searched.group(1)
                                available_cols = [c.strip() for c in available.group(1).split(',')]
                                
                                # Find similar column names (case-insensitive partial match)
                                suggestions = []
                                searched_lower = searched_col.lower()
                                for col in available_cols[:20]:  # Check first 20
                                    if searched_lower in col.lower() or col.lower() in searched_lower:
                                        suggestions.append(col)
                                
                                if suggestions:
                                    tool_result["suggestion"] = f"Did you mean: {suggestions[0]}? (Similar columns: {', '.join(suggestions[:3])})"
                                    print(f"   💡 HINT: Did you mean '{suggestions[0]}'?")
                        
                        # For critical tools, show detailed error to user
                        if tool_name in ["train_baseline_models", "auto_ml_pipeline"]:
                            print(f"\n🔴 CRITICAL ERROR in {tool_name}:")
                            print(f"   {error_msg}\n")
                    else:
                        print(f"   ✓ Completed: {tool_name}")
                        
                        # Emit completion event for SSE streaming
                        if hasattr(self, 'session') and self.session:
                            progress_manager.emit(self.session.session_id, {
                                'type': 'tool_completed',
                                'tool': tool_name,
                                'message': f"✓ Completed: {tool_name}"
                            })
                    
                    # Track in workflow
                    workflow_history.append({
                        "iteration": iteration,
                        "tool": tool_name,
                        "arguments": tool_args,
                        "result": tool_result
                    })
                    
                    # 🤝 INTER-AGENT COMMUNICATION: Check if should hand off to specialist
                    if not self.use_compact_prompts:  # Only for multi-agent mode
                        completed_tool_names = [step["tool"] for step in workflow_history]
                        target_agent = self._should_hand_off(
                            current_agent=self.active_agent,
                            completed_tools=completed_tool_names,
                            workflow_history=workflow_history
                        )
                        
                        if target_agent:
                            hand_off_result = self._hand_off_to_agent(
                                target_agent=target_agent,
                                context={
                                    "completed_tools": completed_tool_names,
                                    "reason": "Workflow progression - ready for next phase"
                                },
                                iteration=iteration
                            )
                            
                            if hand_off_result["success"]:
                                # Update tools for new agent
                                tools_to_use = hand_off_result["new_tools"]
                                
                                # Update system prompt for new agent
                                messages[0] = {"role": "system", "content": hand_off_result["system_prompt"]}
                                
                                # 📝 Record hand-off in reasoning trace
                                self.reasoning_trace.record_agent_handoff(
                                    from_agent=hand_off_result["old_agent"],
                                    to_agent=hand_off_result["new_agent"],
                                    reason="Workflow progression - ready for next phase",
                                    iteration=iteration
                                )
                    
                    # 🗂️ UPDATE WORKFLOW STATE (reduces need to send full history to LLM)
                    self._update_workflow_state(tool_name, tool_result)
                    
                    # ⚡ CRITICAL FIX: Add tool result back to messages so LLM sees it in next iteration!
                    if self.provider in ["mistral", "groq"]:
                        # For Mistral/Groq, add tool message with the result
                        # **COMPRESS RESULT** for small context models
                        clean_tool_result = self._make_json_serializable(tool_result)
                        
                        # Smart compression: Keep only what LLM needs for next decision
                        compressed_result = self._compress_tool_result(tool_name, clean_tool_result)
                        tool_response_content = json.dumps(compressed_result)
                        
                        # If tool failed, prepend ERROR indicator to make it obvious
                        if not tool_result.get("success", True):
                            error_msg = tool_result.get("error", "Unknown error")
                            suggestion = tool_result.get("suggestion", "")
                            
                            # Create VERY EXPLICIT error message
                            tool_response_content = json.dumps({
                                "❌ TOOL_FAILED": True,
                                "tool_name": tool_name,
                                "error": error_msg,
                                "suggestion": suggestion,
                                "⚠️ ACTION_REQUIRED": f"RETRY {tool_name} with corrected parameters. Do NOT call other tools first!",
                                "💡 HINT": suggestion if suggestion else "Check error message for details"
                            })
                        
                        messages.append({
                            "role": "tool",
                            "tool_call_id": tool_call_id,
                            "name": tool_name,
                            "content": tool_response_content
                        })
                    
                    elif self.provider == "gemini":
                        # For Gemini, add to messages for history tracking
                        # Gemini uses function responses differently but we still track
                        # Clean tool_result to make it JSON-serializable
                        clean_tool_result = self._make_json_serializable(tool_result)
                        tool_response_content = json.dumps(clean_tool_result)
                        
                        # If tool failed, make error VERY explicit
                        if not tool_result.get("success", True):
                            error_msg = tool_result.get("error", "Unknown error")
                            suggestion = tool_result.get("suggestion", "")
                            
                            tool_response_content = json.dumps({
                                "❌ TOOL_FAILED": True,
                                "tool_name": tool_name,
                                "error": error_msg,
                                "suggestion": suggestion,
                                "⚠️ ACTION_REQUIRED": f"RETRY {tool_name} with corrected parameters",
                                "💡 HINT": suggestion if suggestion else "Check error message"
                            })
                        
                        messages.append({
                            "role": "tool",
                            "name": tool_name,
                            "content": tool_response_content
                        })
                    
                    # Debug: Check if training completed
                    if tool_name == "train_baseline_models":
                        print(f"[DEBUG] train_baseline_models executed!")
                        print(f"[DEBUG]   tool_result keys: {list(tool_result.keys())}")
                        print(f"[DEBUG]   'best_model' in tool_result: {'best_model' in tool_result}")
                        if isinstance(tool_result, dict) and 'result' in tool_result:
                            print(f"[DEBUG]   Nested result keys: {list(tool_result['result'].keys()) if isinstance(tool_result['result'], dict) else 'Not a dict'}")
                            print(f"[DEBUG]   'best_model' in nested result: {'best_model' in tool_result['result'] if isinstance(tool_result['result'], dict) else False}")
                        if "best_model" in tool_result:
                            print(f"[DEBUG]   best_model value: {tool_result['best_model']}")
                    
                    # AUTO-FINISH DISABLED: Let agent complete full workflow including EDA reports
                    # Previously auto-finish would exit immediately after training, preventing
                    # report generation. Now the agent continues to generate visualizations and reports.
            
            except Exception as e:
                import traceback
                error_traceback = traceback.format_exc()
                error_str = str(e)
                
                # Log the actual error for debugging
                print(f"❌ ERROR in analyze loop: {e}")
                print(f"   Error type: {type(e).__name__}")
                print(f"   Full error: {error_str}")
                print(f"   Traceback:\n{error_traceback}")
                
                # Handle rate limit errors with retry (be more specific to avoid false positives)
                if ("429" in error_str or 
                    "Resource has been exhausted" in error_str or
                    "quota exceeded" in error_str.lower()):
                    
                    retry_delay = 10
                    if "retry after" in error_str.lower():
                        import re
                        match = re.search(r'retry after (\d+)', error_str.lower())
                        if match:
                            retry_delay = min(int(match.group(1)) + 2, 15)
                    
                    print(f"⏳ Rate limit detected (429/quota). Waiting {retry_delay}s before retry...")
                    time.sleep(retry_delay)
                    iteration -= 1
                    continue
                
                # For other errors, don't retry - just report and continue
                print(f"   Traceback:\n{error_traceback}")
                
                # 🧠 Save session even on error
                if self.session:
                    self.session.add_conversation(task_description, f"Error: {str(e)}")
                    self.session_store.save(self.session)
                
                return {
                    "status": "error",
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "traceback": error_traceback,
                    "workflow_history": workflow_history,
                    "iterations": iteration
                }
        
        # Max iterations reached
        # 🧠 Save session
        if self.session:
            self.session.add_conversation(task_description, "Workflow incomplete - max iterations reached")
            self.session_store.save(self.session)
        
        return {
            "status": "incomplete",
            "message": f"Reached maximum iterations ({max_iterations})",
            "workflow_history": workflow_history,
            "iterations": iteration
        }
    
    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        return self.cache.get_stats()
    
    def clear_cache(self) -> None:
        """Clear all cached results."""
        self.cache.clear_all()
    
    def get_session_id(self) -> Optional[str]:
        """Get current session ID."""
        return self.session.session_id if self.session else None
    
    def clear_session(self) -> None:
        """Clear current session context (start fresh)."""
        if self.session:
            self.session.clear()
            print("✅ Session context cleared")
        else:
            print("⚠️  No active session")
    
    def get_session_context(self) -> str:
        """Get human-readable session context summary."""
        if self.session:
            return self.session.get_context_summary()
        else:
            return "No active session"