Spaces:

CliDyn
/

Eurus

Runtime error

File size: 30,177 Bytes

ab07cb1

#!/usr/bin/env python3
"""
QA Runner — Automated End-to-End Agent Testing
===============================================
Runs test queries through the Eurus agent, captures ALL intermediate steps
(tool calls, tool outputs, reasoning, plots) and saves structured results
to data/qa_results/q{NN}_{slug}/.

Usage:
    PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py
    
Or run a single query:
    PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py --query 2
"""

import os
import sys
import json
import shutil
import base64
import time
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional

# Ensure eurus package is importable
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
sys.path.insert(0, str(PROJECT_ROOT))

# Load .env (API keys)
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

from langchain_openai import ChatOpenAI
from langchain.agents import create_agent
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage

from eurus.config import AGENT_SYSTEM_PROMPT, CONFIG, get_plots_dir
from eurus.tools import get_all_tools

# ============================================================================
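# QA TEST QUERIES — 36 research-grade demo queries
#
# Each entry is a dict. run_single_query() requires "id", "slug" and "query";
# "type", "variables" and "region" are optional and are copied into
# metadata.json for later filtering.
#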
#  §1  Synoptic Meteorology & Case Studies         (Q01–Q05)
#  §2  Climate Variability & Teleconnections        (Q06–Q10)
#  §3  Trends & Climate Change Signals              (Q11–Q15)
#  §4  Extreme Events & Risk                        (Q16–Q20)
#  §5  Maritime & Shipping                          (Q21–Q24)
#  §6  Energy Assessment                            (Q25–Q28)
#  §7  Diurnal & Sub-Daily Processes                (Q29–Q30)
#  §8  Multi-Variable & Diagnostics                 (Q31–Q33)
#  §9  Quick Lookups                                (Q34–Q36)
# ============================================================================

QA_QUERIES = [
    # ═══════════════════════════════════════════════════════════════
    #  §1 — Synoptic Meteorology & Case Studies
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 1,
        "slug": "europe_heatwave_anomaly",
        "query": "Show me a spatial map of 2m temperature anomalies across Europe "
                 "during the June 2023 heatwave compared to June 2022.",
        "type": "anomaly_map",
        "variables": ["t2"],
        "region": "Europe",
    },
    {
        "id": 2,
        "slug": "storm_isha_mslp_wind",
        "query": "Plot MSLP isobars and 10m wind vectors over the North Atlantic "
                 "for 2024-01-22 — I want to see Storm Isha's structure.",
        "type": "contour_quiver",
        "variables": ["mslp", "u10", "v10"],
        "region": "North Atlantic",
    },
    {
        "id": 3,
        "slug": "atmospheric_river_jan2023",
        "query": "Download total column water vapour for the US West Coast, Jan 2023, "
                 "and show the atmospheric river event around Jan 9th.",
        "type": "ar_detection",
        "variables": ["tcwv"],
        "region": "US West Coast",
    },
    {
        "id": 4,
        "slug": "sahara_heat_july2024",
        "query": "Plot the daily mean 2m temperature time series averaged over "
                 "the Sahara (20-30°N, 0 to 15°E) for July 2024 and compare "
                 "it to July 2023 on the same chart.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Sahara",
    },
    {
        "id": 5,
        "slug": "great_plains_wind_may2024",
        "query": "Plot a map of mean 10m wind speed over the US Great Plains "
                 "(30-45°N, -105 to -90°W) for May 2024 and highlight areas exceeding 5 m/s.",
        "type": "threshold_map",
        "variables": ["u10", "v10"],
        "region": "US Great Plains",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §2 — Climate Variability & Teleconnections
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 6,
        "slug": "nino34_index",
        "query": "Calculate the Niño 3.4 index from ERA5 SST for 2015-2024 and "
                 "classify El Niño / La Niña episodes.",
        "type": "climate_index",
        "variables": ["sst"],
        "region": "Tropical Pacific",
    },
    {
        "id": 7,
        "slug": "elnino_vs_lanina_tropical_belt",
        "query": "Compare SST anomalies across the entire tropical belt "
                 "(30°S-30°N, global) for December 2023 (peak El Niño) vs December 2022 "
                 "(La Niña). Show the full basin-wide pattern across the Pacific, "
                 "Atlantic, and Indian oceans in a single anomaly difference map.",
        "type": "anomaly_comparison",
        "variables": ["sst"],
        "region": "Tropical Belt (global)",
    },
    {
        "id": 8,
        "slug": "nao_index",
        "query": "Compute the NAO index from MSLP (Azores minus Iceland) for 2000-2024 "
                 "and plot it with a 3-month rolling mean.",
        "type": "climate_index",
        "variables": ["mslp"],
        "region": "North Atlantic",
    },
    {
        "id": 9,
        "slug": "australia_enso_rainfall",
        "query": "Compare precipitation over Eastern Australia (25-45°S, 145-155°E) "
                 "between the La Niña year 2022 and El Niño year 2023. "
                 "Show a two-panel map of annual total precipitation for each year "
                 "and a difference map (2023 minus 2022).",
        "type": "multi_year_anomaly",
        "variables": ["tp"],
        "region": "Australia",
    },
    {
        "id": 10,
        "slug": "med_eof_sst",
        "query": "Perform an EOF analysis on Mediterranean SST anomalies "
                 "(30-46°N, -6 to 36°E) for 2019-2024 and show the first 3 modes "
                 "with variance explained. Interpret the dominant patterns.",
        "type": "eof_analysis",
        "variables": ["sst"],
        "region": "Mediterranean",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §3 — Trends & Climate Change Signals
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 11,
        "slug": "arctic_polar_amplification",
        "query": "Compare January mean 2m temperature across the entire Arctic "
                 "(north of 70°N) for 2024 vs 2000. Show both maps side by side, "
                 "compute the area-weighted temperature difference, and quantify "
                 "polar amplification.",
        "type": "decadal_comparison",
        "variables": ["t2"],
        "region": "Arctic (>70°N)",
    },
    {
        "id": 12,
        "slug": "med_marine_heatwave_2023",
        "query": "Map the summer (JJA) 2023 mean SST anomaly across the entire "
                 "Mediterranean basin (30-46°N, -6 to 36°E) compared to the 2018-2022 "
                 "summer mean. Identify marine heatwave hotspots where SST exceeded "
                 "+2°C above normal.",
        "type": "marine_heatwave",
        "variables": ["sst"],
        "region": "Mediterranean",
    },
    {
        "id": 13,
        "slug": "paris_decadal_comparison",
        "query": "Compare the average summer (JJA) temperature in Paris between the "
                 "decades 2000-2009 and 2014-2023 — show a difference map and time series.",
        "type": "multi_panel_comparison",
        "variables": ["t2"],
        "region": "Paris",
    },
    {
        "id": 14,
        "slug": "alps_snow_trend",
        "query": "Has the snow depth over the Alps decreased over the last 30 years? "
                 "Show me the December-February trend.",
        "type": "trend_analysis",
        "variables": ["sd"],
        "region": "Alps",
    },
    {
        "id": 15,
        "slug": "uk_precip_anomaly_winter2024",
        "query": "Map the total precipitation anomaly over the British Isles "
                 "(49-60°N, 11°W-2°E) for January 2024 compared to the 2019-2023 "
                 "January mean. Highlight regions receiving more than 150% of normal "
                 "rainfall. Save the map as a PNG file.",
        "type": "anomaly_map",
        "variables": ["tp"],
        "region": "British Isles",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §4 — Extreme Events & Risk
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 16,
        "slug": "delhi_heatwave_detection",
        "query": "Detect heatwave events in Delhi from 2010-2024 using the 90th "
                 "percentile threshold with a 3-day duration criterion — how has the "
                 "frequency changed?",
        "type": "heatwave_detection",
        "variables": ["t2"],
        "region": "Delhi",
    },
    {
        "id": 17,
        "slug": "horn_africa_drought",
        "query": "Calculate a 3-month SPI proxy for the Horn of Africa "
                 "(Ethiopia/Somalia) for 2020-2024 — when were the worst drought periods?",
        "type": "drought_analysis",
        "variables": ["tp"],
        "region": "Horn of Africa",
    },
    {
        "id": 18,
        "slug": "baghdad_hot_days",
        "query": "How many days per year exceeded 35°C in Baghdad from 1980 to 2024? "
                 "Plot as a bar chart with a trend line.",
        "type": "exceedance_frequency",
        "variables": ["t2"],
        "region": "Baghdad",
    },
    {
        "id": 19,
        "slug": "sea_p95_precip",
        "query": "Show me the 95th percentile daily precipitation map for Southeast Asia "
                 "for 2010-2023.",
        "type": "extreme_percentile",
        "variables": ["tp"],
        "region": "Southeast Asia",
    },
    {
        "id": 20,
        "slug": "scandinavia_blocking_2018",
        "query": "Analyse the blocking event over Scandinavia in July 2018 — show MSLP "
                 "anomalies persisting for 5+ days.",
        "type": "blocking_detection",
        "variables": ["mslp"],
        "region": "Scandinavia",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §5 — Maritime & Shipping
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 21,
        "slug": "rotterdam_shanghai_route",
        "query": "Calculate the maritime route from Rotterdam to Shanghai and analyse "
                 "wind risk along the route for December.",
        "type": "maritime_route_risk",
        "variables": ["u10", "v10"],
        "region": "Europe-Asia",
    },
    {
        "id": 22,
        "slug": "indian_ocean_sst_dipole",
        "query": "Map the SST anomaly across the Indian Ocean (30°S-25°N, 30-120°E) "
                 "for October 2023 relative to the 2019-2022 October mean. "
                 "Show the Indian Ocean Dipole pattern. Save the map as PNG.",
        "type": "anomaly_map",
        "variables": ["sst"],
        "region": "Indian Ocean",
    },
    {
        "id": 23,
        "slug": "japan_typhoon_season_wind",
        "query": "Map the mean and maximum 10m wind speed over the seas around Japan "
                 "(20-45°N, 120-150°E) during typhoon season (August-October) 2023. "
                 "Show two-panel spatial maps highlighting areas where mean wind "
                 "exceeded 8 m/s. Save as PNG.",
        "type": "multi_panel_map",
        "variables": ["u10", "v10"],
        "region": "Japan",
    },
    {
        "id": 24,
        "slug": "south_atlantic_sst_gradient",
        "query": "Map the mean SST field across the South Atlantic (40°S-5°N, 50°W-15°E) "
                 "for March 2024. Overlay SST isotherms and highlight the "
                 "Brazil-Malvinas confluence zone. Save as PNG.",
        "type": "sst_map",
        "variables": ["sst"],
        "region": "South Atlantic",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §6 — Energy Assessment
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 25,
        "slug": "north_sea_wind_power",
        "query": "Map the mean 100m wind power density across the North Sea for "
                 "2020-2024 — where are the best offshore wind sites?",
        "type": "wind_energy",
        "variables": ["u100", "v100"],
        "region": "North Sea",
    },
    {
        "id": 26,
        "slug": "german_bight_weibull",
        "query": "Fit a Weibull distribution to 100m wind speed at 54°N, 7°E "
                 "(German Bight) for 2023 and estimate the capacity factor for a "
                 "3-25 m/s turbine range. Plot the histogram with Weibull fit overlay "
                 "and save as PNG.",
        "type": "weibull_analysis",
        "variables": ["u100", "v100"],
        "region": "German Bight",
    },
    {
        "id": 27,
        "slug": "solar_sahara_vs_germany",
        "query": "Compare incoming solar radiation (SSRD) between the Sahara and "
                 "northern Germany across 2023 — show monthly means.",
        "type": "comparison_timeseries",
        "variables": ["ssrd"],
        "region": "Sahara / Germany",
    },
    {
        "id": 28,
        "slug": "persian_gulf_sst_summer",
        "query": "Map the mean SST across the Persian Gulf and Arabian Sea "
                 "(12-32°N, 44-70°E) for August 2023. Highlight areas where SST "
                 "exceeded 32°C in a spatial map. Save as PNG.",
        "type": "threshold_map",
        "variables": ["sst"],
        "region": "Persian Gulf",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §7 — Diurnal & Sub-Daily Processes
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 29,
        "slug": "sahara_diurnal_t2_blh",
        "query": "Show the diurnal cycle of 2m temperature and boundary layer height "
                 "in the Sahara for July 2024 — dual-axis plot.",
        "type": "diurnal_cycle",
        "variables": ["t2", "blh"],
        "region": "Sahara",
    },
    {
        "id": 30,
        "slug": "amazon_convective_peak",
        "query": "When does convective precipitation peak over the Amazon basin during "
                 "DJF? Hourly climatology please.",
        "type": "diurnal_cycle",
        "variables": ["cp"],
        "region": "Amazon",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §8 — Multi-Variable & Diagnostics
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 31,
        "slug": "europe_rh_august",
        "query": "Compute relative humidity from 2m temperature and dewpoint for "
                 "central Europe, August 2023, and map the spatial mean.",
        "type": "derived_variable",
        "variables": ["t2", "d2"],
        "region": "Central Europe",
    },
    {
        "id": 32,
        "slug": "hovmoller_equator_skt",
        "query": "Create a Hovmöller diagram of 850 hPa equivalent — use skin "
                 "temperature as proxy — along the equator for 2023 to visualise the MJO.",
        "type": "hovmoller",
        "variables": ["skt"],
        "region": "Equatorial",
    },
    {
        "id": 33,
        "slug": "hurricane_otis_dashboard",
        "query": "Plot a summary dashboard for Hurricane Otis (Oct 2023, Acapulco): "
                 "SST map, wind speed time series, and TCWV distribution in one figure.",
        "type": "dashboard",
        "variables": ["sst", "u10", "v10", "tcwv"],
        "region": "East Pacific / Mexico",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §9 — Quick Lookups
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 34,
        "slug": "california_sst_jan",
        "query": "What was the average SST off the coast of California in January 2024? "
                 "Also plot a spatial map of the SST field for that month and save as PNG.",
        "type": "point_retrieval",
        "variables": ["sst"],
        "region": "California",
    },
    {
        "id": 35,
        "slug": "berlin_monthly_temp",
        "query": "Plot the 2023 monthly mean temperature for Berlin as a seasonal curve.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Berlin",
    },
    {
        "id": 36,
        "slug": "biscay_wind_stats",
        "query": "Download 10m wind speed for the Bay of Biscay, last 3 years, and "
                 "give me basic statistics. Also plot a wind speed histogram or time "
                 "series and save as PNG.",
        "type": "stats_retrieval",
        "variables": ["u10", "v10"],
        "region": "Bay of Biscay",
    },
]


# ============================================================================
# AGENT SETUP  (mirrors main.py exactly)
# ============================================================================

def build_agent():
    """Create the agent that every QA query is run through.

    The chat model is configured from ``CONFIG`` (model name and
    temperature). The tool list comes from ``get_all_tools`` with routing
    disabled and the guide enabled, and the agent is created with
    ``AGENT_SYSTEM_PROMPT`` and debug output switched off.

    Returns:
        The object returned by ``create_agent``.
    """
    chat_model = ChatOpenAI(model=CONFIG.model_name, temperature=CONFIG.temperature)

    return create_agent(
        model=chat_model,
        tools=get_all_tools(enable_routing=False, enable_guide=True),
        system_prompt=AGENT_SYSTEM_PROMPT,
        debug=False,
    )


# ============================================================================
# STEP CAPTURE
# ============================================================================

def _content_text(content) -> str:
    """Flatten a message ``content`` payload to plain text.

    Message content may be a plain string or a list of content blocks
    (strings or dicts). Text blocks are concatenated; any other block is
    stringified so nothing is silently dropped.
    """
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
            else:
                parts.append(str(block))
        return "".join(parts)
    return str(content)


def extract_steps(messages) -> list:
    """
    Extract ALL intermediate steps from agent message history.

    Args:
        messages: Iterable of LangChain messages as returned by the agent.

    Returns:
        A list of step dicts numbered from 1 in ``"step"``. ``"type"`` is one of:

        * ``"user_query"``  — a HumanMessage (content capped at 2000 chars)
        * ``"tool_call"``   — one entry per tool call on an AIMessage, with
          ``tool_name``, ``tool_id``, ``arguments`` and the accompanying
          ``reasoning`` text (capped at 1000 chars)
        * ``"ai_response"`` — an AIMessage with text and no tool calls
          (capped at 5000 chars)
        * ``"tool_output"`` — a ToolMessage (capped at 3000 chars)

        Messages of any other type are ignored.
    """
    truncation_marker = "\n... [TRUNCATED]"
    max_args_chars = 5000
    max_tool_output_chars = 3000

    steps = []

    for msg in messages:
        if isinstance(msg, HumanMessage):
            steps.append({
                "step": len(steps) + 1,
                "type": "user_query",
                "content": _content_text(msg.content)[:2000],
            })
        elif isinstance(msg, AIMessage):
            text = _content_text(msg.content)
            if msg.tool_calls:
                for tc in msg.tool_calls:
                    # Serialise once: small payloads are stored as a JSON-safe
                    # structure, oversized ones as a truncated string.
                    args_str = json.dumps(tc.get("args", {}), indent=2, default=str)
                    if len(args_str) > max_args_chars:
                        arguments = args_str[:max_args_chars] + truncation_marker
                    else:
                        arguments = json.loads(args_str)

                    steps.append({
                        "step": len(steps) + 1,
                        "type": "tool_call",
                        "tool_name": tc.get("name", "unknown"),
                        "tool_id": tc.get("id", ""),
                        "arguments": arguments,
                        "reasoning": text[:1000],
                    })
            elif text:
                # Final response or intermediate reasoning
                steps.append({
                    "step": len(steps) + 1,
                    "type": "ai_response",
                    "content": text[:5000],
                })
        elif isinstance(msg, ToolMessage):
            content = _content_text(msg.content)
            if len(content) > max_tool_output_chars:
                content = content[:max_tool_output_chars] + truncation_marker

            steps.append({
                "step": len(steps) + 1,
                "type": "tool_output",
                # The attribute exists but may be None, so fall back on
                # falsiness rather than on hasattr().
                "tool_name": getattr(msg, "name", None) or "unknown",
                "tool_call_id": getattr(msg, "tool_call_id", None) or "",
                "content": content,
            })

    return steps


# ============================================================================
# QA RUNNER
# ============================================================================

def _write_json(path: Path, payload) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII readable."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, default=str, ensure_ascii=False)


def _snapshot_plots(plots_dir: Path) -> dict:
    """Map each PNG name in *plots_dir* to its mtime in ns; empty if the dir is missing."""
    if not plots_dir.exists():
        return {}
    return {p.name: p.stat().st_mtime_ns for p in plots_dir.glob("*.png")}


def _response_text(content) -> str:
    """Return message *content* (a string or a list of content blocks) as plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
            else:
                parts.append(str(part))
        return "".join(parts)
    return str(content)


def run_single_query(agent, query_def: dict, output_dir: Path) -> dict:
    """
    Run a single QA query and capture everything.

    Creates ``output_dir / "q{id:02d}_{slug}"`` and, on success, writes
    ``steps.json``, ``response.md`` and ``metadata.json`` there and copies in
    every PNG from the plots directory that is new or was modified while the
    query ran. On failure only ``metadata.json`` is written, with
    ``status == "error"``.

    Args:
        agent: Object exposing ``invoke(inputs, config=...)``.
        query_def: Query definition; ``"id"``, ``"slug"`` and ``"query"`` are
            required, ``"type"``, ``"variables"`` and ``"region"`` optional.
        output_dir: Parent directory for the per-query result folder.

    Returns: metadata dict (the same content that is saved to metadata.json)
    """
    qid = query_def["id"]
    slug = query_def["slug"]
    query = query_def["query"]

    folder = output_dir / f"q{qid:02d}_{slug}"
    folder.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*70}")
    print(f"  Q{qid:02d}: {query[:70]}...")
    print(f"{'='*70}")

    # Fields shared by the success and the error record.
    base_metadata = {
        "query_id": qid,
        "slug": slug,
        "query": query,
        "type": query_def.get("type", "unknown"),
        "variables": query_def.get("variables", []),
        "region": query_def.get("region", ""),
    }

    # perf_counter is monotonic, so the duration is immune to clock changes.
    start_time = time.perf_counter()

    try:
        # Snapshot existing plots BEFORE running so we only copy NEW ones.
        # mtimes are kept too, so a plot regenerated under the same file name
        # is still attributed to this query.
        plots_dir = get_plots_dir()
        plots_before = _snapshot_plots(plots_dir)

        # Invoke agent
        config = {"recursion_limit": 35}
        result = agent.invoke({"messages": [HumanMessage(content=query)]}, config=config)

        elapsed = time.perf_counter() - start_time
        result_messages = result["messages"]

        # Extract intermediate steps
        steps = extract_steps(result_messages)

        # Final response = last AI message that carries text and no tool calls
        final_response = ""
        for msg in reversed(result_messages):
            if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
                final_response = _response_text(msg.content)
                break

        _write_json(folder / "steps.json", steps)

        with open(folder / "response.md", "w", encoding="utf-8") as f:
            f.write(f"# Q{qid:02d}: {slug}\n\n")
            f.write(f"**Query:** {query}\n\n")
            f.write(f"**Elapsed:** {elapsed:.1f}s\n\n")
            f.write("---\n\n")
            f.write(final_response)

        # Copy plots that appeared or changed during this query
        plot_files = []
        if plots_dir.exists():
            for f_path in sorted(plots_dir.glob("*.png")):
                if plots_before.get(f_path.name) == f_path.stat().st_mtime_ns:
                    continue  # untouched since the snapshot
                shutil.copy2(f_path, folder / f_path.name)
                plot_files.append(f_path.name)
                print(f"   📊 Plot saved: {f_path.name}")

        tool_calls = [s for s in steps if s["type"] == "tool_call"]
        # Sorted so the metadata is reproducible between runs.
        tools_used = sorted({s["tool_name"] for s in tool_calls})

        metadata = {
            **base_metadata,
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 1),
            "status": "success",
            "tools_used": tools_used,
            "num_tool_calls": len(tool_calls),
            "num_steps": len(steps),
            "plot_files": plot_files,
            "notes": "",
        }
        _write_json(folder / "metadata.json", metadata)

        print(f"   ✅ SUCCESS in {elapsed:.1f}s | Tools: {', '.join(tools_used)} | Steps: {len(steps)}")

        return metadata

    except Exception as e:
        # Deliberately broad: one failing query must not abort the whole QA
        # run, so the failure is recorded and returned instead of raised.
        elapsed = time.perf_counter() - start_time
        print(f"   ❌ FAILED in {elapsed:.1f}s: {e}")

        metadata = {
            **base_metadata,
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 1),
            "status": "error",
            "error": str(e),
            "tools_used": [],
            "num_tool_calls": 0,
            "num_steps": 0,
            "plot_files": [],
            "notes": f"Error: {e}",
        }
        _write_json(folder / "metadata.json", metadata)

        return metadata


def main():
    """Command-line entry point: run the selected QA queries and write a summary.

    Options:
        --query N        run only the query whose id is N
        --start / --end  inclusive id range (used when --query is absent)
        --output DIR     result directory (default: <project>/data/qa_results)
        --skip-existing  skip queries whose folder already has metadata.json

    Exits with status 1 when OPENAI_API_KEY is not set, and with an argparse
    usage error when the selection matches no query. Otherwise each query is
    run through ``run_single_query``, a per-query table is printed and
    ``qa_summary.json`` is written to the output directory.
    """
    parser = argparse.ArgumentParser(description="Eurus QA Runner")
    parser.add_argument("--query", type=int, help="Run a single query by ID (1-36)")
    parser.add_argument("--start", type=int, default=1, help="Start from query ID")
    parser.add_argument("--end", type=int, default=36, help="End at query ID (inclusive)")
    parser.add_argument("--output", type=str, default=None, help="Output directory (default: data/qa_results)")
    parser.add_argument("--skip-existing", action="store_true", help="Skip if folder already has metadata.json")
    args = parser.parse_args()

    # Check API key
    if not os.environ.get("OPENAI_API_KEY"):
        print("❌ OPENAI_API_KEY not set!")
        sys.exit(1)

    # Select queries before doing any expensive setup. "is not None" matters:
    # a plain truthiness test would treat "--query 0" as "no --query given"
    # and silently run the whole suite.
    if args.query is not None:
        queries = [q for q in QA_QUERIES if q["id"] == args.query]
    else:
        queries = [q for q in QA_QUERIES if args.start <= q["id"] <= args.end]
    if not queries:
        # Fail loudly instead of building the agent and overwriting the
        # previous summary with an empty one.
        parser.error("no QA query matches the given --query/--start/--end selection")

    if args.output:
        output_dir = Path(args.output)
    else:
        output_dir = PROJECT_ROOT / "data" / "qa_results"
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"""
╔══════════════════════════════════════════════════════╗
║          Eurus QA Runner v1.0                       ║
║          {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}                        ║
╚══════════════════════════════════════════════════════╝
Output: {output_dir}
""")

    # Build agent once
    print("🏗️  Building agent...")
    agent = build_agent()
    print("✅ Agent ready\n")

    results = []
    for q in queries:
        folder = output_dir / f"q{q['id']:02d}_{q['slug']}"
        if args.skip_existing and (folder / "metadata.json").exists():
            print(f"⏭️  Skipping Q{q['id']:02d} (already exists)")
            continue

        results.append(run_single_query(agent, q, output_dir))

    # Print summary
    print(f"\n{'='*70}")
    print("QA SUMMARY")
    print(f"{'='*70}")

    success = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "error")
    total_time = sum(r["elapsed_seconds"] for r in results)

    for r in results:
        status = "✅" if r["status"] == "success" else "❌"
        print(f"  {status} Q{r['query_id']:02d} ({r['slug']:20s}) | "
              f"{r['elapsed_seconds']:5.1f}s | Tools: {', '.join(r['tools_used'])}")

    print(f"\nTotal: {success} passed, {failed} failed, {total_time:.1f}s total")

    # Save summary. Explicit UTF-8 because ensure_ascii=False emits the raw
    # non-ASCII characters found in the query texts.
    summary_path = output_dir / "qa_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "total_queries": len(results),
            "passed": success,
            "failed": failed,
            "total_time_seconds": round(total_time, 1),
            "results": results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\nSummary saved to: {summary_path}")


if __name__ == "__main__":
    main()