Eurus / scripts /qa_runner.py
dmpantiu's picture
Upload folder using huggingface_hub
ab07cb1 verified
#!/usr/bin/env python3
"""
QA Runner — Automated End-to-End Agent Testing
===============================================
Runs test queries through the Eurus agent, captures ALL intermediate steps
(tool calls, tool outputs, reasoning, plots) and saves structured results
to data/qa_results/q{NN}_{slug}/.
Usage:
PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py
Or run a single query:
PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py --query 2
"""
import os
import sys
import json
import shutil
import base64
import time
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional
# Ensure eurus package is importable
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
sys.path.insert(0, str(PROJECT_ROOT))
# Load .env (API keys)
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")
from langchain_openai import ChatOpenAI
from langchain.agents import create_agent
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from eurus.config import AGENT_SYSTEM_PROMPT, CONFIG, get_plots_dir
from eurus.tools import get_all_tools
# ============================================================================
# QA TEST QUERIES — 36 research-grade demo queries
#
# §1 Synoptic Meteorology & Case Studies (Q01–Q05)
# §2 Climate Variability & Teleconnections (Q06–Q10)
# §3 Trends & Climate Change Signals (Q11–Q15)
# §4 Extreme Events & Risk (Q16–Q20)
# §5 Maritime & Shipping (Q21–Q24)
# §6 Energy Assessment (Q25–Q28)
# §7 Diurnal & Sub-Daily Processes (Q29–Q30)
# §8 Multi-Variable & Diagnostics (Q31–Q33)
# §9 Quick Lookups (Q34–Q36)
# ============================================================================
# Each entry describes one end-to-end test case:
#   id        - 1-based query number (used for --query/--start/--end and folder names)
#   slug      - short identifier appended to the result folder name
#   query     - the natural-language prompt sent to the agent
#   type      - free-form category label copied into metadata.json
#   variables - variable short names the query is about (copied into metadata.json)
#   region    - human-readable region label (copied into metadata.json)
# NOTE: the prompt text must stay valid UTF-8 (°, ñ, ö, —); mis-decoded
# characters would be sent to the model verbatim.
QA_QUERIES = [
    # ═══════════════════════════════════════════════════════════════
    # §1 — Synoptic Meteorology & Case Studies
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 1,
        "slug": "europe_heatwave_anomaly",
        "query": "Show me a spatial map of 2m temperature anomalies across Europe "
                 "during the June 2023 heatwave compared to June 2022.",
        "type": "anomaly_map",
        "variables": ["t2"],
        "region": "Europe",
    },
    {
        "id": 2,
        "slug": "storm_isha_mslp_wind",
        "query": "Plot MSLP isobars and 10m wind vectors over the North Atlantic "
                 "for 2024-01-22 — I want to see Storm Isha's structure.",
        "type": "contour_quiver",
        "variables": ["mslp", "u10", "v10"],
        "region": "North Atlantic",
    },
    {
        "id": 3,
        "slug": "atmospheric_river_jan2023",
        "query": "Download total column water vapour for the US West Coast, Jan 2023, "
                 "and show the atmospheric river event around Jan 9th.",
        "type": "ar_detection",
        "variables": ["tcwv"],
        "region": "US West Coast",
    },
    {
        "id": 4,
        "slug": "sahara_heat_july2024",
        "query": "Plot the daily mean 2m temperature time series averaged over "
                 "the Sahara (20-30°N, 0 to 15°E) for July 2024 and compare "
                 "it to July 2023 on the same chart.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Sahara",
    },
    {
        "id": 5,
        "slug": "great_plains_wind_may2024",
        "query": "Plot a map of mean 10m wind speed over the US Great Plains "
                 "(30-45°N, -105 to -90°W) for May 2024 and highlight areas exceeding 5 m/s.",
        "type": "threshold_map",
        "variables": ["u10", "v10"],
        "region": "US Great Plains",
    },
    # ═══════════════════════════════════════════════════════════════
    # §2 — Climate Variability & Teleconnections
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 6,
        "slug": "nino34_index",
        "query": "Calculate the Niño 3.4 index from ERA5 SST for 2015-2024 and "
                 "classify El Niño / La Niña episodes.",
        "type": "climate_index",
        "variables": ["sst"],
        "region": "Tropical Pacific",
    },
    {
        "id": 7,
        "slug": "elnino_vs_lanina_tropical_belt",
        "query": "Compare SST anomalies across the entire tropical belt "
                 "(30°S-30°N, global) for December 2023 (peak El Niño) vs December 2022 "
                 "(La Niña). Show the full basin-wide pattern across the Pacific, "
                 "Atlantic, and Indian oceans in a single anomaly difference map.",
        "type": "anomaly_comparison",
        "variables": ["sst"],
        "region": "Tropical Belt (global)",
    },
    {
        "id": 8,
        "slug": "nao_index",
        "query": "Compute the NAO index from MSLP (Azores minus Iceland) for 2000-2024 "
                 "and plot it with a 3-month rolling mean.",
        "type": "climate_index",
        "variables": ["mslp"],
        "region": "North Atlantic",
    },
    {
        "id": 9,
        "slug": "australia_enso_rainfall",
        "query": "Compare precipitation over Eastern Australia (25-45°S, 145-155°E) "
                 "between the La Niña year 2022 and El Niño year 2023. "
                 "Show a two-panel map of annual total precipitation for each year "
                 "and a difference map (2023 minus 2022).",
        "type": "multi_year_anomaly",
        "variables": ["tp"],
        "region": "Australia",
    },
    {
        "id": 10,
        "slug": "med_eof_sst",
        "query": "Perform an EOF analysis on Mediterranean SST anomalies "
                 "(30-46°N, -6 to 36°E) for 2019-2024 and show the first 3 modes "
                 "with variance explained. Interpret the dominant patterns.",
        "type": "eof_analysis",
        "variables": ["sst"],
        "region": "Mediterranean",
    },
    # ═══════════════════════════════════════════════════════════════
    # §3 — Trends & Climate Change Signals
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 11,
        "slug": "arctic_polar_amplification",
        "query": "Compare January mean 2m temperature across the entire Arctic "
                 "(north of 70°N) for 2024 vs 2000. Show both maps side by side, "
                 "compute the area-weighted temperature difference, and quantify "
                 "polar amplification.",
        "type": "decadal_comparison",
        "variables": ["t2"],
        "region": "Arctic (>70°N)",
    },
    {
        "id": 12,
        "slug": "med_marine_heatwave_2023",
        "query": "Map the summer (JJA) 2023 mean SST anomaly across the entire "
                 "Mediterranean basin (30-46°N, -6 to 36°E) compared to the 2018-2022 "
                 "summer mean. Identify marine heatwave hotspots where SST exceeded "
                 "+2°C above normal.",
        "type": "marine_heatwave",
        "variables": ["sst"],
        "region": "Mediterranean",
    },
    {
        "id": 13,
        "slug": "paris_decadal_comparison",
        "query": "Compare the average summer (JJA) temperature in Paris between the "
                 "decades 2000-2009 and 2014-2023 — show a difference map and time series.",
        "type": "multi_panel_comparison",
        "variables": ["t2"],
        "region": "Paris",
    },
    {
        "id": 14,
        "slug": "alps_snow_trend",
        "query": "Has the snow depth over the Alps decreased over the last 30 years? "
                 "Show me the December-February trend.",
        "type": "trend_analysis",
        "variables": ["sd"],
        "region": "Alps",
    },
    {
        "id": 15,
        "slug": "uk_precip_anomaly_winter2024",
        "query": "Map the total precipitation anomaly over the British Isles "
                 "(49-60°N, 11°W-2°E) for January 2024 compared to the 2019-2023 "
                 "January mean. Highlight regions receiving more than 150% of normal "
                 "rainfall. Save the map as a PNG file.",
        "type": "anomaly_map",
        "variables": ["tp"],
        "region": "British Isles",
    },
    # ═══════════════════════════════════════════════════════════════
    # §4 — Extreme Events & Risk
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 16,
        "slug": "delhi_heatwave_detection",
        "query": "Detect heatwave events in Delhi from 2010-2024 using the 90th "
                 "percentile threshold with a 3-day duration criterion — how has the "
                 "frequency changed?",
        "type": "heatwave_detection",
        "variables": ["t2"],
        "region": "Delhi",
    },
    {
        "id": 17,
        "slug": "horn_africa_drought",
        "query": "Calculate a 3-month SPI proxy for the Horn of Africa "
                 "(Ethiopia/Somalia) for 2020-2024 — when were the worst drought periods?",
        "type": "drought_analysis",
        "variables": ["tp"],
        "region": "Horn of Africa",
    },
    {
        "id": 18,
        "slug": "baghdad_hot_days",
        "query": "How many days per year exceeded 35°C in Baghdad from 1980 to 2024? "
                 "Plot as a bar chart with a trend line.",
        "type": "exceedance_frequency",
        "variables": ["t2"],
        "region": "Baghdad",
    },
    {
        "id": 19,
        "slug": "sea_p95_precip",
        "query": "Show me the 95th percentile daily precipitation map for Southeast Asia "
                 "for 2010-2023.",
        "type": "extreme_percentile",
        "variables": ["tp"],
        "region": "Southeast Asia",
    },
    {
        "id": 20,
        "slug": "scandinavia_blocking_2018",
        "query": "Analyse the blocking event over Scandinavia in July 2018 — show MSLP "
                 "anomalies persisting for 5+ days.",
        "type": "blocking_detection",
        "variables": ["mslp"],
        "region": "Scandinavia",
    },
    # ═══════════════════════════════════════════════════════════════
    # §5 — Maritime & Shipping
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 21,
        "slug": "rotterdam_shanghai_route",
        "query": "Calculate the maritime route from Rotterdam to Shanghai and analyse "
                 "wind risk along the route for December.",
        "type": "maritime_route_risk",
        "variables": ["u10", "v10"],
        "region": "Europe-Asia",
    },
    {
        "id": 22,
        "slug": "indian_ocean_sst_dipole",
        "query": "Map the SST anomaly across the Indian Ocean (30°S-25°N, 30-120°E) "
                 "for October 2023 relative to the 2019-2022 October mean. "
                 "Show the Indian Ocean Dipole pattern. Save the map as PNG.",
        "type": "anomaly_map",
        "variables": ["sst"],
        "region": "Indian Ocean",
    },
    {
        "id": 23,
        "slug": "japan_typhoon_season_wind",
        "query": "Map the mean and maximum 10m wind speed over the seas around Japan "
                 "(20-45°N, 120-150°E) during typhoon season (August-October) 2023. "
                 "Show two-panel spatial maps highlighting areas where mean wind "
                 "exceeded 8 m/s. Save as PNG.",
        "type": "multi_panel_map",
        "variables": ["u10", "v10"],
        "region": "Japan",
    },
    {
        "id": 24,
        "slug": "south_atlantic_sst_gradient",
        "query": "Map the mean SST field across the South Atlantic (40°S-5°N, 50°W-15°E) "
                 "for March 2024. Overlay SST isotherms and highlight the "
                 "Brazil-Malvinas confluence zone. Save as PNG.",
        "type": "sst_map",
        "variables": ["sst"],
        "region": "South Atlantic",
    },
    # ═══════════════════════════════════════════════════════════════
    # §6 — Energy Assessment
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 25,
        "slug": "north_sea_wind_power",
        "query": "Map the mean 100m wind power density across the North Sea for "
                 "2020-2024 — where are the best offshore wind sites?",
        "type": "wind_energy",
        "variables": ["u100", "v100"],
        "region": "North Sea",
    },
    {
        "id": 26,
        "slug": "german_bight_weibull",
        "query": "Fit a Weibull distribution to 100m wind speed at 54°N, 7°E "
                 "(German Bight) for 2023 and estimate the capacity factor for a "
                 "3-25 m/s turbine range. Plot the histogram with Weibull fit overlay "
                 "and save as PNG.",
        "type": "weibull_analysis",
        "variables": ["u100", "v100"],
        "region": "German Bight",
    },
    {
        "id": 27,
        "slug": "solar_sahara_vs_germany",
        "query": "Compare incoming solar radiation (SSRD) between the Sahara and "
                 "northern Germany across 2023 — show monthly means.",
        "type": "comparison_timeseries",
        "variables": ["ssrd"],
        "region": "Sahara / Germany",
    },
    {
        "id": 28,
        "slug": "persian_gulf_sst_summer",
        "query": "Map the mean SST across the Persian Gulf and Arabian Sea "
                 "(12-32°N, 44-70°E) for August 2023. Highlight areas where SST "
                 "exceeded 32°C in a spatial map. Save as PNG.",
        "type": "threshold_map",
        "variables": ["sst"],
        "region": "Persian Gulf",
    },
    # ═══════════════════════════════════════════════════════════════
    # §7 — Diurnal & Sub-Daily Processes
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 29,
        "slug": "sahara_diurnal_t2_blh",
        "query": "Show the diurnal cycle of 2m temperature and boundary layer height "
                 "in the Sahara for July 2024 — dual-axis plot.",
        "type": "diurnal_cycle",
        "variables": ["t2", "blh"],
        "region": "Sahara",
    },
    {
        "id": 30,
        "slug": "amazon_convective_peak",
        "query": "When does convective precipitation peak over the Amazon basin during "
                 "DJF? Hourly climatology please.",
        "type": "diurnal_cycle",
        "variables": ["cp"],
        "region": "Amazon",
    },
    # ═══════════════════════════════════════════════════════════════
    # §8 — Multi-Variable & Diagnostics
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 31,
        "slug": "europe_rh_august",
        "query": "Compute relative humidity from 2m temperature and dewpoint for "
                 "central Europe, August 2023, and map the spatial mean.",
        "type": "derived_variable",
        "variables": ["t2", "d2"],
        "region": "Central Europe",
    },
    {
        "id": 32,
        "slug": "hovmoller_equator_skt",
        "query": "Create a Hovmöller diagram of 850 hPa equivalent — use skin "
                 "temperature as proxy — along the equator for 2023 to visualise the MJO.",
        "type": "hovmoller",
        "variables": ["skt"],
        "region": "Equatorial",
    },
    {
        "id": 33,
        "slug": "hurricane_otis_dashboard",
        "query": "Plot a summary dashboard for Hurricane Otis (Oct 2023, Acapulco): "
                 "SST map, wind speed time series, and TCWV distribution in one figure.",
        "type": "dashboard",
        "variables": ["sst", "u10", "v10", "tcwv"],
        "region": "East Pacific / Mexico",
    },
    # ═══════════════════════════════════════════════════════════════
    # §9 — Quick Lookups
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 34,
        "slug": "california_sst_jan",
        "query": "What was the average SST off the coast of California in January 2024? "
                 "Also plot a spatial map of the SST field for that month and save as PNG.",
        "type": "point_retrieval",
        "variables": ["sst"],
        "region": "California",
    },
    {
        "id": 35,
        "slug": "berlin_monthly_temp",
        "query": "Plot the 2023 monthly mean temperature for Berlin as a seasonal curve.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Berlin",
    },
    {
        "id": 36,
        "slug": "biscay_wind_stats",
        "query": "Download 10m wind speed for the Bay of Biscay, last 3 years, and "
                 "give me basic statistics. Also plot a wind speed histogram or time "
                 "series and save as PNG.",
        "type": "stats_retrieval",
        "variables": ["u10", "v10"],
        "region": "Bay of Biscay",
    },
]
# ============================================================================
# AGENT SETUP (mirrors main.py exactly)
# ============================================================================
def build_agent():
    """Construct the agent used for every QA query.

    The chat model is configured from ``CONFIG`` (model name and
    temperature), the tool list comes from ``get_all_tools`` with routing
    disabled and the guide enabled, and the agent is created with
    ``AGENT_SYSTEM_PROMPT`` and debug output off.

    Returns:
        The object returned by ``create_agent``.
    """
    chat_model = ChatOpenAI(
        model=CONFIG.model_name,
        temperature=CONFIG.temperature,
    )
    toolset = get_all_tools(enable_routing=False, enable_guide=True)
    return create_agent(
        model=chat_model,
        tools=toolset,
        system_prompt=AGENT_SYSTEM_PROMPT,
        debug=False,
    )
# ============================================================================
# STEP CAPTURE
# ============================================================================
def extract_steps(messages) -> list:
    """
    Extract ALL intermediate steps from agent message history.

    Messages are visited in order and flattened into step dicts. Every dict
    carries ``"step"`` (its 1-based position in the returned list) and
    ``"type"``, plus type-specific fields:

    * ``HumanMessage`` -> ``user_query`` with ``content`` cut to 2000.
    * ``AIMessage`` with tool calls -> one ``tool_call`` per call with
      ``tool_name``, ``tool_id``, ``arguments`` and ``reasoning`` (the
      message content cut to 1000, or ``""``).
    * ``AIMessage`` without tool calls -> ``ai_response`` with ``content``
      cut to 5000; skipped when the content is empty.
    * ``ToolMessage`` -> ``tool_output`` with ``tool_name``,
      ``tool_call_id`` and ``content`` (stringified, cut to 3000 chars).

    Messages of any other type are ignored.

    Args:
        messages: iterable of message objects from the agent result.

    Returns:
        List of step dicts in message order.
    """
    max_args_chars = 5000
    max_tool_output_chars = 3000
    truncation_marker = "\n... [TRUNCATED]"

    steps = []

    def add_step(**fields):
        # Number steps by their position in the output list, not by message
        # index: one AI message can expand into several tool_call steps.
        steps.append({"step": len(steps) + 1, **fields})

    for msg in messages:
        if isinstance(msg, HumanMessage):
            add_step(type="user_query", content=msg.content[:2000])

        elif isinstance(msg, AIMessage):
            if msg.tool_calls:
                reasoning = msg.content[:1000] if msg.content else ""
                for tc in msg.tool_calls:
                    # default=str keeps non-JSON-native argument values
                    # (paths, datetimes, ...) from aborting the capture.
                    args_str = json.dumps(tc.get("args", {}), indent=2, default=str)
                    if len(args_str) > max_args_chars:
                        # Too large to keep structured: store a truncated
                        # JSON text preview instead.
                        arguments = args_str[:max_args_chars] + truncation_marker
                    else:
                        # Round-trip so the stored value is plain JSON data.
                        arguments = json.loads(args_str)
                    add_step(
                        type="tool_call",
                        tool_name=tc.get("name", "unknown"),
                        tool_id=tc.get("id", ""),
                        arguments=arguments,
                        reasoning=reasoning,
                    )
            elif msg.content:
                # Final response or intermediate reasoning
                add_step(type="ai_response", content=msg.content[:5000])

        elif isinstance(msg, ToolMessage):
            content = msg.content if isinstance(msg.content, str) else str(msg.content)
            if len(content) > max_tool_output_chars:
                content = content[:max_tool_output_chars] + truncation_marker
            add_step(
                type="tool_output",
                # The attribute can be present but None/empty, so an
                # existence check alone never reaches the fallback.
                tool_name=getattr(msg, "name", None) or "unknown",
                tool_call_id=getattr(msg, "tool_call_id", None) or "",
                content=content,
            )

    return steps
# ============================================================================
# QA RUNNER
# ============================================================================
def run_single_query(agent, query_def: dict, output_dir: Path) -> dict:
    """
    Run a single QA query and capture everything.

    Creates ``output_dir / "q{id:02d}_{slug}"`` and, on success, writes:

    * ``steps.json``    - intermediate steps from ``extract_steps``
    * ``response.md``   - the query, elapsed time and final AI response
    * ``metadata.json`` - summary of the run
    * copies of every ``*.png`` in the plots directory whose file name was
      not present before the agent was invoked

    Any exception raised while running or saving is caught; in that case
    only ``metadata.json`` is written, with ``status`` set to ``"error"``.

    Args:
        agent: object exposing ``invoke(inputs, config=...)``.
        query_def: dict with ``id``, ``slug`` and ``query``; ``type``,
            ``variables`` and ``region`` are optional.
        output_dir: parent directory for the per-query result folder.

    Returns: metadata dict (the same content written to ``metadata.json``).
    """
    qid = query_def["id"]
    slug = query_def["slug"]
    query = query_def["query"]
    folder = output_dir / f"q{qid:02d}_{slug}"
    folder.mkdir(parents=True, exist_ok=True)

    def write_json(path, payload):
        # Always UTF-8: payloads are dumped with ensure_ascii=False, so the
        # platform default encoding (e.g. cp1252) could fail on "°" or "ñ".
        with open(path, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2, default=str, ensure_ascii=False)

    def build_metadata(elapsed, status, *, error=None, tools_used=(),
                       num_tool_calls=0, num_steps=0, plot_files=()):
        # Single place that defines the metadata schema for both outcomes.
        metadata = {
            "query_id": qid,
            "slug": slug,
            "query": query,
            "type": query_def.get("type", "unknown"),
            "variables": query_def.get("variables", []),
            "region": query_def.get("region", ""),
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 1),
            "status": status,
        }
        if error is not None:
            metadata["error"] = error
        metadata["tools_used"] = list(tools_used)
        metadata["num_tool_calls"] = num_tool_calls
        metadata["num_steps"] = num_steps
        metadata["plot_files"] = list(plot_files)
        metadata["notes"] = "" if error is None else f"Error: {error}"
        return metadata

    print(f"\n{'='*70}")
    print(f" Q{qid:02d}: {query[:70]}...")
    print(f"{'='*70}")

    start_time = time.time()
    try:
        # Snapshot existing plots BEFORE running so we only copy NEW ones
        plots_dir = get_plots_dir()
        existing_plots = set()
        if plots_dir.exists():
            existing_plots = {p.name for p in plots_dir.glob("*.png")}

        # Invoke agent
        config = {"recursion_limit": 35}
        result = agent.invoke({"messages": [HumanMessage(content=query)]}, config=config)
        elapsed = time.time() - start_time
        result_messages = result["messages"]

        # Extract intermediate steps
        steps = extract_steps(result_messages)

        # Final response = last AI message that has content and no tool calls
        final_response = ""
        for msg in reversed(result_messages):
            if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
                content = msg.content
                # Content is not guaranteed to be a plain string; coerce so
                # writing response.md cannot fail on it.
                final_response = content if isinstance(content, str) else str(content)
                break

        write_json(folder / "steps.json", steps)

        with open(folder / "response.md", "w", encoding="utf-8") as fh:
            fh.write(f"# Q{qid:02d}: {slug}\n\n")
            fh.write(f"**Query:** {query}\n\n")
            fh.write(f"**Elapsed:** {elapsed:.1f}s\n\n")
            fh.write("---\n\n")
            fh.write(final_response)

        # Copy only NEW plots (diff against pre-query snapshot)
        plot_files = []
        if plots_dir.exists():
            for plot_path in sorted(plots_dir.glob("*.png")):
                if plot_path.name in existing_plots:
                    continue
                shutil.copy2(plot_path, folder / plot_path.name)
                plot_files.append(plot_path.name)
                print(f" 📊 Plot saved: {plot_path.name}")

        # Count tool calls; sort the distinct names so output is reproducible
        tool_calls = [s for s in steps if s["type"] == "tool_call"]
        tools_used = sorted({s["tool_name"] for s in tool_calls})

        metadata = build_metadata(
            elapsed,
            "success",
            tools_used=tools_used,
            num_tool_calls=len(tool_calls),
            num_steps=len(steps),
            plot_files=plot_files,
        )
        write_json(folder / "metadata.json", metadata)
        print(f" ✅ SUCCESS in {elapsed:.1f}s | Tools: {', '.join(tools_used)} | Steps: {len(steps)}")
        return metadata
    except Exception as e:
        # Per-query boundary: one failing query must not abort the batch,
        # so record the failure and hand it back to the caller.
        elapsed = time.time() - start_time
        print(f" ❌ FAILED in {elapsed:.1f}s: {e}")
        metadata = build_metadata(elapsed, "error", error=str(e))
        write_json(folder / "metadata.json", metadata)
        return metadata
def main():
    """Command-line entry point: run the selected QA queries and summarise.

    Options:
        --query N        run only the query with ID N
        --start/--end    inclusive ID range (used when --query is absent)
        --output DIR     result directory (default: <project>/data/qa_results)
        --skip-existing  skip queries whose folder already has metadata.json

    Exits with status 1 when OPENAI_API_KEY is not set, and with an argparse
    usage error when the selection matches no query. Otherwise runs each
    selected query through ``run_single_query``, prints a summary table and
    writes ``qa_summary.json`` into the output directory.
    """
    parser = argparse.ArgumentParser(description="Eurus QA Runner")
    parser.add_argument("--query", type=int, help="Run a single query by ID (1-36)")
    parser.add_argument("--start", type=int, default=1, help="Start from query ID")
    parser.add_argument("--end", type=int, default=36, help="End at query ID (inclusive)")
    parser.add_argument("--output", type=str, default=None, help="Output directory (default: data/qa_results)")
    parser.add_argument("--skip-existing", action="store_true", help="Skip if folder already has metadata.json")
    args = parser.parse_args()

    # Check API key
    if not os.environ.get("OPENAI_API_KEY"):
        print("❌ OPENAI_API_KEY not set!")
        sys.exit(1)

    # Select queries up front so a bad selection fails before the agent is
    # built and before an existing summary file could be overwritten.
    # "is not None" (not truthiness) so an explicit --query 0 is not ignored.
    if args.query is not None:
        queries = [q for q in QA_QUERIES if q["id"] == args.query]
    else:
        queries = [q for q in QA_QUERIES if args.start <= q["id"] <= args.end]
    if not queries:
        parser.error("no QA query matches the requested --query/--start/--end selection")

    if args.output:
        output_dir = Path(args.output)
    else:
        output_dir = PROJECT_ROOT / "data" / "qa_results"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Banner is generated so the box always lines up with its contents.
    banner_width = 54
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print()
    print("╔" + "═" * banner_width + "╗")
    print("║" + "Eurus QA Runner v1.0".center(banner_width) + "║")
    print("║" + stamp.center(banner_width) + "║")
    print("╚" + "═" * banner_width + "╝")
    print(f" Output: {output_dir}")
    print()

    # Build agent once
    print("🏗️ Building agent...")
    agent = build_agent()
    print("✅ Agent ready\n")

    results = []
    for q in queries:
        folder = output_dir / f"q{q['id']:02d}_{q['slug']}"
        if args.skip_existing and (folder / "metadata.json").exists():
            print(f"⏭️ Skipping Q{q['id']:02d} (already exists)")
            continue
        results.append(run_single_query(agent, q, output_dir))

    # Print summary
    print(f"\n{'='*70}")
    print("QA SUMMARY")
    print(f"{'='*70}")
    success = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "error")
    total_time = sum(r["elapsed_seconds"] for r in results)
    for r in results:
        status = "✅" if r["status"] == "success" else "❌"
        print(f" {status} Q{r['query_id']:02d} ({r['slug']:20s}) | "
              f"{r['elapsed_seconds']:5.1f}s | Tools: {', '.join(r['tools_used'])}")
    print(f"\nTotal: {success} passed, {failed} failed, {total_time:.1f}s total")

    # Save summary (explicit UTF-8 because ensure_ascii=False emits raw
    # non-ASCII characters from the query text)
    summary_path = output_dir / "qa_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "total_queries": len(results),
            "passed": success,
            "failed": failed,
            "total_time_seconds": round(total_time, 1),
            "results": results,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nSummary saved to: {summary_path}")


if __name__ == "__main__":
    main()