File size: 30,177 Bytes
ab07cb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
#!/usr/bin/env python3
"""
QA Runner — Automated End-to-End Agent Testing
===============================================
Runs test queries through the Eurus agent, captures ALL intermediate steps
(tool calls, tool outputs, reasoning, plots) and saves structured results
to data/qa_results/q{NN}_{slug}/.

Usage:
    PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py
    
Or run a single query:
    PYTHONPATH=src OPENAI_API_KEY=... python3 scripts/qa_runner.py --query 2
"""

import os
import sys
import json
import shutil
import base64
import time
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional

# Ensure eurus package is importable
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
sys.path.insert(0, str(PROJECT_ROOT))

# Load .env (API keys)
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

from langchain_openai import ChatOpenAI
from langchain.agents import create_agent
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage

from eurus.config import AGENT_SYSTEM_PROMPT, CONFIG, get_plots_dir
from eurus.tools import get_all_tools

# ============================================================================
# QA TEST QUERIES — 36 research-grade demo queries
#
#  §1  Synoptic Meteorology & Case Studies         (Q01–Q05)
#  §2  Climate Variability & Teleconnections        (Q06–Q10)
#  §3  Trends & Climate Change Signals              (Q11–Q15)
#  §4  Extreme Events & Risk                        (Q16–Q20)
#  §5  Maritime & Shipping                          (Q21–Q24)
#  §6  Energy Assessment                            (Q25–Q28)
#  §7  Diurnal & Sub-Daily Processes                (Q29–Q30)
#  §8  Multi-Variable & Diagnostics                 (Q31–Q33)
#  §9  Quick Lookups                                (Q34–Q36)
# ============================================================================

# Each entry describes one QA query:
#   id        - 1-based query number (used for --query/--start/--end and the
#               "qNN_" output folder prefix)
#   slug      - short identifier used in the output folder name
#   query     - the natural-language prompt sent to the agent
#   type      - free-form category label copied into metadata.json
#   variables - variable short names associated with the query (metadata only)
#   region    - human-readable region label (metadata only)
#
# NOTE: the prompt text is stored as real Unicode (°, —, ñ, ö). Earlier
# revisions carried mis-decoded byte sequences here, which were sent to the
# model verbatim.
QA_QUERIES = [
    # ═══════════════════════════════════════════════════════════════
    #  §1 — Synoptic Meteorology & Case Studies
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 1,
        "slug": "europe_heatwave_anomaly",
        "query": "Show me a spatial map of 2m temperature anomalies across Europe "
                 "during the June 2023 heatwave compared to June 2022.",
        "type": "anomaly_map",
        "variables": ["t2"],
        "region": "Europe",
    },
    {
        "id": 2,
        "slug": "storm_isha_mslp_wind",
        "query": "Plot MSLP isobars and 10m wind vectors over the North Atlantic "
                 "for 2024-01-22 — I want to see Storm Isha's structure.",
        "type": "contour_quiver",
        "variables": ["mslp", "u10", "v10"],
        "region": "North Atlantic",
    },
    {
        "id": 3,
        "slug": "atmospheric_river_jan2023",
        "query": "Download total column water vapour for the US West Coast, Jan 2023, "
                 "and show the atmospheric river event around Jan 9th.",
        "type": "ar_detection",
        "variables": ["tcwv"],
        "region": "US West Coast",
    },
    {
        "id": 4,
        "slug": "sahara_heat_july2024",
        "query": "Plot the daily mean 2m temperature time series averaged over "
                 "the Sahara (20-30°N, 0 to 15°E) for July 2024 and compare "
                 "it to July 2023 on the same chart.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Sahara",
    },
    {
        "id": 5,
        "slug": "great_plains_wind_may2024",
        "query": "Plot a map of mean 10m wind speed over the US Great Plains "
                 "(30-45°N, -105 to -90°W) for May 2024 and highlight areas exceeding 5 m/s.",
        "type": "threshold_map",
        "variables": ["u10", "v10"],
        "region": "US Great Plains",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §2 — Climate Variability & Teleconnections
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 6,
        "slug": "nino34_index",
        "query": "Calculate the Niño 3.4 index from ERA5 SST for 2015-2024 and "
                 "classify El Niño / La Niña episodes.",
        "type": "climate_index",
        "variables": ["sst"],
        "region": "Tropical Pacific",
    },
    {
        "id": 7,
        "slug": "elnino_vs_lanina_tropical_belt",
        "query": "Compare SST anomalies across the entire tropical belt "
                 "(30°S-30°N, global) for December 2023 (peak El Niño) vs December 2022 "
                 "(La Niña). Show the full basin-wide pattern across the Pacific, "
                 "Atlantic, and Indian oceans in a single anomaly difference map.",
        "type": "anomaly_comparison",
        "variables": ["sst"],
        "region": "Tropical Belt (global)",
    },
    {
        "id": 8,
        "slug": "nao_index",
        "query": "Compute the NAO index from MSLP (Azores minus Iceland) for 2000-2024 "
                 "and plot it with a 3-month rolling mean.",
        "type": "climate_index",
        "variables": ["mslp"],
        "region": "North Atlantic",
    },
    {
        "id": 9,
        "slug": "australia_enso_rainfall",
        "query": "Compare precipitation over Eastern Australia (25-45°S, 145-155°E) "
                 "between the La Niña year 2022 and El Niño year 2023. "
                 "Show a two-panel map of annual total precipitation for each year "
                 "and a difference map (2023 minus 2022).",
        "type": "multi_year_anomaly",
        "variables": ["tp"],
        "region": "Australia",
    },
    {
        "id": 10,
        "slug": "med_eof_sst",
        "query": "Perform an EOF analysis on Mediterranean SST anomalies "
                 "(30-46°N, -6 to 36°E) for 2019-2024 and show the first 3 modes "
                 "with variance explained. Interpret the dominant patterns.",
        "type": "eof_analysis",
        "variables": ["sst"],
        "region": "Mediterranean",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §3 — Trends & Climate Change Signals
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 11,
        "slug": "arctic_polar_amplification",
        "query": "Compare January mean 2m temperature across the entire Arctic "
                 "(north of 70°N) for 2024 vs 2000. Show both maps side by side, "
                 "compute the area-weighted temperature difference, and quantify "
                 "polar amplification.",
        "type": "decadal_comparison",
        "variables": ["t2"],
        "region": "Arctic (>70°N)",
    },
    {
        "id": 12,
        "slug": "med_marine_heatwave_2023",
        "query": "Map the summer (JJA) 2023 mean SST anomaly across the entire "
                 "Mediterranean basin (30-46°N, -6 to 36°E) compared to the 2018-2022 "
                 "summer mean. Identify marine heatwave hotspots where SST exceeded "
                 "+2°C above normal.",
        "type": "marine_heatwave",
        "variables": ["sst"],
        "region": "Mediterranean",
    },
    {
        "id": 13,
        "slug": "paris_decadal_comparison",
        "query": "Compare the average summer (JJA) temperature in Paris between the "
                 "decades 2000-2009 and 2014-2023 — show a difference map and time series.",
        "type": "multi_panel_comparison",
        "variables": ["t2"],
        "region": "Paris",
    },
    {
        "id": 14,
        "slug": "alps_snow_trend",
        "query": "Has the snow depth over the Alps decreased over the last 30 years? "
                 "Show me the December-February trend.",
        "type": "trend_analysis",
        "variables": ["sd"],
        "region": "Alps",
    },
    {
        "id": 15,
        "slug": "uk_precip_anomaly_winter2024",
        "query": "Map the total precipitation anomaly over the British Isles "
                 "(49-60°N, 11°W-2°E) for January 2024 compared to the 2019-2023 "
                 "January mean. Highlight regions receiving more than 150% of normal "
                 "rainfall. Save the map as a PNG file.",
        "type": "anomaly_map",
        "variables": ["tp"],
        "region": "British Isles",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §4 — Extreme Events & Risk
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 16,
        "slug": "delhi_heatwave_detection",
        "query": "Detect heatwave events in Delhi from 2010-2024 using the 90th "
                 "percentile threshold with a 3-day duration criterion — how has the "
                 "frequency changed?",
        "type": "heatwave_detection",
        "variables": ["t2"],
        "region": "Delhi",
    },
    {
        "id": 17,
        "slug": "horn_africa_drought",
        "query": "Calculate a 3-month SPI proxy for the Horn of Africa "
                 "(Ethiopia/Somalia) for 2020-2024 — when were the worst drought periods?",
        "type": "drought_analysis",
        "variables": ["tp"],
        "region": "Horn of Africa",
    },
    {
        "id": 18,
        "slug": "baghdad_hot_days",
        "query": "How many days per year exceeded 35°C in Baghdad from 1980 to 2024? "
                 "Plot as a bar chart with a trend line.",
        "type": "exceedance_frequency",
        "variables": ["t2"],
        "region": "Baghdad",
    },
    {
        "id": 19,
        "slug": "sea_p95_precip",
        "query": "Show me the 95th percentile daily precipitation map for Southeast Asia "
                 "for 2010-2023.",
        "type": "extreme_percentile",
        "variables": ["tp"],
        "region": "Southeast Asia",
    },
    {
        "id": 20,
        "slug": "scandinavia_blocking_2018",
        "query": "Analyse the blocking event over Scandinavia in July 2018 — show MSLP "
                 "anomalies persisting for 5+ days.",
        "type": "blocking_detection",
        "variables": ["mslp"],
        "region": "Scandinavia",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §5 — Maritime & Shipping
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 21,
        "slug": "rotterdam_shanghai_route",
        "query": "Calculate the maritime route from Rotterdam to Shanghai and analyse "
                 "wind risk along the route for December.",
        "type": "maritime_route_risk",
        "variables": ["u10", "v10"],
        "region": "Europe-Asia",
    },
    {
        "id": 22,
        "slug": "indian_ocean_sst_dipole",
        "query": "Map the SST anomaly across the Indian Ocean (30°S-25°N, 30-120°E) "
                 "for October 2023 relative to the 2019-2022 October mean. "
                 "Show the Indian Ocean Dipole pattern. Save the map as PNG.",
        "type": "anomaly_map",
        "variables": ["sst"],
        "region": "Indian Ocean",
    },
    {
        "id": 23,
        "slug": "japan_typhoon_season_wind",
        "query": "Map the mean and maximum 10m wind speed over the seas around Japan "
                 "(20-45°N, 120-150°E) during typhoon season (August-October) 2023. "
                 "Show two-panel spatial maps highlighting areas where mean wind "
                 "exceeded 8 m/s. Save as PNG.",
        "type": "multi_panel_map",
        "variables": ["u10", "v10"],
        "region": "Japan",
    },
    {
        "id": 24,
        "slug": "south_atlantic_sst_gradient",
        "query": "Map the mean SST field across the South Atlantic (40°S-5°N, 50°W-15°E) "
                 "for March 2024. Overlay SST isotherms and highlight the "
                 "Brazil-Malvinas confluence zone. Save as PNG.",
        "type": "sst_map",
        "variables": ["sst"],
        "region": "South Atlantic",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §6 — Energy Assessment
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 25,
        "slug": "north_sea_wind_power",
        "query": "Map the mean 100m wind power density across the North Sea for "
                 "2020-2024 — where are the best offshore wind sites?",
        "type": "wind_energy",
        "variables": ["u100", "v100"],
        "region": "North Sea",
    },
    {
        "id": 26,
        "slug": "german_bight_weibull",
        "query": "Fit a Weibull distribution to 100m wind speed at 54°N, 7°E "
                 "(German Bight) for 2023 and estimate the capacity factor for a "
                 "3-25 m/s turbine range. Plot the histogram with Weibull fit overlay "
                 "and save as PNG.",
        "type": "weibull_analysis",
        "variables": ["u100", "v100"],
        "region": "German Bight",
    },
    {
        "id": 27,
        "slug": "solar_sahara_vs_germany",
        "query": "Compare incoming solar radiation (SSRD) between the Sahara and "
                 "northern Germany across 2023 — show monthly means.",
        "type": "comparison_timeseries",
        "variables": ["ssrd"],
        "region": "Sahara / Germany",
    },
    {
        "id": 28,
        "slug": "persian_gulf_sst_summer",
        "query": "Map the mean SST across the Persian Gulf and Arabian Sea "
                 "(12-32°N, 44-70°E) for August 2023. Highlight areas where SST "
                 "exceeded 32°C in a spatial map. Save as PNG.",
        "type": "threshold_map",
        "variables": ["sst"],
        "region": "Persian Gulf",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §7 — Diurnal & Sub-Daily Processes
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 29,
        "slug": "sahara_diurnal_t2_blh",
        "query": "Show the diurnal cycle of 2m temperature and boundary layer height "
                 "in the Sahara for July 2024 — dual-axis plot.",
        "type": "diurnal_cycle",
        "variables": ["t2", "blh"],
        "region": "Sahara",
    },
    {
        "id": 30,
        "slug": "amazon_convective_peak",
        "query": "When does convective precipitation peak over the Amazon basin during "
                 "DJF? Hourly climatology please.",
        "type": "diurnal_cycle",
        "variables": ["cp"],
        "region": "Amazon",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §8 — Multi-Variable & Diagnostics
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 31,
        "slug": "europe_rh_august",
        "query": "Compute relative humidity from 2m temperature and dewpoint for "
                 "central Europe, August 2023, and map the spatial mean.",
        "type": "derived_variable",
        "variables": ["t2", "d2"],
        "region": "Central Europe",
    },
    {
        "id": 32,
        "slug": "hovmoller_equator_skt",
        "query": "Create a Hovmöller diagram of 850 hPa equivalent — use skin "
                 "temperature as proxy — along the equator for 2023 to visualise the MJO.",
        "type": "hovmoller",
        "variables": ["skt"],
        "region": "Equatorial",
    },
    {
        "id": 33,
        "slug": "hurricane_otis_dashboard",
        "query": "Plot a summary dashboard for Hurricane Otis (Oct 2023, Acapulco): "
                 "SST map, wind speed time series, and TCWV distribution in one figure.",
        "type": "dashboard",
        "variables": ["sst", "u10", "v10", "tcwv"],
        "region": "East Pacific / Mexico",
    },

    # ═══════════════════════════════════════════════════════════════
    #  §9 — Quick Lookups
    # ═══════════════════════════════════════════════════════════════
    {
        "id": 34,
        "slug": "california_sst_jan",
        "query": "What was the average SST off the coast of California in January 2024? "
                 "Also plot a spatial map of the SST field for that month and save as PNG.",
        "type": "point_retrieval",
        "variables": ["sst"],
        "region": "California",
    },
    {
        "id": 35,
        "slug": "berlin_monthly_temp",
        "query": "Plot the 2023 monthly mean temperature for Berlin as a seasonal curve.",
        "type": "time_series",
        "variables": ["t2"],
        "region": "Berlin",
    },
    {
        "id": 36,
        "slug": "biscay_wind_stats",
        "query": "Download 10m wind speed for the Bay of Biscay, last 3 years, and "
                 "give me basic statistics. Also plot a wind speed histogram or time "
                 "series and save as PNG.",
        "type": "stats_retrieval",
        "variables": ["u10", "v10"],
        "region": "Bay of Biscay",
    },
]


# ============================================================================
# AGENT SETUP  (mirrors main.py exactly)
# ============================================================================

def build_agent():
    """Assemble the agent that every QA query is run through.

    The chat model takes its model name and temperature from ``CONFIG``; the
    tool list comes from ``get_all_tools`` with routing disabled and the guide
    enabled; the agent is created with ``AGENT_SYSTEM_PROMPT`` and debug off.

    Returns:
        The agent object produced by ``create_agent``.
    """
    chat_model = ChatOpenAI(model=CONFIG.model_name, temperature=CONFIG.temperature)
    toolbox = get_all_tools(enable_routing=False, enable_guide=True)
    return create_agent(
        model=chat_model,
        tools=toolbox,
        system_prompt=AGENT_SYSTEM_PROMPT,
        debug=False,
    )


# ============================================================================
# STEP CAPTURE
# ============================================================================

# Character limits applied to captured text so steps.json stays readable.
_MAX_QUERY_CHARS = 2000
_MAX_ARGS_CHARS = 5000
_MAX_REASONING_CHARS = 1000
_MAX_RESPONSE_CHARS = 5000
_MAX_TOOL_OUTPUT_CHARS = 3000
_TRUNCATION_MARK = "\n... [TRUNCATED]"


def _content_text(content) -> str:
    """Flatten message content to plain text.

    Content is usually a string, but it may also be a list of blocks (plain
    strings or dicts carrying a ``"text"`` entry). Slicing such a list would
    limit the number of blocks rather than the number of characters, so it is
    joined into one string first.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict):
                # Blocks without text (images, tool-use stubs) contribute nothing.
                pieces.append(str(block.get("text", "")))
            else:
                pieces.append(str(block))
        return "".join(pieces)
    return str(content)


def extract_steps(messages) -> list:
    """Extract ALL intermediate steps from an agent message history.

    Args:
        messages: Iterable of message objects as returned by the agent.
            ``HumanMessage``, ``AIMessage`` and ``ToolMessage`` instances are
            recorded; any other message type is ignored.

    Returns:
        A list of dicts, numbered from 1 via the ``"step"`` key, each with a
        ``"type"`` of ``"user_query"``, ``"tool_call"``, ``"ai_response"`` or
        ``"tool_output"`` plus type-specific fields. Long text is truncated to
        the module-level character limits.
    """
    steps = []

    def record(step_type, **fields):
        # Step numbers follow insertion order across all message kinds.
        steps.append({"step": len(steps) + 1, "type": step_type, **fields})

    for msg in messages:
        if isinstance(msg, HumanMessage):
            record("user_query", content=_content_text(msg.content)[:_MAX_QUERY_CHARS])

        elif isinstance(msg, AIMessage):
            text = _content_text(msg.content)
            if msg.tool_calls:
                # One step per requested tool call; any text accompanying the
                # calls is stored as the reasoning for each of them.
                for call in msg.tool_calls:
                    args_json = json.dumps(call.get("args", {}), indent=2, default=str)
                    if len(args_json) > _MAX_ARGS_CHARS:
                        # Too large to keep structured: store a clipped string.
                        arguments = args_json[:_MAX_ARGS_CHARS] + _TRUNCATION_MARK
                    else:
                        # Round-trip so non-JSON values are already stringified.
                        arguments = json.loads(args_json)
                    record(
                        "tool_call",
                        tool_name=call.get("name", "unknown"),
                        tool_id=call.get("id", ""),
                        arguments=arguments,
                        reasoning=text[:_MAX_REASONING_CHARS],
                    )
            elif text:
                # Final answer or intermediate reasoning without tool calls.
                record("ai_response", content=text[:_MAX_RESPONSE_CHARS])

        elif isinstance(msg, ToolMessage):
            raw = msg.content
            output = raw if isinstance(raw, str) else str(raw)
            if len(output) > _MAX_TOOL_OUTPUT_CHARS:
                output = output[:_MAX_TOOL_OUTPUT_CHARS] + _TRUNCATION_MARK
            record(
                "tool_output",
                # The attribute can exist but be None, so fall back on falsy
                # values rather than only when the attribute is missing.
                tool_name=getattr(msg, "name", None) or "unknown",
                tool_call_id=getattr(msg, "tool_call_id", None) or "",
                content=output,
            )

    return steps


# ============================================================================
# QA RUNNER
# ============================================================================

def _query_metadata(query_def: dict, elapsed: float, status: str, **extra) -> dict:
    """Build the metadata record shared by the success and error paths."""
    record = {
        "query_id": query_def["id"],
        "slug": query_def["slug"],
        "query": query_def["query"],
        "type": query_def.get("type", "unknown"),
        "variables": query_def.get("variables", []),
        "region": query_def.get("region", ""),
        "timestamp": datetime.now().isoformat(),
        "elapsed_seconds": round(elapsed, 1),
        "status": status,
    }
    record.update(extra)
    return record


def _write_json_utf8(path: Path, payload, **dump_kwargs) -> None:
    """Write *payload* as indented JSON, always encoded as UTF-8."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must not depend on the platform's locale default.
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False, **dump_kwargs)


def _response_text(content) -> str:
    """Return message content as a string (it may be a list of blocks)."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(
            part if isinstance(part, str)
            else str(part.get("text", "")) if isinstance(part, dict)
            else str(part)
            for part in content
        )
    return str(content)


def run_single_query(agent, query_def: dict, output_dir: Path) -> dict:
    """Run a single QA query and capture everything it produced.

    Creates ``output_dir/q{id:02d}_{slug}/`` and, on success, writes
    ``steps.json``, ``response.md`` and ``metadata.json`` there and copies in
    any PNG files that appeared in the plots directory during the run.

    Args:
        agent: Agent object exposing ``invoke``.
        query_def: One entry of the query list; must contain ``id``, ``slug``
            and ``query``; ``type``, ``variables`` and ``region`` are optional.
        output_dir: Root directory for QA results.

    Returns:
        The metadata dict that was written to ``metadata.json``. Its
        ``status`` is ``"success"`` or ``"error"``; any exception raised while
        running or saving the query is caught and reported through the
        ``error`` / ``notes`` fields instead of propagating.
    """
    qid = query_def["id"]
    slug = query_def["slug"]
    query = query_def["query"]

    folder = output_dir / f"q{qid:02d}_{slug}"
    folder.mkdir(parents=True, exist_ok=True)
    meta_path = folder / "metadata.json"

    print(f"\n{'='*70}")
    print(f"  Q{qid:02d}: {query[:70]}...")
    print(f"{'='*70}")

    start_time = time.time()

    try:
        # Snapshot existing plots BEFORE running so only NEW ones are copied.
        plots_dir = get_plots_dir()
        existing_plots = set()
        if plots_dir.exists():
            existing_plots = {p.name for p in plots_dir.glob("*.png")}

        result = agent.invoke(
            {"messages": [HumanMessage(content=query)]},
            config={"recursion_limit": 35},
        )

        elapsed = time.time() - start_time
        result_messages = result["messages"]
        steps = extract_steps(result_messages)

        # The final answer is the last AI message that has text and no tool calls.
        final_response = ""
        for msg in reversed(result_messages):
            if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
                final_response = _response_text(msg.content)
                break

        _write_json_utf8(folder / "steps.json", steps, default=str)

        with open(folder / "response.md", "w", encoding="utf-8") as f:
            f.write(f"# Q{qid:02d}: {slug}\n\n")
            f.write(f"**Query:** {query}\n\n")
            f.write(f"**Elapsed:** {elapsed:.1f}s\n\n")
            f.write("---\n\n")
            f.write(final_response)

        # Copy only NEW plots (diff against the pre-query snapshot).
        plot_files = []
        if plots_dir.exists():
            for plot_path in sorted(plots_dir.glob("*.png")):
                if plot_path.name in existing_plots:
                    continue
                shutil.copy2(plot_path, folder / plot_path.name)
                plot_files.append(plot_path.name)
                print(f"   📊 Plot saved: {plot_path.name}")

        tool_calls = [s for s in steps if s["type"] == "tool_call"]
        # dict.fromkeys de-duplicates while keeping first-use order, so the
        # saved list is stable between runs (a set would not be).
        tools_used = list(dict.fromkeys(s["tool_name"] for s in tool_calls))

        metadata = _query_metadata(
            query_def,
            elapsed,
            "success",
            tools_used=tools_used,
            num_tool_calls=len(tool_calls),
            num_steps=len(steps),
            plot_files=plot_files,
            notes="",
        )
        _write_json_utf8(meta_path, metadata)

        print(f"   ✅ SUCCESS in {elapsed:.1f}s | Tools: {', '.join(tools_used)} | Steps: {len(steps)}")

        return metadata

    except Exception as e:
        # Top-level boundary for one query: record the failure and let the
        # caller continue with the remaining queries.
        elapsed = time.time() - start_time
        print(f"   ❌ FAILED in {elapsed:.1f}s: {e}")

        metadata = _query_metadata(
            query_def,
            elapsed,
            "error",
            error=str(e),
            tools_used=[],
            num_tool_calls=0,
            num_steps=0,
            plot_files=[],
            notes=f"Error: {e}",
        )
        _write_json_utf8(meta_path, metadata)

        return metadata


def main():
    """Command-line entry point: run the selected QA queries and summarise.

    Parses the CLI options, selects queries either by a single ``--query`` ID
    or by the inclusive ``--start``/``--end`` range, builds the agent once,
    runs each selected query through ``run_single_query`` and finally prints a
    summary and writes it to ``qa_summary.json`` in the output directory.

    Exits with status 1 when ``OPENAI_API_KEY`` is not set, and with an
    argparse usage error when the selection matches no query.
    """
    parser = argparse.ArgumentParser(description="Eurus QA Runner")
    parser.add_argument("--query", type=int, help="Run a single query by ID (1-36)")
    parser.add_argument("--start", type=int, default=1, help="Start from query ID")
    parser.add_argument("--end", type=int, default=36, help="End at query ID (inclusive)")
    parser.add_argument("--output", type=str, default=None, help="Output directory (default: data/qa_results)")
    parser.add_argument("--skip-existing", action="store_true", help="Skip if folder already has metadata.json")
    args = parser.parse_args()

    # Check API key
    if not os.environ.get("OPENAI_API_KEY"):
        print("❌ OPENAI_API_KEY not set!")
        sys.exit(1)

    # Select queries before doing any expensive work. Compare against None:
    # a plain truthiness test would treat "--query 0" as "not given" and run
    # the whole suite.
    if args.query is not None:
        queries = [q for q in QA_QUERIES if q["id"] == args.query]
        selection = f"--query {args.query}"
    else:
        queries = [q for q in QA_QUERIES if args.start <= q["id"] <= args.end]
        selection = f"--start {args.start} --end {args.end}"
    if not queries:
        # Fail loudly instead of building the agent and reporting "0 passed".
        parser.error(f"no QA queries match {selection}")

    if args.output:
        output_dir = Path(args.output)
    else:
        output_dir = PROJECT_ROOT / "data" / "qa_results"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Banner; built from parts so the box edges always line up.
    inner_width = 54
    started = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    banner = [
        "╔" + "═" * inner_width + "╗",
        "║" + (" " * 10 + "Eurus QA Runner v1.0").ljust(inner_width) + "║",
        "║" + (" " * 10 + started).ljust(inner_width) + "║",
        "╚" + "═" * inner_width + "╝",
        f"Output: {output_dir}",
    ]
    print("\n" + "\n".join(banner) + "\n")

    # Build agent once and reuse it for every query.
    print("🏗️  Building agent...")
    agent = build_agent()
    print("✅ Agent ready\n")

    results = []
    for q in queries:
        folder = output_dir / f"q{q['id']:02d}_{q['slug']}"
        if args.skip_existing and (folder / "metadata.json").exists():
            print(f"⏭️  Skipping Q{q['id']:02d} (already exists)")
            continue

        results.append(run_single_query(agent, q, output_dir))

    # Print summary
    print(f"\n{'='*70}")
    print("QA SUMMARY")
    print(f"{'='*70}")

    success = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "error")
    total_time = sum(r["elapsed_seconds"] for r in results)

    for r in results:
        status = "✅" if r["status"] == "success" else "❌"
        print(f"  {status} Q{r['query_id']:02d} ({r['slug']:20s}) | "
              f"{r['elapsed_seconds']:5.1f}s | Tools: {', '.join(r['tools_used'])}")

    print(f"\nTotal: {success} passed, {failed} failed, {total_time:.1f}s total")

    # Save summary. UTF-8 is explicit because ensure_ascii=False writes raw
    # non-ASCII characters from the query text.
    summary_path = output_dir / "qa_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "total_queries": len(results),
            "passed": success,
            "failed": failed,
            "total_time_seconds": round(total_time, 1),
            "results": results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\nSummary saved to: {summary_path}")

if __name__ == "__main__":
    main()