| |
| from __future__ import annotations |
|
|
import argparse
import csv
import json
import subprocess
import sys
from pathlib import Path
from typing import Any
|
|
# Repository root, assuming this file lives one directory below it — TODO confirm layout.
ROOT = Path(__file__).resolve().parents[1]
# Single-model scorer script that this batch runner shells out to, once per model.
SCRIPT = ROOT / 'scripts' / 'score_tool_routing_confusion.py'
# Default output directory for the batch summary artifacts (JSON / CSV / Markdown).
OUT_DIR = ROOT / 'docs' / 'tool_routing_eval'
|
|
|
|
def model_stem(model: str) -> str:
    """Return a filesystem-safe stem for *model*: every '/' becomes '_'."""
    return '_'.join(model.split('/'))
|
|
|
|
def run_one(
    model: str,
    agent: str,
    agent_cards: Path,
    prompts: Path,
    expected: Path,
    start: int,
    end: int,
    timeout: int,
    out_dir: Path,
    raw_results_dir: Path | None,
) -> None:
    """Score one model by shelling out to the single-model scorer script.

    Builds the argv for ``SCRIPT``, echoes it, and runs it to completion.

    Raises:
        subprocess.CalledProcessError: if the scorer exits with a non-zero status
            (``check=True`` deliberately propagates failures instead of
            silently continuing the batch).
    """
    # Use the interpreter running this script, not whatever 'python' happens to
    # resolve to on PATH — keeps the child in the same environment/venv.
    cmd = [
        sys.executable, str(SCRIPT),
        '--model', model,
        '--agent', agent,
        '--agent-cards', str(agent_cards),
        '--prompts', str(prompts),
        '--expected', str(expected),
        '--start', str(start),
        '--end', str(end),
        '--timeout', str(timeout),
        '--out-dir', str(out_dir),
    ]
    if raw_results_dir is not None:
        cmd.extend(['--raw-results-dir', str(raw_results_dir)])
    print('\n[run]', ' '.join(cmd))
    subprocess.run(cmd, check=True)
|
|
|
|
def load_model_summary(model: str, out_dir: Path) -> dict[str, Any]:
    """Load one model's per-run summary JSON and tag it with the model ID.

    Reads ``tool_routing_<stem>.json`` from *out_dir*, extracts its
    ``'summary'`` object, and stores *model* under the ``'model'`` key so
    downstream tables know which model the row belongs to.
    """
    summary_path = out_dir / f"tool_routing_{model_stem(model)}.json"
    payload = json.loads(summary_path.read_text(encoding='utf-8'))
    summary = payload['summary']
    summary['model'] = model
    return summary
|
|
|
|
def aggregate(summaries: list[dict[str, Any]]) -> dict[str, Any]:
    """Compute mean metrics across per-model summaries.

    Returns ``{'n_models': 0}`` for an empty input; otherwise a dict with
    the model count plus each metric averaged over all summaries and
    rounded to 4 decimal places.
    """
    if not summaries:
        return {'n_models': 0}

    def _mean(key: str) -> float:
        return round(sum(float(s[key]) for s in summaries) / len(summaries), 4)

    # Output key -> source key in each per-model summary.
    metric_keys = {
        'avg_first_accuracy': 'first_accuracy',
        'avg_primary_accuracy': 'primary_accuracy',
        'avg_chain_accuracy': 'chain_accuracy',
        'avg_success_rate': 'success_rate',
        'avg_tool_calls': 'avg_tool_calls',
        'avg_score_total': 'avg_score_total',
    }
    result: dict[str, Any] = {'n_models': len(summaries)}
    for out_key, src_key in metric_keys.items():
        result[out_key] = _mean(src_key)
    return result
|
|
|
|
def write_outputs(summaries: list[dict[str, Any]], agg: dict[str, Any], out_dir: Path) -> None:
    """Persist the batch summary under *out_dir* as JSON, CSV, and Markdown.

    Creates *out_dir* if needed, then writes three sibling files
    (``tool_routing_batch_summary.{json,csv,md}``) and prints their paths.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    json_path = out_dir / 'tool_routing_batch_summary.json'
    csv_path = out_dir / 'tool_routing_batch_summary.csv'
    md_path = out_dir / 'tool_routing_batch_summary.md'

    # JSON: aggregate means plus the full per-model summary rows.
    json_path.write_text(
        json.dumps({'aggregate': agg, 'models': summaries}, indent=2),
        encoding='utf-8',
    )

    # CSV: one row per model, fixed column order matching the JSON keys.
    columns = [
        'model', 'n_cases', 'first_accuracy', 'primary_accuracy', 'chain_accuracy',
        'success_rate', 'avg_tool_calls', 'avg_score_total',
    ]
    with csv_path.open('w', newline='', encoding='utf-8') as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        for row in summaries:
            writer.writerow({col: row[col] for col in columns})

    # Markdown: aggregate bullet list followed by a per-model table.
    md_lines = [
        '# Tool Routing Batch Summary',
        '',
        f"- Models: **{agg.get('n_models', 0)}**",
        '',
        '## Aggregate means',
        '',
        f"- Avg first-tool accuracy: **{agg.get('avg_first_accuracy')}**",
        f"- Avg primary-tool accuracy: **{agg.get('avg_primary_accuracy')}**",
        f"- Avg chain accuracy: **{agg.get('avg_chain_accuracy')}**",
        f"- Avg success rate: **{agg.get('avg_success_rate')}**",
        f"- Avg tool calls: **{agg.get('avg_tool_calls')}**",
        f"- Avg score (/10): **{agg.get('avg_score_total')}**",
        '',
        '## Per-model',
        '',
        '| Model | Cases | First acc | Primary acc | Chain acc | Success | Avg calls | Avg score |',
        '|---|---:|---:|---:|---:|---:|---:|---:|',
    ]
    md_lines.extend(
        f"| {s['model']} | {s['n_cases']} | {s['first_accuracy']} | {s['primary_accuracy']} | {s['chain_accuracy']} | {s['success_rate']} | {s['avg_tool_calls']} | {s['avg_score_total']} |"
        for s in summaries
    )
    md_path.write_text('\n'.join(md_lines) + '\n', encoding='utf-8')

    print('\nWrote:')
    print(f'- {json_path}')
    print(f'- {csv_path}')
    print(f'- {md_path}')
|
|
|
|
def main() -> None:
    """CLI entry point: score each requested model, then aggregate and report.

    Runs the single-model scorer once per model, loads each produced summary,
    sorts models best-first (first-tool accuracy, then primary accuracy, then
    total score; ties broken by fewer tool calls, then model name), and writes
    the combined JSON/CSV/Markdown outputs.
    """
    parser = argparse.ArgumentParser(description='Batch runner for tool-routing/confusion benchmark')
    parser.add_argument('--models', required=True, help='Comma-separated model IDs')
    parser.add_argument('--agent', required=True, help='Router agent name')
    parser.add_argument('--agent-cards', type=Path, required=True, help='Path containing router agent card and tools')
    parser.add_argument('--prompts', type=Path, default=ROOT / 'scripts' / 'tool_routing_challenges.txt')
    parser.add_argument('--expected', type=Path, default=ROOT / 'scripts' / 'tool_routing_expected.json')
    parser.add_argument('--start', type=int, default=1)
    parser.add_argument('--end', type=int, default=20)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--out-dir', type=Path, default=OUT_DIR)
    parser.add_argument('--raw-results-dir', type=Path, default=None, help='Root directory for fast-agent --results JSON files')
    args = parser.parse_args()

    # Drop empty entries so trailing/duplicate commas are harmless.
    models = [part.strip() for part in args.models.split(',') if part.strip()]

    for model_id in models:
        run_one(
            model=model_id,
            agent=args.agent,
            agent_cards=args.agent_cards,
            prompts=args.prompts,
            expected=args.expected,
            start=args.start,
            end=args.end,
            timeout=args.timeout,
            out_dir=args.out_dir,
            raw_results_dir=args.raw_results_dir,
        )

    summaries = [load_model_summary(model_id, args.out_dir) for model_id in models]
    # Best models first; negated metrics give descending order for accuracy/score.
    summaries.sort(
        key=lambda s: (
            -s['first_accuracy'],
            -s['primary_accuracy'],
            -s['avg_score_total'],
            s['avg_tool_calls'],
            s['model'],
        )
    )
    write_outputs(summaries, aggregate(summaries), args.out_dir)
|
|
|
|
# Run the batch only when executed as a script; importing stays side-effect free.
if __name__ == '__main__':
    main()
|
|