import json
import traceback
from pathlib import Path

from agent3 import MyAgent


def _normalize(value) -> str:
    """Return *value* as a stripped, lower-cased string for comparison."""
    return str(value).strip().lower()


def _first_truthy(record: dict, *keys: str):
    """Return the first truthy ``record[key]`` among *keys*, or ``""``."""
    for key in keys:
        value = record.get(key)
        if value:
            return value
    return ""


def test_agent(
    metadata_path: str = "metadata.jsonl",
    max_tests: int = 5,
) -> None:
    """Run MyAgent on questions from a GAIA metadata JSONL file and report accuracy.

    Only the first ``max_tests`` lines of the file are read; lines that are
    invalid or carry no question still count toward that limit. For every
    usable record the agent's answer is compared with the reference answer
    after stripping whitespace and lower-casing both sides.

    Records for which the agent raises are reported with a traceback and are
    NOT included in "Total Tests", so the printed accuracy covers only the
    questions that actually produced an answer.

    Args:
        metadata_path: Path to the JSONL file, one JSON object per line.
        max_tests: Maximum number of lines to read from the file.

    Returns:
        None. All results are printed to stdout.
    """
    # The agent is built before the file is checked, so construction errors
    # are reported even when the metadata path is also wrong.
    try:
        agent = MyAgent()
    except Exception as e:
        print(f"Error initializing agent: {e}")
        return

    metadata_file = Path(metadata_path)
    if not metadata_file.exists():
        print(f"Metadata file not found: {metadata_path}")
        return

    correct_count = 0
    total_count = 0

    with open(metadata_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_tests:
                break

            try:
                meta = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {i+1}")
                continue

            # Valid JSON that is not an object (list, string, number, ...)
            # has no .get(); skip it instead of crashing the whole run.
            if not isinstance(meta, dict):
                print(f"Skipping line {i+1}: expected a JSON object")
                continue

            task_id = _first_truthy(meta, "task_id", "id")
            question = _first_truthy(meta, "Question", "text")
            correct_answer = _first_truthy(
                meta, "Final answer", "final answer", "Answer"
            )

            print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
            print(f"Question: {question}")

            if not question:
                print("Skipping: no question found\n")
                continue

            # Pass the attachment only when the record names one.
            file_arg = meta.get("file_name")
            run_kwargs = {"file_paths": [file_arg]} if file_arg else {}
            try:
                answer = agent.run(question, **run_kwargs)
            except Exception as e:
                suffix = " with file" if file_arg else ""
                print(f"Error running agent{suffix}: {e}")
                print(traceback.format_exc())
                continue

            # Guard the reporting step as well: converting or printing the
            # answer can itself fail (e.g. console encoding), and one bad
            # record should not abort the remaining tests.
            try:
                print(f"Agent Answer: {answer}")
                print(f"Correct Answer: {correct_answer}")

                if _normalize(answer) == _normalize(correct_answer):
                    print("✅ MATCH\n")
                    correct_count += 1
                else:
                    print("❌ NO MATCH\n")
                total_count += 1
            except Exception as e:
                print(f"Error running agent on question '{question}': {e}\n")

    print("=== Final Results ===")
    print(f"Total Tests: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if total_count > 0:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("No valid tests run.")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Test MyAgent with GAIA metadata.")
    parser.add_argument(
        "--metadata",
        type=str,
        default="metadata.jsonl",
        help="Path to GAIA metadata JSONL",
    )
    parser.add_argument(
        "--max",
        type=int,
        default=5,
        help="Maximum number of tests to run",
    )
    args = parser.parse_args()
    test_agent(args.metadata, args.max)