# Agents_Final_Assignment / test_agent.py
# Author: mabelwang21
# Commit 3cc0589: quick check accuracy
import json
from pathlib import Path
from agent3 import MyAgent
def test_agent(
    metadata_path: str = "metadata.jsonl",
    max_tests: int = 5,
):
    """Run MyAgent on the first records of a GAIA metadata JSONL file.

    Each non-blank line of the file is expected to hold one JSON object.
    The question is read from ``"Question"`` (or ``"text"``), the reference
    answer from ``"Final answer"`` / ``"final answer"`` / ``"Answer"``, and an
    optional attachment name from ``"file_name"``. The agent's answer and the
    reference answer are compared after ``str()``, ``strip()`` and
    ``lower()``. Progress and a final summary are printed to stdout.

    Args:
        metadata_path: Path to the JSONL metadata file.
        max_tests: Maximum number of non-blank lines to consume. Lines that
            turn out to be invalid or to lack a question still count towards
            this limit.

    Returns:
        None. If the agent cannot be constructed or the metadata file is
        missing, a message is printed and the function returns early.
    """
    import traceback

    try:
        agent = MyAgent()
    except Exception as e:
        print(f"Error initializing agent: {e}")
        return

    metadata_file = Path(metadata_path)
    # is_file() also rejects directories, which open() could not read anyway.
    if not metadata_file.is_file():
        print(f"Metadata file not found: {metadata_path}")
        return

    def normalize(value):
        """Return a case- and whitespace-insensitive form for comparison."""
        return str(value).strip().lower()

    correct_count = 0
    total_count = 0  # questions handed to the agent, including failed runs
    error_count = 0  # agent runs that raised
    seen = 0  # non-blank lines consumed so far

    with open(metadata_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if seen >= max_tests:
                break
            # Blank lines carry no record; do not report them as bad JSON
            # and do not let them use up the test budget.
            if not line.strip():
                continue
            seen += 1

            try:
                meta = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {line_no}")
                continue
            # Valid JSON that is not an object (list, string, null, ...) has
            # no .get(); skip it instead of crashing the whole run.
            if not isinstance(meta, dict):
                print(f"Skipping line {line_no}: expected a JSON object")
                continue

            task_id = meta.get("task_id") or meta.get("id") or ""
            question = meta.get("Question") or meta.get("text") or ""
            correct_answer = (
                meta.get("Final answer")
                or meta.get("final answer")
                or meta.get("Answer")
                or ""
            )

            print(f"--- Test {seen}/{max_tests}: Task ID {task_id} ---")
            print(f"Question: {question}")
            if not question:
                print("Skipping: no question found\n")
                continue

            file_arg = meta.get("file_name")
            # Count the attempt up front so that a crashing agent lowers the
            # reported accuracy instead of silently dropping out of it.
            total_count += 1
            try:
                if file_arg:
                    answer = agent.run(question, file_paths=[file_arg])
                else:
                    answer = agent.run(question)
            except Exception as e:
                context = " with file" if file_arg else ""
                print(f"Error running agent{context}: {e}")
                print(traceback.format_exc())
                error_count += 1
                continue

            print(f"Agent Answer: {answer}")
            print(f"Correct Answer: {correct_answer}")
            if normalize(answer) == normalize(correct_answer):
                print("✅ MATCH\n")
                correct_count += 1
            else:
                print("❌ NO MATCH\n")

    print("=== Final Results ===")
    print(f"Total Tests: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if error_count:
        print(f"Agent Errors: {error_count}")
    if total_count > 0:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("No valid tests run.")
if __name__ == "__main__":
    # Command-line entry point: read the metadata path and the test limit
    # from argv, then hand both to test_agent.
    import argparse

    parser = argparse.ArgumentParser(
        description="Test MyAgent with GAIA metadata."
    )
    # (flag, type, default, help) for every supported option.
    option_specs = (
        ("--metadata", str, "metadata.jsonl", "Path to GAIA metadata JSONL"),
        ("--max", int, 5, "Maximum number of tests to run"),
    )
    for flag, value_type, fallback, description in option_specs:
        parser.add_argument(
            flag, type=value_type, default=fallback, help=description
        )

    args = parser.parse_args()
    test_agent(metadata_path=args.metadata, max_tests=args.max)