File size: 3,512 Bytes
fec32f4
 
3cc0589
fec32f4
 
 
 
 
 
 
3cc0589
 
fec32f4
3cc0589
 
 
 
 
 
 
 
fec32f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cc0589
fec32f4
 
 
 
 
 
 
 
 
3cc0589
fec32f4
3cc0589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fec32f4
3cc0589
 
 
fec32f4
 
 
3cc0589
 
 
 
 
 
 
 
fec32f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
from pathlib import Path
from agent3 import MyAgent


def test_agent(
    metadata_path: str = "metadata.jsonl",
    max_tests: int = 5,
):
    """Run MyAgent on the first lines of a GAIA metadata JSONL file.

    Each line is parsed as one JSON object. The question is passed to
    ``agent.run`` (together with ``file_paths=[file_name]`` when the record
    has a non-empty ``file_name``), and the answer is compared with the
    record's expected answer after stripping whitespace and lower-casing
    both sides. Per-question output and a final summary are printed.

    Args:
        metadata_path: Path to the JSONL metadata file.
        max_tests: Maximum number of lines read from the file. Lines that
            are invalid or have no question still count toward this limit.

    Returns:
        None. Every result, including errors, is reported on standard
        output; questions whose agent run raised are excluded from the
        totals in the summary.
    """
    import traceback

    # Validate the path first so a bad path is reported without first
    # constructing the agent. is_file() also rejects directories, which
    # would otherwise fail inside open().
    metadata_file = Path(metadata_path)
    if not metadata_file.is_file():
        print(f"Metadata file not found: {metadata_path}")
        return

    try:
        agent = MyAgent()
    except Exception as e:
        print(f"Error initializing agent: {e}")
        return

    correct_count = 0
    total_count = 0

    with open(metadata_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_tests:
                break
            try:
                meta = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {i+1}")
                continue
            # Valid JSON that is not an object (list, string, number...) has
            # no .get(); skip it rather than aborting the whole run.
            if not isinstance(meta, dict):
                print(f"Skipping line {i+1}: expected a JSON object")
                continue

            task_id = meta.get("task_id") or meta.get("id") or ""
            question = meta.get("Question") or meta.get("text") or ""
            correct_answer = (
                meta.get("Final answer")
                or meta.get("final answer")
                or meta.get("Answer")
                or ""
            )

            print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
            print(f"Question: {question}")

            if not question:
                print("Skipping: no question found\n")
                continue

            # Only pass file_paths when the record actually names a file.
            file_arg = meta.get("file_name")
            run_kwargs = {"file_paths": [file_arg]} if file_arg else {}
            try:
                answer = agent.run(question, **run_kwargs)
            except Exception as e:
                context = " with file" if file_arg else ""
                print(f"Error running agent{context}: {e}")
                print(traceback.format_exc())
                continue

            # Best-effort reporting: a failure while printing or comparing
            # one answer must not stop the remaining questions.
            try:
                print(f"Agent Answer: {answer}")
                print(f"Correct Answer: {correct_answer}")

                if _normalize_answer(answer) == _normalize_answer(correct_answer):
                    print("✅ MATCH\n")
                    correct_count += 1
                else:
                    print("❌ NO MATCH\n")
                total_count += 1
            except Exception as e:
                print(f"Error running agent on question '{question}': {e}\n")

    print("=== Final Results ===")
    print(f"Total Tests: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if total_count > 0:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("No valid tests run.")


def _normalize_answer(value):
    """Return ``value`` as a stripped, lower-cased string for comparison."""
    return str(value).strip().lower()


if __name__ == "__main__":
    # Command-line entry point: read the metadata path and the test limit
    # from the arguments, then hand both to test_agent.
    import argparse

    cli = argparse.ArgumentParser(
        description="Test MyAgent with GAIA metadata."
    )
    cli.add_argument(
        "--metadata",
        default="metadata.jsonl",
        type=str,
        help="Path to GAIA metadata JSONL",
    )
    cli.add_argument(
        "--max",
        default=5,
        type=int,
        help="Maximum number of tests to run",
    )
    options = cli.parse_args()
    test_agent(options.metadata, options.max)