| { |
| "timestamp": 1774284729.3695014, |
| "results": { |
| "Phase_6_Only": { |
| "overall_accuracy": 0.5, |
| "accuracy_by_category": { |
| "factual_easy": 0.5, |
| "factual_medium": 1.0, |
| "conceptual_medium": 0.5, |
| "reasoning_medium": 1.0, |
| "tricky_medium": 0.0, |
| "nuanced_hard": 0.0, |
| "meta_loop_prone": 0.5 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 0.5, |
| "2": 0.625, |
| "3": 0.25 |
| }, |
| "avg_latency_ms": 0.2077141080583845, |
| "total_tests": 14, |
| "correct_count": 7, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.809056854248047 |
| }, |
| "factual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10834465026855469 |
| }, |
| "conceptual_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.10786781311035157 |
| }, |
| "reasoning_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10739097595214844 |
| }, |
| "tricky_medium": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.10751018524169922 |
| }, |
| "nuanced_hard": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.10751018524169922 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.10631809234619141 |
| } |
| } |
| }, |
| "Phase_6_Plus_13": { |
| "overall_accuracy": 0.7857142857142857, |
| "accuracy_by_category": { |
| "factual_easy": 1.0, |
| "factual_medium": 1.0, |
| "conceptual_medium": 1.0, |
| "reasoning_medium": 1.0, |
| "tricky_medium": 1.0, |
| "nuanced_hard": 0.5, |
| "meta_loop_prone": 0.0 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 1.0, |
| "2": 1.0, |
| "3": 0.25 |
| }, |
| "avg_latency_ms": 0.10701631818498884, |
| "total_tests": 14, |
| "correct_count": 11, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10751018524169922 |
| }, |
| "factual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10667572021484376 |
| }, |
| "conceptual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10643730163574219 |
| }, |
| "reasoning_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10762939453125 |
| }, |
| "tricky_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10727176666259766 |
| }, |
| "nuanced_hard": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1070333480834961 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.10655651092529297 |
| } |
| } |
| }, |
| "Phase_6_Plus_13_Plus_14": { |
| "overall_accuracy": 0.5714285714285714, |
| "accuracy_by_category": { |
| "factual_easy": 0.5, |
| "factual_medium": 1.0, |
| "conceptual_medium": 0.5, |
| "reasoning_medium": 0.5, |
| "tricky_medium": 1.0, |
| "nuanced_hard": 0.5, |
| "meta_loop_prone": 0.0 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 0.5, |
| "2": 0.75, |
| "3": 0.25 |
| }, |
| "avg_latency_ms": 0.10691413879394532, |
| "total_tests": 14, |
| "correct_count": 8, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.10715255737304688 |
| }, |
| "factual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.10667572021484376 |
| }, |
| "conceptual_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.10655651092529297 |
| }, |
| "reasoning_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.10727176666259766 |
| }, |
| "tricky_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1070333480834961 |
| }, |
| "nuanced_hard": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1070333480834961 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.10667572021484376 |
| } |
| } |
| } |
| }, |
| "summary": { |
| "phase6_accuracy": 0.5, |
| "phase6_13_accuracy": 0.7857142857142857, |
| "phase6_13_14_accuracy": 0.5714285714285714, |
| "improvement_13_pct": 57.14285714285714, |
| "improvement_14_pct": -27.272727272727277, |
| "total_improvement_pct": 138.0952380952381 |
| } |
| } |