{ "timestamp": 1774284729.3695014, "results": { "Phase_6_Only": { "overall_accuracy": 0.5, "accuracy_by_category": { "factual_easy": 0.5, "factual_medium": 1.0, "conceptual_medium": 0.5, "reasoning_medium": 1.0, "tricky_medium": 0.0, "nuanced_hard": 0.0, "meta_loop_prone": 0.5 }, "accuracy_by_difficulty": { "1": 0.5, "2": 0.625, "3": 0.25 }, "avg_latency_ms": 0.2077141080583845, "total_tests": 14, "correct_count": 7, "category_stats": { "factual_easy": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.809056854248047 }, "factual_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10834465026855469 }, "conceptual_medium": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.10786781311035157 }, "reasoning_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10739097595214844 }, "tricky_medium": { "accuracy": 0.0, "count": 2, "avg_latency_ms": 0.10751018524169922 }, "nuanced_hard": { "accuracy": 0.0, "count": 2, "avg_latency_ms": 0.10751018524169922 }, "meta_loop_prone": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.10631809234619141 } } }, "Phase_6_Plus_13": { "overall_accuracy": 0.7857142857142857, "accuracy_by_category": { "factual_easy": 1.0, "factual_medium": 1.0, "conceptual_medium": 1.0, "reasoning_medium": 1.0, "tricky_medium": 1.0, "nuanced_hard": 0.5, "meta_loop_prone": 0.0 }, "accuracy_by_difficulty": { "1": 1.0, "2": 1.0, "3": 0.25 }, "avg_latency_ms": 0.10701631818498884, "total_tests": 14, "correct_count": 11, "category_stats": { "factual_easy": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10751018524169922 }, "factual_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10667572021484376 }, "conceptual_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10643730163574219 }, "reasoning_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10762939453125 }, "tricky_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10727176666259766 }, "nuanced_hard": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.1070333480834961 }, "meta_loop_prone": { "accuracy": 0.0, "count": 2, "avg_latency_ms": 0.10655651092529297 } } }, "Phase_6_Plus_13_Plus_14": { "overall_accuracy": 0.5714285714285714, "accuracy_by_category": { "factual_easy": 0.5, "factual_medium": 1.0, "conceptual_medium": 0.5, "reasoning_medium": 0.5, "tricky_medium": 1.0, "nuanced_hard": 0.5, "meta_loop_prone": 0.0 }, "accuracy_by_difficulty": { "1": 0.5, "2": 0.75, "3": 0.25 }, "avg_latency_ms": 0.10691413879394532, "total_tests": 14, "correct_count": 8, "category_stats": { "factual_easy": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.10715255737304688 }, "factual_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.10667572021484376 }, "conceptual_medium": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.10655651092529297 }, "reasoning_medium": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.10727176666259766 }, "tricky_medium": { "accuracy": 1.0, "count": 2, "avg_latency_ms": 0.1070333480834961 }, "nuanced_hard": { "accuracy": 0.5, "count": 2, "avg_latency_ms": 0.1070333480834961 }, "meta_loop_prone": { "accuracy": 0.0, "count": 2, "avg_latency_ms": 0.10667572021484376 } } } }, "summary": { "phase6_accuracy": 0.5, "phase6_13_accuracy": 0.7857142857142857, "phase6_13_14_accuracy": 0.5714285714285714, "improvement_13_pct": 57.14285714285714, "improvement_14_pct": -27.272727272727277, "total_improvement_pct": 138.0952380952381 } }