| |
| """ |
| Phase 6 Focus Evaluation Runner |
| |
| Compares 4 conditions: |
| 1. Baseline (Llama only, no routing) |
| 2. Phase 1-5 (debate, no semantic tension, no specialization) |
| 3. Phase 6 Full (all Phase 6 components: semantic tension, specialization, preflight) |
| 4. Phase 6 -PreFlight (Phase 6 minus preflight prediction) |
| |
| This isolates the value of individual Phase 6 components. |
| """ |
|
|
| import sys |
| import json |
| from pathlib import Path |
| from datetime import datetime |
| import time |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| from reasoning_forge.forge_engine import ForgeEngine |
| from evaluation.test_suite_evaluation import EvaluationHarness, EvaluationAnalyzer, EVALUATION_TEST_SUITE |
|
|
|
|
def main():
    """Run the focused Phase 6 evaluation end to end.

    The run has four stages, each announced with a ``[n/4]`` banner:

    1. Build a ``ForgeEngine`` and print which Phase 6 components it exposes.
    2. Wrap the engine in an ``EvaluationHarness`` and pick a fixed subset
       of ``EVALUATION_TEST_SUITE``.
    3. Run the harness over that subset and print per-condition counts.
    4. Print the ``EvaluationAnalyzer`` report and export the results to a
       timestamped ``evaluation_results_*.json`` file.

    Returns:
        int: 0 when every stage succeeds, 1 as soon as any stage fails.
        Failures are reported on stdout rather than raised, so the value
        can be handed straight to ``sys.exit``.
    """
    import traceback  # only needed on failure paths; kept function-local

    rule = "=" * 80
    print("\n" + rule)
    print("PHASE 6 EVALUATION: Component Impact Analysis")
    print(rule + "\n")

    # ---- Stage 1: engine --------------------------------------------------
    print("[1/4] Initializing ForgeEngine with Phase 6 components...")
    try:
        forge = ForgeEngine()
        print("✓ ForgeEngine loaded")
        print(f" - Analysis agents: {len(forge.analysis_agents)}")
        print(f" - Semantic tension engine: {forge.semantic_tension_engine is not None}")
        print(f" - Specialization tracker: {forge.specialization is not None}")
        print(f" - Pre-flight predictor: {forge.preflight_predictor is not None}")
    except Exception as e:
        print(f"✗ Failed to load ForgeEngine: {e}")
        return 1

    # ---- Stage 2: harness and question subset -----------------------------
    print("\n[2/4] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print("✓ Evaluation harness ready")
    except Exception as e:
        print(f"✗ Failed to create harness: {e}")
        return 1

    # Fixed positions within EVALUATION_TEST_SUITE that make up the focused
    # run. NOTE(review): presumably one per category/difficulty named in the
    # "Questions span" message below - confirm against the suite definition.
    focus_indices = (0, 2, 3, 5, 7, 9, 11, 12, 14)
    try:
        focused_questions = [EVALUATION_TEST_SUITE[i] for i in focus_indices]
    except LookupError as e:
        # A shorter or re-keyed suite used to escape as an unhandled
        # exception; report it like every other stage failure instead.
        print(f"✗ Failed to select focused questions: {e}")
        return 1

    print(f" - Running {len(focused_questions)} focused questions")
    print(" - Questions span: physics (easy, hard), ethics (medium, hard),")
    print(" consciousness, creativity, systems, interdisciplinary\n")

    # ---- Stage 3: evaluation ----------------------------------------------
    print("[3/4] Running evaluation (this may take 5-10 minutes)...\n")
    start_time = time.time()

    try:
        results = harness.run_evaluation_suite(focused_questions)
        elapsed = time.time() - start_time

        print(f"\n✓ Evaluation complete ({elapsed:.1f}s)")
        print(f" - Phase 1-5 results: {len(results['phase_1_5'])} questions")
        print(f" - Phase 6 Full results: {len(results['phase_6_full'])} questions")
        print(f" - Phase 6 -PreFlight results: {len(results['phase_6_no_preflight'])} questions")
    except Exception as e:
        print(f"\n✗ Evaluation failed: {e}")
        traceback.print_exc()
        return 1

    # ---- Stage 4: analysis ------------------------------------------------
    print("\n[4/4] Analyzing results...\n")
    try:
        analyzer = EvaluationAnalyzer(results)
        print(analyzer.report())
    except Exception as e:
        print(f"✗ Analysis failed: {e}")
        traceback.print_exc()
        return 1

    # ---- Export -----------------------------------------------------------
    # Handled separately from analysis so that a write failure is not
    # reported as an analysis failure.
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"evaluation_results_{timestamp}.json"

        # Keep writing to the original hard-coded directory when it exists;
        # on any other machine fall back to the directory two levels above
        # this file (the same location the script adds to sys.path).
        legacy_dir = "/j/codette-training-lab"
        if Path(legacy_dir).is_dir():
            output_file = f"{legacy_dir}/{filename}"
        else:
            output_file = str(Path(__file__).resolve().parent.parent / filename)

        harness.export_results(output_file)
        print(f"✓ Detailed results exported: {output_file}")
    except Exception as e:
        print(f"✗ Export failed: {e}")
        traceback.print_exc()
        return 1

    return 0
|
|
|
|
# Script entry point: run the evaluation and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|