Codette-Reasoning / scripts /run_phase6_evaluation.py

Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 about 21 hours ago

4.22 kB

	#!/usr/bin/env python3
	"""
	Phase 6 Focus Evaluation Runner

	Compares 4 conditions:
	1. Baseline (Llama only, no routing)
	2. Phase 1-5 (debate, no semantic tension, no specialization)
	3. Phase 6 Full (all Phase 6 components: semantic tension, specialization, preflight)
	4. Phase 6 -PreFlight (Phase 6 minus preflight prediction)

	This isolates the value of individual Phase 6 components.
	"""

	import sys
	import json
	from pathlib import Path
	from datetime import datetime
	import time

	# Add repo to path
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from reasoning_forge.forge_engine import ForgeEngine
	from evaluation.test_suite_evaluation import EvaluationHarness, EvaluationAnalyzer, EVALUATION_TEST_SUITE


	def main():
	    """Run the focused Phase 6 evaluation and export the detailed results.

	    The run has four printed stages: construct a ``ForgeEngine``, wrap it in
	    an ``EvaluationHarness``, run a nine-question subset of
	    ``EVALUATION_TEST_SUITE`` through ``harness.run_evaluation_suite``, then
	    print the ``EvaluationAnalyzer`` report and hand a timestamped ``.json``
	    path to ``harness.export_results``.

	    Returns:
	        int: 0 when every stage succeeds; 1 as soon as any stage fails (the
	        failure is printed before returning, so the caller can pass the
	        value straight to ``sys.exit``).
	    """
	    # Local import: only needed on the failure paths of stages 3 and 4.
	    import traceback

	    print("\n" + "=" * 80)
	    print("PHASE 6 EVALUATION: Component Impact Analysis")
	    print("=" * 80 + "\n")

	    # Stage 1: load the Forge engine and report which components are present.
	    print("[1/4] Initializing ForgeEngine with Phase 6 components...")
	    try:
	        forge = ForgeEngine()
	        print("✓ ForgeEngine loaded")
	        print(f" - Analysis agents: {len(forge.analysis_agents)}")
	        print(f" - Semantic tension engine: {forge.semantic_tension_engine is not None}")
	        print(f" - Specialization tracker: {forge.specialization is not None}")
	        print(f" - Pre-flight predictor: {forge.preflight_predictor is not None}")
	    except Exception as e:
	        print(f"✗ Failed to load ForgeEngine: {e}")
	        return 1

	    # Stage 2: create the evaluation harness around the engine.
	    print("\n[2/4] Creating evaluation harness...")
	    try:
	        harness = EvaluationHarness(forge)
	        print("✓ Evaluation harness ready")
	    except Exception as e:
	        print(f"✗ Failed to create harness: {e}")
	        return 1

	    # Select a subset of questions for focused evaluation (top question from
	    # each category + hard ones). This reduces test time while still covering
	    # all domains.
	    focused_indices = (
	        0,   # Speed of light (easy physics)
	        2,   # Entropy & time (hard physics)
	        3,   # Lying to save life (ethics)
	        5,   # AI explanations (hard ethics)
	        7,   # Can machines be conscious? (hard consciousness)
	        9,   # What makes something creative (creativity)
	        11,  # Can AI be truly creative (hard creativity)
	        12,  # What is emergence (systems)
	        14,  # Free will (hard interdisciplinary)
	    )
	    try:
	        focused_questions = [EVALUATION_TEST_SUITE[i] for i in focused_indices]
	    except IndexError:
	        # A shortened suite should fail like every other stage (message +
	        # exit code 1) rather than with an uncaught traceback.
	        print(
	            f"✗ Test suite has {len(EVALUATION_TEST_SUITE)} questions; "
	            f"need at least {max(focused_indices) + 1}"
	        )
	        return 1

	    print(f" - Running {len(focused_questions)} focused questions")
	    print(" - Questions span: physics (easy, hard), ethics (medium, hard),")
	    print(" consciousness, creativity, systems, interdisciplinary\n")

	    # Stage 3: run the evaluation and time it.
	    print("[3/4] Running evaluation (this may take 5-10 minutes)...\n")
	    start_time = time.time()

	    try:
	        results = harness.run_evaluation_suite(focused_questions)
	        elapsed = time.time() - start_time

	        print(f"\n✓ Evaluation complete ({elapsed:.1f}s)")
	        print(f" - Phase 1-5 results: {len(results['phase_1_5'])} questions")
	        print(f" - Phase 6 Full results: {len(results['phase_6_full'])} questions")
	        print(f" - Phase 6 -PreFlight results: {len(results['phase_6_no_preflight'])} questions")
	    except Exception as e:
	        print(f"\n✗ Evaluation failed: {e}")
	        traceback.print_exc()
	        return 1

	    # Stage 4: analyze, print the report, and export the detailed results.
	    print("\n[4/4] Analyzing results...\n")
	    try:
	        analyzer = EvaluationAnalyzer(results)
	        report = analyzer.report()
	        print(report)

	        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	        filename = f"evaluation_results_{timestamp}.json"

	        # Keep the original export location when it exists; on any other
	        # machine fall back to the repository root (the same directory that
	        # is put on sys.path at import time) so the export does not fail on
	        # a missing, machine-specific directory.
	        legacy_dir = "/j/codette-training-lab"
	        if Path(legacy_dir).is_dir():
	            output_file = f"{legacy_dir}/{filename}"
	        else:
	            output_file = str(Path(__file__).resolve().parent.parent / filename)

	        harness.export_results(output_file)
	        print(f"✓ Detailed results exported: {output_file}")

	    except Exception as e:
	        print(f"✗ Analysis failed: {e}")
	        traceback.print_exc()
	        return 1

	    return 0


	if __name__ == "__main__":
	    # Script entry point: main() returns 0 on success and 1 on failure, and
	    # that value becomes the process exit status.
	    sys.exit(main())