| |
| """ |
| Phase 1 Validation Script |
| Quick test to verify conflict detection is working. |
| """ |
|
|
| import sys |
| import json |
| from pathlib import Path |
|
|
| |
# Make the project root importable so `reasoning_forge` and `evaluation`
# packages resolve when this script is run directly.
# NOTE(review): assumes this file lives exactly one directory below the
# project root — confirm if the script is moved.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
|
|
def test_imports():
    """Verify that every Phase 1 module can be imported.

    Returns True when all four imports succeed; False (after printing the
    traceback) when any of them raises.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("PHASE 1 VALIDATION: IMPORT TEST")
    print(rule + "\n")

    # NOTE(review): the "β" status markers below look like mojibake for
    # check/cross glyphs — confirm the source file's original encoding.
    try:
        print("Importing TokenConfidenceEngine...")
        from reasoning_forge.token_confidence import TokenConfidenceEngine  # noqa: F401
        print(" β TokenConfidenceEngine imported")

        print("Importing ConflictEngine...")
        from reasoning_forge.conflict_engine import ConflictEngine  # noqa: F401
        print(" β ConflictEngine imported")

        print("Importing ForgeEngine...")
        from reasoning_forge.forge_engine import ForgeEngine  # noqa: F401
        print(" β ForgeEngine imported")

        print("Importing ConflictTestRunner...")
        from evaluation.conflict_tests import ConflictTestRunner  # noqa: F401
        print(" β ConflictTestRunner imported")
    except Exception as exc:
        print(f"\nβ IMPORT FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
    return True
|
|
|
|
def test_token_confidence_engine():
    """Test TokenConfidenceEngine basic functionality.

    Initializes the engine and scores a short sample response against two
    peer responses, printing summary statistics. Returns True on success,
    False (after printing the traceback) on any error.
    """
    print("\n" + "=" * 80)
    print("PHASE 1 VALIDATION: TOKEN CONFIDENCE ENGINE")
    print("=" * 80 + "\n")

    try:
        from reasoning_forge.token_confidence import TokenConfidenceEngine

        engine = TokenConfidenceEngine()
        print("β TokenConfidenceEngine initialized")

        # A response mixing confident ("I'm confident", "clearly") and
        # hedged ("it's possible") language exercises the scoring path.
        test_response = (
            "I'm confident that this approach will work. However, it's possible that we'll "
            "encounter issues. The data clearly shows a trend towards improvement."
        )
        peer_responses = {
            "peer1": "This approach might be problematic in some cases.",
            "peer2": "I argue that this is fundamentally sound.",
        }

        scores = engine.score_tokens(test_response, "agent1", peer_responses)
        # Fixed: removed the `f` prefix from a literal with no placeholders.
        print("β Token confidence scoring completed")
        print(f" - Claims extracted: {len(scores.claims)}")
        print(f" - Token scores: {len(scores.token_scores)} tokens")
        # max(..., 1) guards the division against an empty token-score list.
        print(f" - Mean confidence: {sum(scores.token_scores) / max(len(scores.token_scores), 1):.3f}")

        return True

    except Exception as e:
        print(f"\nβ TOKEN CONFIDENCE TEST FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def test_conflict_engine():
    """Test ConflictEngine basic functionality.

    Builds a ConflictEngine over a TokenConfidenceEngine, runs conflict
    detection on two deliberately opposed agent analyses, and prints the
    top conflict if one is found. Returns True on success, False (after
    printing the traceback) on any error.
    """
    print("\n" + "=" * 80)
    print("PHASE 1 VALIDATION: CONFLICT ENGINE")
    print("=" * 80 + "\n")

    try:
        from reasoning_forge.token_confidence import TokenConfidenceEngine
        from reasoning_forge.conflict_engine import ConflictEngine

        token_conf = TokenConfidenceEngine()
        conflict_engine = ConflictEngine(token_confidence_engine=token_conf)
        print("β ConflictEngine initialized")

        # Two agents take directly contradictory positions (deterministic
        # vs. probabilistic) so at least one conflict should surface.
        agent_analyses = {
            "agent_a": "The algorithm must be deterministic for maximum control. "
                       "This ensures predictability and reliability in all cases.",
            "agent_b": "A probabilistic approach is superior because it captures the "
                       "inherent uncertainty in real-world systems. Determinism is rigid.",
        }

        conflicts = conflict_engine.detect_conflicts(agent_analyses)
        # Fixed: removed the `f` prefix from a literal with no placeholders.
        print("β Conflict detection completed")
        print(f" - Conflicts detected: {len(conflicts)}")

        if conflicts:
            top_conflict = conflicts[0]
            # Fixed: removed the `f` prefix from a literal with no placeholders.
            print("\n Top conflict:")
            print(f" - Type: {top_conflict.conflict_type}")
            print(f" - Strength: {top_conflict.conflict_strength:.3f}")
            print(f" - Agent A claim: {top_conflict.claim_a[:60]}...")
            print(f" - Agent B claim: {top_conflict.claim_b[:60]}...")
            print(f" - Overlap: {top_conflict.semantic_overlap:.3f}")

        return True

    except Exception as e:
        print(f"\nβ CONFLICT ENGINE TEST FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def test_forge_integration():
    """Test that ForgeEngine initializes with conflict detection.

    Confirms both Phase 1 sub-engines are attached to the forge, runs a
    one-round forge_with_debate() call, and reports selected metadata.
    Returns True on success, False (after printing the traceback) on error.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("PHASE 1 VALIDATION: FORGE ENGINE INTEGRATION")
    print(rule + "\n")

    try:
        from reasoning_forge.forge_engine import ForgeEngine

        print("Initializing ForgeEngine with conflict detection...")
        forge = ForgeEngine()
        print("β ForgeEngine initialized")

        # Both Phase 1 engines must be wired onto the forge instance.
        attribute_checks = (
            ("token_confidence",
             "ForgeEngine missing token_confidence engine",
             "β TokenConfidenceEngine attached to ForgeEngine"),
            ("conflict_engine",
             "ForgeEngine missing conflict_engine",
             "β ConflictEngine attached to ForgeEngine"),
        )
        for attr_name, missing_msg, ok_msg in attribute_checks:
            if not hasattr(forge, attr_name):
                raise AttributeError(missing_msg)
            print(ok_msg)

        print("\nTesting forge_with_debate() on a simple concept...")
        print(" (This may take a moment without GPU acceleration)")

        result = forge.forge_with_debate(
            "Should an algorithm prioritize speed or clarity?", debate_rounds=1
        )

        metadata = result.get("metadata", {})
        print("β forge_with_debate() completed successfully")

        # Round-0 conflict count defaults to 0 when the key is absent.
        print(f" - Conflicts detected (R0): {metadata.get('conflicts_round_0_count', 0)}")

        if "debate_log" in metadata:
            print(f" - Debate rounds logged: {len(metadata['debate_log'])}")

        if "ensemble_coherence" in metadata:
            print(f" - Ensemble coherence: {metadata['ensemble_coherence']:.3f}")

        return True

    except Exception as exc:
        print(f"\nβ FORGE INTEGRATION TEST FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def main():
    """Run every Phase 1 validation test and print a pass/fail summary.

    Returns a process exit code: 0 when all tests pass, 1 when any test
    fails or the run is interrupted by the user.
    """
    rule = "=" * 80
    print("\n")
    print(rule)
    print("CODETTE PHASE 1: CONFLICT DETECTION - VALIDATION SUITE")
    print(rule)

    suite = [
        ("Imports", test_imports),
        ("Token Confidence Engine", test_token_confidence_engine),
        ("Conflict Engine", test_conflict_engine),
        ("Forge Integration", test_forge_integration),
    ]

    outcomes = {}
    for name, runner in suite:
        try:
            outcomes[name] = runner()
        except KeyboardInterrupt:
            # Ctrl-C aborts the whole suite with a failure exit code.
            print("\n\nβ Tests interrupted by user")
            return 1
        except Exception as exc:
            # A crash in a test is recorded as a failure, not fatal.
            print(f"\nβ Unexpected error in {name}: {exc}")
            outcomes[name] = False

    print("\n" + rule)
    print("VALIDATION SUMMARY")
    print(rule + "\n")

    passed = sum(1 for ok in outcomes.values() if ok)
    total = len(outcomes)

    for name, ok in outcomes.items():
        print(f" {'β PASS' if ok else 'β FAIL'}: {name}")

    print(f"\n Total: {passed}/{total} tests passed\n")

    if passed != total:
        print(f"β {total - passed} validation(s) failed. Check errors above.")
        return 1
    print("β All Phase 1 validations passed! Ready for testing.")
    return 0
|
|
|
|
# Script entry point: propagate main()'s status (0 = all tests passed,
# 1 = failure or interruption) as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
|