# Codette-Reasoning/tests/validate_phase1.py
# Author: Jonathan Harrison
# Full Codette codebase sync — transparency release (commit 74f2af5)
#!/usr/bin/env python3
"""
Phase 1 Validation Script
Quick test to verify conflict detection is working.
"""
import sys
import json
from pathlib import Path
# Add the repository root (parent of tests/) to sys.path so the
# `reasoning_forge` and `evaluation` packages resolve when this script
# is run directly rather than via an installed package.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
def test_imports():
    """Verify that every Phase 1 module can be imported.

    Returns:
        True when all four classes import cleanly; False otherwise
        (the traceback is printed for diagnosis).
    """
    banner = "=" * 80
    print("\n" + banner)
    print("PHASE 1 VALIDATION: IMPORT TEST")
    print(banner + "\n")
    # (class name, module path) pairs, in dependency order.
    targets = [
        ("TokenConfidenceEngine", "reasoning_forge.token_confidence"),
        ("ConflictEngine", "reasoning_forge.conflict_engine"),
        ("ForgeEngine", "reasoning_forge.forge_engine"),
        ("ConflictTestRunner", "evaluation.conflict_tests"),
    ]
    try:
        for class_name, module_path in targets:
            print(f"Importing {class_name}...")
            # fromlist makes __import__ return the leaf module, not the package root.
            module = __import__(module_path, fromlist=[class_name])
            getattr(module, class_name)
            print(f" βœ“ {class_name} imported")
        return True
    except Exception as exc:
        print(f"\nβœ— IMPORT FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
def test_token_confidence_engine():
    """Smoke-test TokenConfidenceEngine on a synthetic response.

    Returns:
        True when initialization and scoring both succeed; False on any
        exception (traceback printed).
    """
    banner = "=" * 80
    print("\n" + banner)
    print("PHASE 1 VALIDATION: TOKEN CONFIDENCE ENGINE")
    print(banner + "\n")
    try:
        from reasoning_forge.token_confidence import TokenConfidenceEngine

        engine = TokenConfidenceEngine()
        print("βœ“ TokenConfidenceEngine initialized")

        # Synthetic text mixing confident and hedged semantic markers.
        test_response = (
            "I'm confident that this approach will work. However, it's possible that we'll "
            "encounter issues. The data clearly shows a trend towards improvement."
        )
        peer_responses = {
            "peer1": "This approach might be problematic in some cases.",
            "peer2": "I argue that this is fundamentally sound.",
        }
        scores = engine.score_tokens(test_response, "agent1", peer_responses)

        token_count = len(scores.token_scores)
        # max(..., 1) guards against division by zero on an empty score list.
        mean_confidence = sum(scores.token_scores) / max(token_count, 1)
        print("βœ“ Token confidence scoring completed")
        print(f" - Claims extracted: {len(scores.claims)}")
        print(f" - Token scores: {token_count} tokens")
        print(f" - Mean confidence: {mean_confidence:.3f}")
        return True
    except Exception as exc:
        print(f"\nβœ— TOKEN CONFIDENCE TEST FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
def test_conflict_engine():
    """Smoke-test ConflictEngine on two deliberately opposed analyses.

    Returns:
        True when detection runs without raising; False on any exception
        (traceback printed).
    """
    banner = "=" * 80
    print("\n" + banner)
    print("PHASE 1 VALIDATION: CONFLICT ENGINE")
    print(banner + "\n")
    try:
        from reasoning_forge.token_confidence import TokenConfidenceEngine
        from reasoning_forge.conflict_engine import ConflictEngine

        conflict_engine = ConflictEngine(
            token_confidence_engine=TokenConfidenceEngine()
        )
        print("βœ“ ConflictEngine initialized")

        # Deterministic-vs-probabilistic stances should trigger detection.
        agent_analyses = {
            "agent_a": "The algorithm must be deterministic for maximum control. "
                       "This ensures predictability and reliability in all cases.",
            "agent_b": "A probabilistic approach is superior because it captures the "
                       "inherent uncertainty in real-world systems. Determinism is rigid.",
        }
        conflicts = conflict_engine.detect_conflicts(agent_analyses)
        print("βœ“ Conflict detection completed")
        print(f" - Conflicts detected: {len(conflicts)}")

        if conflicts:
            top = conflicts[0]
            print("\n Top conflict:")
            print(f" - Type: {top.conflict_type}")
            print(f" - Strength: {top.conflict_strength:.3f}")
            print(f" - Agent A claim: {top.claim_a[:60]}...")
            print(f" - Agent B claim: {top.claim_b[:60]}...")
            print(f" - Overlap: {top.semantic_overlap:.3f}")
        return True
    except Exception as exc:
        print(f"\nβœ— CONFLICT ENGINE TEST FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
def test_forge_integration():
    """Verify ForgeEngine wires in Phase 1 conflict detection end to end.

    Checks that both engines are attached at construction time, then runs
    a one-round debate and reports Phase 1 metrics from its metadata.

    Returns:
        True on success; False on any exception (traceback printed).
    """
    banner = "=" * 80
    print("\n" + banner)
    print("PHASE 1 VALIDATION: FORGE ENGINE INTEGRATION")
    print(banner + "\n")
    try:
        from reasoning_forge.forge_engine import ForgeEngine

        print("Initializing ForgeEngine with conflict detection...")
        forge = ForgeEngine()
        print("βœ“ ForgeEngine initialized")

        # Phase 1 contract: both engines must exist as attributes.
        if not hasattr(forge, 'token_confidence'):
            raise AttributeError("ForgeEngine missing token_confidence engine")
        print("βœ“ TokenConfidenceEngine attached to ForgeEngine")
        if not hasattr(forge, 'conflict_engine'):
            raise AttributeError("ForgeEngine missing conflict_engine")
        print("βœ“ ConflictEngine attached to ForgeEngine")

        # A single debate round keeps this tolerable on CPU-only machines.
        print("\nTesting forge_with_debate() on a simple concept...")
        print(" (This may take a moment without GPU acceleration)")
        result = forge.forge_with_debate(
            "Should an algorithm prioritize speed or clarity?", debate_rounds=1
        )
        metadata = result.get("metadata", {})
        print("βœ“ forge_with_debate() completed successfully")

        # Report Phase 1 metrics; optional keys are only printed if present.
        print(f" - Conflicts detected (R0): {metadata.get('conflicts_round_0_count', 0)}")
        if "debate_log" in metadata:
            print(f" - Debate rounds logged: {len(metadata['debate_log'])}")
        if "ensemble_coherence" in metadata:
            print(f" - Ensemble coherence: {metadata['ensemble_coherence']:.3f}")
        return True
    except Exception as exc:
        print(f"\nβœ— FORGE INTEGRATION TEST FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run every Phase 1 validation test and print a summary.

    Returns:
        Process exit code: 0 when all tests pass, 1 otherwise (including
        user interruption via Ctrl-C).
    """
    banner = "=" * 80
    print("\n")
    print(banner)
    print("CODETTE PHASE 1: CONFLICT DETECTION - VALIDATION SUITE")
    print(banner)

    suite = [
        ("Imports", test_imports),
        ("Token Confidence Engine", test_token_confidence_engine),
        ("Conflict Engine", test_conflict_engine),
        ("Forge Integration", test_forge_integration),
    ]
    results = {}
    for name, run_test in suite:
        try:
            results[name] = run_test()
        except KeyboardInterrupt:
            # Abort immediately: do not run remaining tests.
            print("\n\nβœ— Tests interrupted by user")
            return 1
        except Exception as exc:
            # A crash in one test is recorded as a failure; later tests still run.
            print(f"\nβœ— Unexpected error in {name}: {exc}")
            results[name] = False

    print("\n" + banner)
    print("VALIDATION SUMMARY")
    print(banner + "\n")
    passed = sum(1 for ok in results.values() if ok)
    total = len(results)
    for name, ok in results.items():
        print(f" {'βœ“ PASS' if ok else 'βœ— FAIL'}: {name}")
    print(f"\n Total: {passed}/{total} tests passed\n")

    if passed == total:
        print("βœ“ All Phase 1 validations passed! Ready for testing.")
        return 0
    print(f"βœ— {total - passed} validation(s) failed. Check errors above.")
    return 1


if __name__ == "__main__":
    sys.exit(main())