Codette-Reasoning / tests / test_phase6_e2e.py

Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 about 21 hours ago

13.9 kB

	"""
	Phase 6 End-to-End Integration Tests

	Tests all Phase 6 components working together:
	1. Semantic tension engine (embedding-based opposition)
	2. Specialization tracker (domain expertise)
	3. Pre-flight conflict predictor (Spiderweb-based)
	4. Benchmarking suite
	5. Full integration in ForgeEngine debate loop

	Run with: pytest test_phase6_e2e.py -v
	"""

	import sys
	from pathlib import Path
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	import pytest
	import json
	import numpy as np
	from reasoning_forge.framework_definitions import (
	StateVector,
	TensionDefinition,
	CoherenceMetrics,
	ConflictPrediction,
	SpecializationScore,
	)
	from reasoning_forge.semantic_tension import SemanticTensionEngine
	from reasoning_forge.specialization_tracker import SpecializationTracker
	from reasoning_forge.preflight_predictor import PreFlightConflictPredictor


	class TestPhase6Framework:
	    """Checks for the Phase 6 framework types: StateVector, TensionDefinition, CoherenceMetrics."""

	    def test_state_vector_creation(self):
	        """A StateVector keeps its fields and offers array and dict views."""
	        vector = StateVector(psi=0.8, tau=0.2, chi=0.5, phi=0.3, lam=0.6)
	        assert vector.psi == 0.8
	        assert len(vector.to_array()) == 5

	        as_dict = vector.to_dict()
	        assert "psi" in as_dict
	        # Rounded before comparing, so a rounded or unrounded export both pass.
	        assert round(as_dict["psi"], 3) == 0.8

	    def test_state_vector_distance(self):
	        """Distance between the origin and (3, 4, 0, 0, 0) is about 5."""
	        origin = StateVector(psi=0.0, tau=0.0, chi=0.0, phi=0.0, lam=0.0)
	        offset = StateVector(psi=3.0, tau=4.0, chi=0.0, phi=0.0, lam=0.0)

	        distance = StateVector.distance(origin, offset)

	        # 3-4-5 triangle in the psi/tau plane: sqrt(9 + 16) = 5.
	        assert abs(distance - 5.0) < 0.1, f"Expected ~5, got {distance}"

	    def test_tension_definition(self):
	        """TensionDefinition stores its fields and exports them through to_dict()."""
	        fields = {
	            "structural_xi": 1.2,
	            "semantic_xi": 0.5,
	            "combined_xi": 0.9,
	            "opposition_type": "framework",
	            "weight_structural": 0.4,
	            "weight_semantic": 0.6,
	        }
	        tension = TensionDefinition(**fields)
	        assert tension.combined_xi == 0.9

	        exported = tension.to_dict()
	        assert exported["opposition_type"] == "framework"

	    def test_coherence_metrics_gamma_computation(self):
	        """Mid-range inputs give a Gamma near 0.8 with status "healthy"."""
	        inputs = {
	            "perspective_diversity": 0.9,
	            "tension_health": 0.8,
	            "adapter_weight_variance": 0.2,
	            "resolution_rate": 0.7,
	        }
	        gamma, status = CoherenceMetrics.compute_gamma(**inputs)

	        # Expected: 0.25*0.9 + 0.25*0.8 + 0.25*0.8 + 0.25*0.7 = 0.8
	        # (presumably the third term is 1 - adapter_weight_variance; confirm
	        # against compute_gamma).
	        # NOTE(review): 0.8 is also the groupthink threshold quoted in
	        # test_coherence_metrics_groupthink_detection; confirm that a value on
	        # the boundary is reported as "healthy".
	        assert 0.75 < gamma < 0.85
	        assert status == "healthy"

	    def test_coherence_metrics_collapse_detection(self):
	        """Weak inputs push Gamma below 0.4 and yield status "collapsing"."""
	        inputs = {
	            "perspective_diversity": 0.1,
	            "tension_health": 0.2,
	            "adapter_weight_variance": 0.9,
	            "resolution_rate": 0.1,
	        }
	        gamma, status = CoherenceMetrics.compute_gamma(**inputs)

	        assert gamma < 0.4
	        assert status == "collapsing"

	    def test_coherence_metrics_groupthink_detection(self):
	        """Near-perfect inputs push Gamma above 0.8 and yield status "groupthinking"."""
	        inputs = {
	            "perspective_diversity": 0.95,
	            "tension_health": 0.95,
	            "adapter_weight_variance": 0.05,
	            "resolution_rate": 0.95,
	        }
	        gamma, status = CoherenceMetrics.compute_gamma(**inputs)

	        assert gamma > 0.8
	        assert status == "groupthinking"


	class TestSemanticTension:
	    """Checks for SemanticTensionEngine when it is built with ``llama_model=None``."""

	    @staticmethod
	    def _engine():
	        """Build the engine exactly as every test here needs it (no model attached)."""
	        return SemanticTensionEngine(llama_model=None)

	    def test_semantic_tension_initialization(self):
	        """The engine can be constructed and reports a 4096-wide embedding."""
	        engine = self._engine()
	        assert engine is not None
	        assert engine.embedding_dim == 4096

	    def test_semantic_tension_identical_claims(self):
	        """Comparing a claim with itself yields a tension in [0.0, 0.1]."""
	        engine = self._engine()
	        text = "The sky is blue"

	        tension = engine.compute_semantic_tension(text, text)

	        assert 0.0 <= tension <= 0.1, f"Identical claims should have low tension, got {tension}"

	    def test_semantic_tension_different_claims(self):
	        """Two different claims yield a tension in (0.0, 1.0]."""
	        engine = self._engine()
	        first, second = "The sky is blue", "The ocean is red"

	        tension = engine.compute_semantic_tension(first, second)

	        # Only strict positivity and the upper bound are checked here.
	        assert tension > 0.0, f"Different claims should have positive tension, got {tension}"
	        assert tension <= 1.0

	    def test_polarity_classification(self):
	        """compute_polarity returns one of the three known polarity labels."""
	        engine = self._engine()

	        polarity = engine.compute_polarity("I agree with this", "I also agree with this")

	        # NOTE(review): the claims are near-paraphrases, but the assertion
	        # only checks the label vocabulary; "contradiction" is still accepted.
	        known_labels = ("paraphrase", "framework", "contradiction")
	        assert polarity in known_labels

	    def test_embedding_cache(self):
	        """Embedding the same claim twice with caching on gives equal arrays."""
	        engine = self._engine()
	        text = "Test claim"

	        # First request: expected to populate the cache.
	        first = engine.embed_claim(text, use_cache=True)
	        cache_stats = engine.get_cache_stats()
	        assert cache_stats["cached_embeddings"] >= 1

	        # Second request for the same claim. Only value equality is verified,
	        # not that the very same array object is handed back.
	        second = engine.embed_claim(text, use_cache=True)
	        assert np.array_equal(first, second)


	class TestSpecializationTracker:
	    """Checks for SpecializationTracker: classification, scoring, convergence, health."""

	    def test_specialization_initialization(self):
	        """A fresh tracker exists and ships a non-empty DOMAIN_KEYWORDS table."""
	        tracker = SpecializationTracker()
	        assert tracker is not None
	        assert len(tracker.DOMAIN_KEYWORDS) > 0

	    def test_query_domain_classification(self):
	        """Queries map to the expected domain, with "general" when nothing matches."""
	        tracker = SpecializationTracker()
	        expectations = (
	            ("What is the force of gravity?", "physics"),
	            ("Is it right to do this?", "ethics"),
	            ("Hello world", "general"),  # no domain keyword expected to match
	        )
	        for query, expected_domain in expectations:
	            domains = tracker.classify_query_domain(query)
	            assert expected_domain in domains

	    def test_adapter_performance_recording(self):
	        """Recorded scores show up as a physics specialization for the adapter."""
	        tracker = SpecializationTracker()
	        samples = (
	            ("What is force?", 0.85),
	            ("What is acceleration?", 0.90),
	        )
	        for query, score in samples:
	            tracker.record_adapter_performance("newton", query, score)

	        specialization = tracker.compute_specialization("newton")
	        assert "physics" in specialization
	        # Per the original author's note: mean(0.85, 0.90) / usage(2)
	        # = 0.875 / 2 = 0.4375 (formula lives in compute_specialization; verify there).
	        assert 0.4 <= specialization["physics"] <= 0.5

	    def test_semantic_convergence_detection(self):
	        """Two differently-worded outputs stay below the 0.7 similarity mark."""
	        tracker = SpecializationTracker()
	        adapter_outputs = {
	            "newton": "The answer is clearly related to physics and forces.",
	            "empathy": "The answer is clearly related to feelings and emotions.",
	        }

	        report = tracker.detect_semantic_convergence(adapter_outputs)

	        assert "convergent_pairs" in report
	        # The two outputs differ in topic, so similarity is expected to be low.
	        assert report["max_similarity"] < 0.7

	    def test_adapter_health(self):
	        """The health report names the adapter, averages accuracy and recommends."""
	        tracker = SpecializationTracker()
	        samples = (
	            ("physics query 1", 0.9),
	            ("physics query 2", 0.85),
	        )
	        for query, score in samples:
	            tracker.record_adapter_performance("newton", query, score)

	        report = tracker.get_adapter_health("newton")
	        assert report["adapter"] == "newton"
	        assert report["avg_accuracy"] > 0.8
	        assert "recommendation" in report


	class TestPreFlightPredictor:
	    """Checks for PreFlightConflictPredictor when no spiderweb is supplied."""

	    @staticmethod
	    def _predictor():
	        """Build a predictor with ``spiderweb=None``, as every test here does."""
	        return PreFlightConflictPredictor(spiderweb=None)

	    def test_predictor_initialization(self):
	        """The predictor can be constructed without a spiderweb."""
	        assert self._predictor() is not None

	    def test_query_encoding(self):
	        """Queries encode to a StateVector whose checked fields stay in range."""
	        predictor = self._predictor()

	        # Plain query: type plus the psi, tau and phi bounds are checked.
	        plain = predictor.encode_query_to_state("What is force?")
	        assert isinstance(plain, StateVector)
	        assert 0 <= plain.psi <= 1
	        assert 0 <= plain.tau <= 1
	        assert -1 <= plain.phi <= 1

	        # Ethics-flavoured query: phi is expected to be strictly positive.
	        ethical = predictor.encode_query_to_state("Should we use AI ethically in society?")
	        assert ethical.phi > 0.0, "Ethical query should have emotional valence"

	    def test_empty_prediction_fallback(self):
	        """_empty_prediction returns a ConflictPrediction with zero confidence."""
	        predictor = self._predictor()
	        midpoint = StateVector(psi=0.5, tau=0.5, chi=0.5, phi=0.5, lam=0.5)

	        fallback = predictor._empty_prediction(midpoint)

	        assert isinstance(fallback, ConflictPrediction)
	        assert fallback.preflight_confidence == 0.0


	class TestPhase6Integration:
	    """Cross-component checks: JSON export, tension explanation, system health."""

	    def test_framework_definitions_export(self):
	        """StateVector.to_dict() survives a JSON round trip."""
	        vector = StateVector(psi=0.7, tau=0.3, chi=0.5, phi=0.4, lam=0.6)
	        payload = vector.to_dict()

	        # json.dumps raises if the dict holds anything non-serialisable.
	        round_tripped = json.loads(json.dumps(payload))

	        assert round_tripped["psi"] == round(0.7, 3)

	    def test_semantic_tension_explain(self):
	        """explain_tension reports tension, similarity and polarity type."""
	        engine = SemanticTensionEngine(llama_model=None)

	        explanation = engine.explain_tension("Claim A", "Claim B")

	        for required_key in ("semantic_tension", "similarity", "polarity_type"):
	            assert required_key in explanation

	    def test_specialization_system_health(self):
	        """System health counts each adapter that has recorded performance."""
	        tracker = SpecializationTracker()
	        recordings = (
	            ("newton", "Force query", 0.9),
	            ("empathy", "Emotion query", 0.85),
	        )
	        for adapter, query, score in recordings:
	            tracker.record_adapter_performance(adapter, query, score)

	        overview = tracker.get_system_health()
	        assert "total_adapters" in overview
	        assert "health_by_adapter" in overview
	        assert overview["total_adapters"] == 2


	class TestPhase6Benchmarks:
	    """Checks for Phase6Benchmarks when no ForgeEngine is attached."""

	    @staticmethod
	    def _benchmarks():
	        """Import Phase6Benchmarks on demand and build it with ``forge_engine=None``."""
	        # Imported inside the helper, as the original tests did inside each
	        # test body, so the import only runs when a benchmark test runs.
	        from evaluation.phase6_benchmarks import Phase6Benchmarks

	        return Phase6Benchmarks(forge_engine=None)

	    def test_benchmark_framework_instantiation(self):
	        """A new suite exists and pre-registers the multi-round convergence result."""
	        suite = self._benchmarks()
	        assert suite is not None
	        assert "multi_round_convergence" in suite.results

	    def test_benchmark_summary_generation(self):
	        """summary() carries the report title and at least one section tag."""
	        report = self._benchmarks().summary()
	        assert "PHASE 6 BENCHMARK SUMMARY" in report
	        assert any(tag in report for tag in ("MULTI-ROUND", "MEMORY"))


	# ===================================================================
	# Integration Test: All Components Together (MockForgeEngine)
	# ===================================================================


	class MockForgeEngine:
	    """Lightweight stand-in for ForgeEngine used by the Phase 6 integration tests.

	    Attributes set by ``__init__``:
	        semantic_tension_engine: ``SemanticTensionEngine(llama_model=None)``.
	        specialization: a new ``SpecializationTracker``.
	        conflict_engine: stub whose ``_classify_conflict`` always returns
	            ``("framework", 0.5)``.
	    """

	    class _StubConflictEngine:
	        """Conflict-engine stub that classifies every pair the same way."""

	        def _classify_conflict(self, a, b, o):
	            """Ignore all three arguments and return ``("framework", 0.5)``."""
	            return ("framework", 0.5)

	    def __init__(self):
	        """Create the tension engine, the tracker and the stub conflict engine."""
	        self.semantic_tension_engine = SemanticTensionEngine(llama_model=None)
	        self.specialization = SpecializationTracker()
	        # A named stub class replaces the former anonymous type(...)/lambda
	        # object; the method name, arguments and return value are unchanged.
	        self.conflict_engine = self._StubConflictEngine()

	    def forge_with_debate(self, query, use_memory_weights=False, num_rounds=2):
	        """Return a canned debate result.

	        Args:
	            query: accepted for signature compatibility; ignored.
	            use_memory_weights: accepted for signature compatibility; ignored.
	            num_rounds: accepted for signature compatibility; ignored.

	        Returns:
	            A new dict on every call with a fixed ``"synthesis"`` string and a
	            ``"metadata"`` dict holding ``"coherence"`` and ``"resolution_rate"``.
	        """
	        return {
	            "synthesis": "Mock synthesis",
	            "metadata": {
	                "coherence": 0.75,
	                "resolution_rate": 0.8,
	            },
	        }


	@pytest.mark.integration
	class TestPhase6EndToEnd:
	    """End-to-end Phase 6 checks driven through MockForgeEngine."""

	    def test_full_phase6_pipeline(self):
	        """Run tension, specialization and pre-flight encoding one after another."""
	        forge = MockForgeEngine()

	        # Step 1: semantic tension stays inside [0, 1].
	        tension_engine = forge.semantic_tension_engine
	        tension = tension_engine.compute_semantic_tension("This is true", "This is false")
	        assert 0 <= tension <= 1

	        # Step 2: a recorded adapter appears in the global specialization map.
	        tracker = forge.specialization
	        tracker.record_adapter_performance("test_adapter", "physics query", 0.9)
	        global_specs = tracker.get_global_specialization()
	        assert "test_adapter" in global_specs

	        # Step 3: pre-flight encoding works without a spiderweb.
	        predictor = PreFlightConflictPredictor(spiderweb=None)
	        encoded = predictor.encode_query_to_state("Test query")
	        assert encoded.psi >= 0

	    def test_phase6_with_benchmarks(self):
	        """The specialization benchmark runs against the mock engine."""
	        from evaluation.phase6_benchmarks import Phase6Benchmarks

	        suite = Phase6Benchmarks(forge_engine=MockForgeEngine())

	        # Per the original note, this benchmark needs no ForgeEngine calls.
	        outcome = suite.benchmark_specialization()
	        assert "adapters_tracked" in outcome


	if __name__ == "__main__":
	    # pytest.main returns the run's exit code. Hand it to sys.exit so that
	    # running this file as a script reports failures to the shell or CI
	    # instead of always exiting with status 0.
	    sys.exit(pytest.main([__file__, "-v"]))