"""
Phase 6 End-to-End Integration Tests
Tests all Phase 6 components working together:
1. Semantic tension engine (embedding-based opposition)
2. Specialization tracker (domain expertise)
3. Pre-flight conflict predictor (Spiderweb-based)
4. Benchmarking suite
5. Full integration in ForgeEngine debate loop
Run with: pytest test_phase6_e2e.py -v
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import pytest
import json
import numpy as np
from reasoning_forge.framework_definitions import (
StateVector,
TensionDefinition,
CoherenceMetrics,
ConflictPrediction,
SpecializationScore,
)
from reasoning_forge.semantic_tension import SemanticTensionEngine
from reasoning_forge.specialization_tracker import SpecializationTracker
from reasoning_forge.preflight_predictor import PreFlightConflictPredictor
class TestPhase6Framework:
    """Tests for the Phase 6 mathematical framework definitions."""

    def test_state_vector_creation(self):
        """StateVector exposes its fields, a 5-element array, and a dict view."""
        vec = StateVector(psi=0.8, tau=0.2, chi=0.5, phi=0.3, lam=0.6)
        assert vec.psi == 0.8
        assert len(vec.to_array()) == 5
        as_dict = vec.to_dict()
        assert "psi" in as_dict
        assert round(as_dict["psi"], 3) == 0.8

    def test_state_vector_distance(self):
        """Euclidean distance in the 5D state space matches a 3-4-5 triangle."""
        origin = StateVector(psi=0.0, tau=0.0, chi=0.0, phi=0.0, lam=0.0)
        corner = StateVector(psi=3.0, tau=4.0, chi=0.0, phi=0.0, lam=0.0)
        distance = StateVector.distance(origin, corner)
        # sqrt(9 + 16) = 5
        assert abs(distance - 5.0) < 0.1, f"Expected ~5, got {distance}"

    def test_tension_definition(self):
        """TensionDefinition carries its fields through to the dict form."""
        tension = TensionDefinition(
            structural_xi=1.2,
            semantic_xi=0.5,
            combined_xi=0.9,
            opposition_type="framework",
            weight_structural=0.4,
            weight_semantic=0.6,
        )
        assert tension.combined_xi == 0.9
        assert tension.to_dict()["opposition_type"] == "framework"

    def test_coherence_metrics_gamma_computation(self):
        """Healthy inputs yield a Gamma near 0.8 and a 'healthy' status."""
        gamma, status = CoherenceMetrics.compute_gamma(
            perspective_diversity=0.9,
            tension_health=0.8,
            adapter_weight_variance=0.2,
            resolution_rate=0.7,
        )
        # Expected roughly (0.25*0.9 + 0.25*0.8 + 0.25*0.8 + 0.25*0.7) = ~0.8
        assert 0.75 < gamma < 0.85
        assert status == "healthy"

    def test_coherence_metrics_collapse_detection(self):
        """Uniformly poor inputs push Gamma under 0.4 and flag collapse."""
        gamma, status = CoherenceMetrics.compute_gamma(
            perspective_diversity=0.1,
            tension_health=0.2,
            adapter_weight_variance=0.9,
            resolution_rate=0.1,
        )
        assert gamma < 0.4
        assert status == "collapsing"

    def test_coherence_metrics_groupthink_detection(self):
        """Uniformly high inputs push Gamma over 0.8 and flag groupthink."""
        gamma, status = CoherenceMetrics.compute_gamma(
            perspective_diversity=0.95,
            tension_health=0.95,
            adapter_weight_variance=0.05,
            resolution_rate=0.95,
        )
        assert gamma > 0.8
        assert status == "groupthinking"
class TestSemanticTension:
    """Tests for the embedding-based semantic tension engine."""

    def test_semantic_tension_initialization(self):
        """Engine constructs without a backing model and exposes embedding_dim."""
        engine = SemanticTensionEngine(llama_model=None)
        assert engine is not None
        assert engine.embedding_dim == 4096

    def test_semantic_tension_identical_claims(self):
        """Comparing a claim against itself yields near-zero tension."""
        engine = SemanticTensionEngine(llama_model=None)
        text = "The sky is blue"
        tension = engine.compute_semantic_tension(text, text)
        # Identical embeddings: cosine similarity ~1, hence tension ~0
        assert 0.0 <= tension <= 0.1, f"Identical claims should have low tension, got {tension}"

    def test_semantic_tension_different_claims(self):
        """Unrelated claims produce positive tension bounded by 1."""
        engine = SemanticTensionEngine(llama_model=None)
        tension = engine.compute_semantic_tension("The sky is blue", "The ocean is red")
        # Distinct claims embed apart, so tension must exceed zero
        assert tension > 0.0, f"Different claims should have positive tension, got {tension}"
        assert tension <= 1.0

    def test_polarity_classification(self):
        """Near-agreeing claims classify into one of the known polarity labels."""
        engine = SemanticTensionEngine(llama_model=None)
        polarity = engine.compute_polarity("I agree with this", "I also agree with this")
        # Similar statements land in paraphrase/framework rather than contradiction
        assert polarity in ["paraphrase", "framework", "contradiction"]

    def test_embedding_cache(self):
        """Repeated embedding of one claim populates and then reuses the cache."""
        engine = SemanticTensionEngine(llama_model=None)
        text = "Test claim"
        first = engine.embed_claim(text, use_cache=True)   # cache miss: populates
        assert engine.get_cache_stats()["cached_embeddings"] >= 1
        second = engine.embed_claim(text, use_cache=True)  # cache hit: same content
        assert np.array_equal(first, second)
class TestSpecializationTracker:
    """Tests for per-adapter domain-expertise tracking."""

    def test_specialization_initialization(self):
        """Tracker constructs and ships a non-empty domain keyword table."""
        tracker = SpecializationTracker()
        assert tracker is not None
        assert len(tracker.DOMAIN_KEYWORDS) > 0

    def test_query_domain_classification(self):
        """Keyword matching maps queries onto domains, else 'general'."""
        tracker = SpecializationTracker()
        # A physics-flavored query
        assert "physics" in tracker.classify_query_domain("What is the force of gravity?")
        # An ethics-flavored query
        assert "ethics" in tracker.classify_query_domain("Is it right to do this?")
        # Nothing matches: falls back to "general"
        assert "general" in tracker.classify_query_domain("Hello world")

    def test_adapter_performance_recording(self):
        """Recorded scores roll up into a per-domain specialization value."""
        tracker = SpecializationTracker()
        for question, score in [("What is force?", 0.85), ("What is acceleration?", 0.90)]:
            tracker.record_adapter_performance("newton", question, score)
        spec = tracker.compute_specialization("newton")
        assert "physics" in spec
        # specialization = mean(0.85, 0.90) / usage(2) = 0.875 / 2 = 0.4375
        assert 0.4 <= spec["physics"] <= 0.5

    def test_semantic_convergence_detection(self):
        """Dissimilar adapter outputs should not register as convergent."""
        tracker = SpecializationTracker()
        outputs = {
            "newton": "The answer is clearly related to physics and forces.",
            "empathy": "The answer is clearly related to feelings and emotions.",
        }
        report = tracker.detect_semantic_convergence(outputs)
        assert "convergent_pairs" in report
        # Distinct topics keep similarity below the convergence band
        assert report["max_similarity"] < 0.7

    def test_adapter_health(self):
        """Health report echoes the adapter name, accuracy, and a recommendation."""
        tracker = SpecializationTracker()
        tracker.record_adapter_performance("newton", "physics query 1", 0.9)
        tracker.record_adapter_performance("newton", "physics query 2", 0.85)
        report = tracker.get_adapter_health("newton")
        assert report["adapter"] == "newton"
        assert report["avg_accuracy"] > 0.8
        assert "recommendation" in report
class TestPreFlightPredictor:
    """Tests for Spiderweb-based pre-flight conflict prediction."""

    def test_predictor_initialization(self):
        """Predictor constructs even with no spiderweb backend."""
        predictor = PreFlightConflictPredictor(spiderweb=None)
        assert predictor is not None

    def test_query_encoding(self):
        """Queries encode to bounded 5D StateVectors; ethics raises valence."""
        predictor = PreFlightConflictPredictor(spiderweb=None)
        # A plain factual query stays inside the documented bounds
        plain = predictor.encode_query_to_state("What is force?")
        assert isinstance(plain, StateVector)
        assert 0 <= plain.psi <= 1
        assert 0 <= plain.tau <= 1
        assert -1 <= plain.phi <= 1
        # An ethically loaded query should carry non-zero valence
        state_eth = predictor.encode_query_to_state(
            "Should we use AI ethically in society?"
        )
        assert state_eth.phi > 0.0, "Ethical query should have emotional valence"

    def test_empty_prediction_fallback(self):
        """Without a spiderweb the fallback yields a zero-confidence prediction."""
        predictor = PreFlightConflictPredictor(spiderweb=None)
        midpoint = StateVector(psi=0.5, tau=0.5, chi=0.5, phi=0.5, lam=0.5)
        fallback = predictor._empty_prediction(midpoint)
        assert isinstance(fallback, ConflictPrediction)
        assert fallback.preflight_confidence == 0.0
class TestPhase6Integration:
    """Cross-component integration tests for Phase 6."""

    def test_framework_definitions_export(self):
        """StateVector dicts must round-trip through JSON unchanged."""
        vec = StateVector(psi=0.7, tau=0.3, chi=0.5, phi=0.4, lam=0.6)
        parsed = json.loads(json.dumps(vec.to_dict()))  # serialize + parse round-trip
        assert parsed["psi"] == round(0.7, 3)

    def test_semantic_tension_explain(self):
        """explain_tension exposes the tension value, similarity, and polarity."""
        engine = SemanticTensionEngine(llama_model=None)
        report = engine.explain_tension("Claim A", "Claim B")
        for key in ("semantic_tension", "similarity", "polarity_type"):
            assert key in report

    def test_specialization_system_health(self):
        """System-level health aggregates every tracked adapter."""
        tracker = SpecializationTracker()
        tracker.record_adapter_performance("newton", "Force query", 0.9)
        tracker.record_adapter_performance("empathy", "Emotion query", 0.85)
        health = tracker.get_system_health()
        assert "total_adapters" in health
        assert "health_by_adapter" in health
        assert health["total_adapters"] == 2
class TestPhase6Benchmarks:
    """Smoke tests for the benchmarking suite (no full ForgeEngine needed)."""

    def test_benchmark_framework_instantiation(self):
        """Phase6Benchmarks constructs with a null engine and seeds its results."""
        from evaluation.phase6_benchmarks import Phase6Benchmarks

        suite = Phase6Benchmarks(forge_engine=None)
        assert suite is not None
        assert "multi_round_convergence" in suite.results

    def test_benchmark_summary_generation(self):
        """summary() renders a report containing the expected headings."""
        from evaluation.phase6_benchmarks import Phase6Benchmarks

        text = Phase6Benchmarks(forge_engine=None).summary()
        assert "PHASE 6 BENCHMARK SUMMARY" in text
        assert "MULTI-ROUND" in text or "MEMORY" in text
# ===================================================================
# Integration Test: All Components Together (MockForgeEngine)
# ===================================================================
class MockForgeEngine:
    """Stand-in for ForgeEngine so Phase 6 wiring can be tested in isolation."""

    def __init__(self):
        # Real Phase 6 components wired in without an LLM backend.
        self.semantic_tension_engine = SemanticTensionEngine(llama_model=None)
        self.specialization = SpecializationTracker()
        # Anonymous stub conflict engine: always reports a mild framework clash.
        stub_methods = {"_classify_conflict": lambda _self, a, b, o: ("framework", 0.5)}
        stub_cls = type("obj", (object,), stub_methods)
        self.conflict_engine = stub_cls()

    def forge_with_debate(self, query, use_memory_weights=False, num_rounds=2):
        """Return a canned debate result regardless of the inputs."""
        metadata = {
            "coherence": 0.75,
            "resolution_rate": 0.8,
        }
        return {"synthesis": "Mock synthesis", "metadata": metadata}
@pytest.mark.integration
class TestPhase6EndToEnd:
    """End-to-end Phase 6 tests."""

    def test_full_phase6_pipeline(self):
        """Walk every Phase 6 component once against a mocked engine."""
        forge = MockForgeEngine()
        # 1) Semantic tension stays within [0, 1]
        tension = forge.semantic_tension_engine.compute_semantic_tension(
            "This is true", "This is false"
        )
        assert 0 <= tension <= 1
        # 2) Specialization tracking registers the adapter globally
        forge.specialization.record_adapter_performance("test_adapter", "physics query", 0.9)
        assert "test_adapter" in forge.specialization.get_global_specialization()
        # 3) Pre-flight encoding still yields a valid state vector
        predictor = PreFlightConflictPredictor(spiderweb=None)
        state = predictor.encode_query_to_state("Test query")
        assert state.psi >= 0

    def test_phase6_with_benchmarks(self):
        """Run the specialization benchmark against the mock engine."""
        from evaluation.phase6_benchmarks import Phase6Benchmarks

        suite = Phase6Benchmarks(forge_engine=MockForgeEngine())
        # The specialization benchmark needs no ForgeEngine round-trips
        result = suite.benchmark_specialization()
        assert "adapters_tracked" in result
# Allow running this module directly (outside a pytest invocation):
# runs only this file's tests in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|