File size: 13,882 Bytes
74f2af5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
"""
Phase 6 End-to-End Integration Tests

Tests all Phase 6 components working together:
1. Semantic tension engine (embedding-based opposition)
2. Specialization tracker (domain expertise)
3. Pre-flight conflict predictor (Spiderweb-based)
4. Benchmarking suite
5. Integration of the above through a MockForgeEngine stand-in (the real ForgeEngine debate loop is not exercised here)

Run with: pytest test_phase6_e2e.py -v
"""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import pytest
import json
import numpy as np
from reasoning_forge.framework_definitions import (
    StateVector,
    TensionDefinition,
    CoherenceMetrics,
    ConflictPrediction,
    SpecializationScore,
)
from reasoning_forge.semantic_tension import SemanticTensionEngine
from reasoning_forge.specialization_tracker import SpecializationTracker
from reasoning_forge.preflight_predictor import PreFlightConflictPredictor


class TestPhase6Framework:
    """Checks for the Phase 6 framework dataclasses and coherence scoring."""

    def test_state_vector_creation(self):
        """A StateVector keeps its fields and exports to an array and a dict."""
        components = {"psi": 0.8, "tau": 0.2, "chi": 0.5, "phi": 0.3, "lam": 0.6}
        vector = StateVector(**components)
        assert vector.psi == 0.8

        as_array = vector.to_array()
        assert len(as_array) == 5

        exported = vector.to_dict()
        assert "psi" in exported
        # Rounded before comparing, so the exported value may carry extra digits.
        assert round(exported["psi"], 3) == 0.8

    def test_state_vector_distance(self):
        """States offset by (3, 4) along psi/tau should be about 5 apart."""
        origin = StateVector(psi=0.0, tau=0.0, chi=0.0, phi=0.0, lam=0.0)
        offset = StateVector(psi=3.0, tau=4.0, chi=0.0, phi=0.0, lam=0.0)

        distance = StateVector.distance(origin, offset)

        # 3-4-5 right triangle: sqrt(9 + 16) == 5.
        deviation = abs(distance - 5.0)
        assert deviation < 0.1, f"Expected ~5, got {distance}"

    def test_tension_definition(self):
        """A TensionDefinition keeps combined_xi and exports opposition_type."""
        fields = {
            "structural_xi": 1.2,
            "semantic_xi": 0.5,
            "combined_xi": 0.9,
            "opposition_type": "framework",
            "weight_structural": 0.4,
            "weight_semantic": 0.6,
        }
        tension = TensionDefinition(**fields)
        assert tension.combined_xi == 0.9

        tension_dict = tension.to_dict()
        assert tension_dict["opposition_type"] == "framework"

    def test_coherence_metrics_gamma_computation(self):
        """Balanced inputs should score near 0.8 and be reported as healthy."""
        inputs = {
            "perspective_diversity": 0.9,
            "tension_health": 0.8,
            "adapter_weight_variance": 0.2,
            "resolution_rate": 0.7,
        }
        gamma, status = CoherenceMetrics.compute_gamma(**inputs)
        # Presumably an equal-weight average with the variance term inverted
        # (1 - 0.2 = 0.8), i.e. 0.25 * (0.9 + 0.8 + 0.8 + 0.7) = ~0.8 --
        # TODO confirm against compute_gamma.
        assert 0.75 < gamma < 0.85
        assert status == "healthy"

    def test_coherence_metrics_collapse_detection(self):
        """Weak inputs should push gamma below 0.4 with status 'collapsing'."""
        inputs = {
            "perspective_diversity": 0.1,
            "tension_health": 0.2,
            "adapter_weight_variance": 0.9,
            "resolution_rate": 0.1,
        }
        gamma, status = CoherenceMetrics.compute_gamma(**inputs)
        assert gamma < 0.4
        assert status == "collapsing"

    def test_coherence_metrics_groupthink_detection(self):
        """Near-perfect inputs should exceed 0.8 with status 'groupthinking'."""
        inputs = {
            "perspective_diversity": 0.95,
            "tension_health": 0.95,
            "adapter_weight_variance": 0.05,
            "resolution_rate": 0.95,
        }
        gamma, status = CoherenceMetrics.compute_gamma(**inputs)
        assert gamma > 0.8
        assert status == "groupthinking"


class TestSemanticTension:
    """Exercise SemanticTensionEngine constructed without a llama model."""

    @staticmethod
    def _new_engine():
        """Build a fresh engine the same way every test here does."""
        return SemanticTensionEngine(llama_model=None)

    def test_semantic_tension_initialization(self):
        """The engine constructs and reports a 4096-wide embedding."""
        tension_engine = self._new_engine()
        assert tension_engine is not None
        assert tension_engine.embedding_dim == 4096

    def test_semantic_tension_identical_claims(self):
        """Comparing a claim with itself should give a tension in [0, 0.1]."""
        claim = "The sky is blue"
        tension = self._new_engine().compute_semantic_tension(claim, claim)
        # Same text on both sides, so the score is expected to sit near zero.
        assert 0.0 <= tension <= 0.1, f"Identical claims should have low tension, got {tension}"

    def test_semantic_tension_different_claims(self):
        """Two unrelated claims should give a tension in (0, 1]."""
        tension_engine = self._new_engine()
        tension = tension_engine.compute_semantic_tension(
            "The sky is blue",
            "The ocean is red",
        )
        assert tension > 0.0, f"Different claims should have positive tension, got {tension}"
        assert tension <= 1.0

    def test_polarity_classification(self):
        """compute_polarity should answer with one of the known labels."""
        first_claim = "I agree with this"
        second_claim = "I also agree with this"
        polarity = self._new_engine().compute_polarity(first_claim, second_claim)
        # Only membership in the label set is checked; the test does not pin
        # which label two similar claims receive.
        known_labels = ["paraphrase", "framework", "contradiction"]
        assert polarity in known_labels

    def test_embedding_cache(self):
        """Embedding the same claim twice yields equal arrays and fills the cache."""
        tension_engine = self._new_engine()
        claim = "Test claim"

        # First request for this claim on a fresh engine.
        first_embedding = tension_engine.embed_claim(claim, use_cache=True)

        # The stats should now report at least one stored embedding.
        cache_stats = tension_engine.get_cache_stats()
        assert cache_stats["cached_embeddings"] >= 1

        # Repeat request; compared by value, not by object identity.
        second_embedding = tension_engine.embed_claim(claim, use_cache=True)
        assert np.array_equal(first_embedding, second_embedding)


class TestSpecializationTracker:
    """Checks for SpecializationTracker's recording and reporting calls."""

    def test_specialization_initialization(self):
        """A new tracker exists and exposes a non-empty DOMAIN_KEYWORDS."""
        fresh_tracker = SpecializationTracker()
        assert fresh_tracker is not None
        assert len(fresh_tracker.DOMAIN_KEYWORDS) > 0

    def test_query_domain_classification(self):
        """Each sample query should be tagged with its expected domain."""
        tracker = SpecializationTracker()
        # (query, domain expected among the returned domains); the last entry
        # is the case where no specific domain matches.
        expectations = (
            ("What is the force of gravity?", "physics"),
            ("Is it right to do this?", "ethics"),
            ("Hello world", "general"),
        )
        for query, expected_domain in expectations:
            domains = tracker.classify_query_domain(query)
            assert expected_domain in domains

    def test_adapter_performance_recording(self):
        """Two recorded physics queries yield a physics score in [0.4, 0.5]."""
        tracker = SpecializationTracker()
        samples = (
            ("What is force?", 0.85),
            ("What is acceleration?", 0.90),
        )
        for query, score in samples:
            tracker.record_adapter_performance("newton", query, score)

        specialization = tracker.compute_specialization("newton")
        assert "physics" in specialization
        # Author's expectation: mean(0.85, 0.90) / usage(2) = 0.4375; the
        # formula itself lives in SpecializationTracker -- TODO confirm.
        physics_score = specialization["physics"]
        assert 0.4 <= physics_score <= 0.5

    def test_semantic_convergence_detection(self):
        """Two differently-themed outputs should report similarity below 0.7."""
        adapter_outputs = {
            "newton": "The answer is clearly related to physics and forces.",
            "empathy": "The answer is clearly related to feelings and emotions.",
        }
        report = SpecializationTracker().detect_semantic_convergence(adapter_outputs)
        assert "convergent_pairs" in report
        # The two outputs diverge in topic, so similarity is expected to stay low.
        assert report["max_similarity"] < 0.7

    def test_adapter_health(self):
        """The health report names the adapter, its accuracy and a recommendation."""
        tracker = SpecializationTracker()
        samples = (
            ("physics query 1", 0.9),
            ("physics query 2", 0.85),
        )
        for query, score in samples:
            tracker.record_adapter_performance("newton", query, score)

        report = tracker.get_adapter_health("newton")
        assert report["adapter"] == "newton"
        assert report["avg_accuracy"] > 0.8
        assert "recommendation" in report


class TestPreFlightPredictor:
    """Checks for PreFlightConflictPredictor built with spiderweb=None."""

    def test_predictor_initialization(self):
        """The predictor can be constructed with spiderweb=None."""
        assert PreFlightConflictPredictor(spiderweb=None) is not None

    def test_query_encoding(self):
        """Queries encode to a StateVector whose checked fields stay in range."""
        predictor = PreFlightConflictPredictor(spiderweb=None)

        # Plain query: verify the result type and the range of three components.
        simple_state = predictor.encode_query_to_state("What is force?")
        assert isinstance(simple_state, StateVector)
        lower_bounds = (("psi", 0), ("tau", 0), ("phi", -1))
        for component, lowest in lower_bounds:
            assert lowest <= getattr(simple_state, component) <= 1

        # Ethics-flavoured query: phi is expected to come out positive.
        ethics_query = "Should we use AI ethically in society?"
        state_eth = predictor.encode_query_to_state(ethics_query)
        assert state_eth.phi > 0.0, "Ethical query should have emotional valence"

    def test_empty_prediction_fallback(self):
        """_empty_prediction returns a ConflictPrediction with zero confidence."""
        predictor = PreFlightConflictPredictor(spiderweb=None)
        midpoint = StateVector(psi=0.5, tau=0.5, chi=0.5, phi=0.5, lam=0.5)

        fallback = predictor._empty_prediction(midpoint)

        assert isinstance(fallback, ConflictPrediction)
        assert fallback.preflight_confidence == 0.0


class TestPhase6Integration:
    """Cross-component checks: JSON export, tension explanation, system health."""

    def test_framework_definitions_export(self):
        """A StateVector's dict form survives a JSON round trip."""
        vector = StateVector(psi=0.7, tau=0.3, chi=0.5, phi=0.4, lam=0.6)

        # json.dumps raises if the dict holds anything non-serializable.
        encoded = json.dumps(vector.to_dict())
        round_tripped = json.loads(encoded)

        assert round_tripped["psi"] == round(0.7, 3)

    def test_semantic_tension_explain(self):
        """explain_tension returns the three expected keys."""
        tension_engine = SemanticTensionEngine(llama_model=None)
        explanation = tension_engine.explain_tension("Claim A", "Claim B")

        for required_key in ("semantic_tension", "similarity", "polarity_type"):
            assert required_key in explanation

    def test_specialization_system_health(self):
        """System health counts each adapter that recorded a result."""
        tracker = SpecializationTracker()
        recordings = (
            ("newton", "Force query", 0.9),
            ("empathy", "Emotion query", 0.85),
        )
        for adapter, query, score in recordings:
            tracker.record_adapter_performance(adapter, query, score)

        system_health = tracker.get_system_health()
        for required_key in ("total_adapters", "health_by_adapter"):
            assert required_key in system_health
        # Two distinct adapters were recorded above.
        assert system_health["total_adapters"] == 2


class TestPhase6Benchmarks:
    """Checks for Phase6Benchmarks constructed with no forge engine."""

    @staticmethod
    def _new_benchmarks():
        """Import Phase6Benchmarks lazily and build it with forge_engine=None."""
        # Imported at call time, as in each test, so a missing evaluation
        # package only fails the tests that need it.
        from evaluation.phase6_benchmarks import Phase6Benchmarks

        return Phase6Benchmarks(forge_engine=None)

    def test_benchmark_framework_instantiation(self):
        """A new suite exists and has a 'multi_round_convergence' results slot."""
        suite = self._new_benchmarks()
        assert suite is not None
        assert "multi_round_convergence" in suite.results

    def test_benchmark_summary_generation(self):
        """The summary text carries its banner and at least one section marker."""
        summary = self._new_benchmarks().summary()
        assert "PHASE 6 BENCHMARK SUMMARY" in summary
        section_markers = ("MULTI-ROUND", "MEMORY")
        assert any(marker in summary for marker in section_markers)


# ===================================================================
# Integration Test: All Components Together (MockForgeEngine)
# ===================================================================


class MockForgeEngine:
    """Lightweight ForgeEngine substitute used by the Phase 6 tests.

    Carries a SemanticTensionEngine, a SpecializationTracker and a stub
    conflict engine, plus a canned ``forge_with_debate``.
    """

    def __init__(self):
        self.semantic_tension_engine = SemanticTensionEngine(llama_model=None)
        self.specialization = SpecializationTracker()

        def _classify_conflict(_self, a, b, o):
            # Stub: every pair is reported as a "framework" conflict at 0.5.
            return ("framework", 0.5)

        # Throwaway class whose only member is the stub method above.
        stub_class = type("obj", (object,), {"_classify_conflict": _classify_conflict})
        self.conflict_engine = stub_class()

    def forge_with_debate(self, query, use_memory_weights=False, num_rounds=2):
        """Return a fixed debate result; every argument is ignored.

        Returns:
            A new dict with a "synthesis" string and a "metadata" dict holding
            "coherence" and "resolution_rate".
        """
        metadata = {}
        metadata["coherence"] = 0.75
        metadata["resolution_rate"] = 0.8

        result = {}
        result["synthesis"] = "Mock synthesis"
        result["metadata"] = metadata
        return result


@pytest.mark.integration
class TestPhase6EndToEnd:
    """Run the Phase 6 components one after another on a MockForgeEngine."""

    def test_full_phase6_pipeline(self):
        """Tension, specialization and query encoding each give a sane result."""
        forge = MockForgeEngine()

        # Step 1: semantic tension between two opposed claims stays in [0, 1].
        tension_engine = forge.semantic_tension_engine
        tension = tension_engine.compute_semantic_tension("This is true", "This is false")
        assert 0 <= tension <= 1

        # Step 2: a recorded adapter appears in the global specialization map.
        tracker = forge.specialization
        tracker.record_adapter_performance("test_adapter", "physics query", 0.9)
        global_specs = tracker.get_global_specialization()
        assert "test_adapter" in global_specs

        # Step 3: encoding a query gives a state with non-negative psi.
        encoded = PreFlightConflictPredictor(spiderweb=None).encode_query_to_state(
            "Test query"
        )
        assert encoded.psi >= 0

    def test_phase6_with_benchmarks(self):
        """benchmark_specialization reports 'adapters_tracked' with a mock forge."""
        from evaluation.phase6_benchmarks import Phase6Benchmarks

        suite = Phase6Benchmarks(forge_engine=MockForgeEngine())

        # Per the original author, this benchmark needs no ForgeEngine calls.
        outcome = suite.benchmark_specialization()
        assert "adapters_tracked" in outcome


if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting itself;
    # pass it to sys.exit() so a failing run yields a non-zero process status
    # when this file is executed directly.
    sys.exit(pytest.main([__file__, "-v"]))