File size: 11,337 Bytes

74f2af5

#!/usr/bin/env python3
"""Phase 5 End-to-End Integration Tests

Tests the complete Phase 5 system:
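1. ReinforcementConfig tunable coefficients
2. AdapterRouter keyword routing (memory_weighting hook present, no memory loaded)
3. Gamma stabilization field health monitoring
4. RoutingMetrics observability
5. Integration of router, Gamma field, and RoutingMetrics in one routing cycle
   (CodetteOrchestrator itself is not imported or exercised by this file)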

Run with: python test_phase5_e2e.py
"""

import sys
import os
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from reasoning_forge.memory_weighting import MemoryWeighting, ReinforcementConfig
from reasoning_forge.coherence_field import CoherenceFieldGamma, GammaHealthMetrics, InterventionType
from reasoning_forge.routing_metrics import RoutingMetrics, AdapterSelectionRecord
from inference.adapter_router import AdapterRouter, RouteResult


def test_reinforcement_config():
    """Check ReinforcementConfig defaults, dict import/export, and partial overrides.

    Returns:
        True once every assertion has passed; a failed assertion raises
        AssertionError instead of returning.
    """
    print("\n=== Test 1: Reinforcement Config ===")

    # A config built with no arguments must expose the stock coefficients.
    defaults = ReinforcementConfig()
    assert defaults.boost_successful == 0.08, "Default boost should be 0.08"
    assert defaults.penalize_failed == 0.08, "Default penalize should be 0.08"
    assert defaults.reward_soft_consensus == 0.03, "Default soft_consensus should be 0.03"
    print("[OK] Default coefficients loaded")

    # from_dict(): every supplied key should override its default.
    overrides = dict(
        boost_successful=0.12,
        penalize_failed=0.10,
        reward_soft_consensus=0.05,
    )
    loaded = ReinforcementConfig.from_dict(overrides)
    assert loaded.boost_successful == 0.12, "Custom boost not applied"
    assert loaded.penalize_failed == 0.10, "Custom penalize not applied"
    print("[OK] Custom coefficients loaded from dict")

    # to_dict(): the overridden value must survive the export.
    as_dict = loaded.to_dict()
    assert as_dict["boost_successful"] == 0.12, "Export failed"
    print("[OK] Coefficients exported to dict")

    # A sparse dict overrides only what it names; the rest stay at defaults.
    sparse = ReinforcementConfig.from_dict({"boost_successful": 0.15})
    assert sparse.boost_successful == 0.15, "Partial override failed"
    assert sparse.penalize_failed == 0.08, "Default not used for missing key"
    print("[OK] Partial config with defaults works")

    return True


def test_adapter_router_with_memory():
    """Check AdapterRouter routing and explanation when no memory weighting is attached.

    Returns:
        True once every assertion has passed; a failed assertion raises
        AssertionError instead of returning.
    """
    print("\n=== Test 2: AdapterRouter with Memory ===")

    # Build a router without passing any memory weighting object.
    adapters = ["newton", "davinci", "empathy"]
    router_no_mem = AdapterRouter(available_adapters=adapters)
    assert router_no_mem.memory_weighting is None, "Router should not have memory"
    print("[OK] Router created without memory")

    # A physics-flavoured query is expected to land on the "newton" adapter.
    route = router_no_mem.route("Explain the physics of gravity", strategy="keyword")
    assert route.primary == "newton", "Should select newton for physics query"
    assert route.confidence > 0.0, "Confidence should be set"
    print(f"[OK] Routed to {route.primary} with confidence {route.confidence:.2f}")

    # The explanation must name a primary and report that memory is absent.
    details = router_no_mem.explain_routing(route)
    assert "primary" in details, "Explanation missing primary"
    assert details["memory_aware"] is False, "Should show memory not available"
    print("[OK] Routing explanation works without memory")

    return True


def test_gamma_health_monitoring():
    """Check the Gamma (Γ) stabilization field on a healthy and a collapsed scenario.

    Returns:
        True once every assertion has passed; a failed assertion raises
        AssertionError instead of returning.
    """
    print("\n=== Test 3: Gamma Health Monitoring ===")

    gamma = CoherenceFieldGamma()

    class MockConflict:
        """Minimal conflict stand-in exposing only a ``strength`` attribute."""

        def __init__(self):
            self.strength = 0.25  # Productive zone

    # Healthy scenario: three distinct perspectives, equally weighted.
    responses = {
        "newton": "Physics perspective",
        "davinci": "Creative perspective",
        "empathy": "Emotional perspective",
    }
    conflicts = [MockConflict() for _ in range(2)]
    even_weights = {adapter: 1.0 for adapter in responses}

    health = gamma.compute_health(
        conflicts=conflicts,
        responses=responses,
        adapter_weights=even_weights,
    )

    assert 0.0 <= health.gamma <= 1.0, "Gamma should be in [0, 1]"
    assert len(gamma.health_history) == 1, "Should record health metric"
    print(f"[OK] Healthy state: Gamma = {health.gamma:.3f}")
    assert health.is_stable(), "Should be in stable zone"
    print("[OK] Status correctly identified as stable")

    # Collapse scenario: a single voice, no conflicts, all weight on one adapter.
    health_collapse = gamma.compute_health(
        conflicts=[],  # No progress
        responses={"newton": "Only newton perspective"},
        adapter_weights={"newton": 2.0},  # All weight on one
    )

    print(f"[OK] Collapsed state: Gamma = {health_collapse.gamma:.3f}")
    # Collapse detection is asserted only when gamma falls below 0.4.
    if health_collapse.gamma < 0.4:
        assert health_collapse.is_collapsing(), "Should detect collapse"
        print("[OK] Collapse correctly detected")

    # When an intervention is offered it must be a diversity injection.
    intervention = gamma.get_intervention(health_collapse, ["davinci", "empathy"])
    if intervention:
        assert (
            intervention.intervention_type == InterventionType.DIVERSITY_INJECTION
        ), "Should inject diversity on collapse"
        print(f"[OK] Intervention recommended: {intervention.intervention_type.value}")

    return True


def test_routing_metrics():
    """Check RoutingMetrics counters, summary, per-adapter stats, and recent routes.

    Returns:
        True once every assertion has passed; a failed assertion raises
        AssertionError instead of returning.
    """
    print("\n=== Test 4: Routing Metrics ===")

    metrics = RoutingMetrics()
    assert metrics.total_queries == 0, "Should start at 0"
    print("[OK] RoutingMetrics initialized")

    # First route: a memory-boosted selection of the "quantum" adapter.
    quantum_record = RoutingMetrics.create_record(
        query="What is quantum mechanics?",
        primary_adapter="quantum",
        secondary_adapters=["physics"],
        strategy="keyword",
        confidence_before_boost=0.75,
        confidence_after_boost=0.85,
        memory_boost_applied=True,
    )
    metrics.record_route(quantum_record)

    assert metrics.total_queries == 1, "Should count query"
    assert metrics.adapter_selection_counts["quantum"] == 1, "Should count selection"
    assert metrics.memory_boost_count == 1, "Should count boost"
    print("[OK] Route recorded and metrics updated")

    # Four more routes, all un-boosted "newton" selections.
    for _ in range(4):
        metrics.record_route(
            RoutingMetrics.create_record(
                query="Another query",
                primary_adapter="newton",
                secondary_adapters=[],
                strategy="keyword",
                confidence_before_boost=0.6,
                confidence_after_boost=0.6,
                memory_boost_applied=False,
            )
        )

    assert metrics.total_queries == 5, "Should have 5 queries"
    assert metrics.adapter_selection_counts["newton"] == 4, "Newton selected 4 times"
    print("[OK] Recorded 5 queries total")

    # The aggregate summary must cover both adapters seen so far.
    summary = metrics.get_summary()
    assert summary["total_queries"] == 5, "Summary should show total queries"
    per_adapter = summary["adapter_stats"]
    assert "quantum" in per_adapter, "Should have quantum stats"
    assert "newton" in per_adapter, "Should have newton stats"
    print(f"[OK] Summary generated with {len(summary['adapter_stats'])} adapters")

    # Per-adapter view for "newton": four selections, none boosted.
    stats_for_newton = metrics.get_adapter_stats("newton")
    assert stats_for_newton["total_selections"] == 4, "Newton should have 4 selections"
    assert stats_for_newton["memory_boost_hits"] == 0, "Newton had no boosts"
    print(f"[OK] Adapter stats: {stats_for_newton['total_selections']} selections")

    # The three latest routes; the first entry is expected to be a newton route.
    latest = metrics.get_recent_routes(limit=3)
    assert len(latest) == 3, "Should return 3 recent routes"
    assert latest[0]["primary"] == "newton", "Most recent should be newton"
    print("[OK] Recent routes retrieved")

    return True


def test_phase5_integration():
    """Run one routing cycle through the router, Gamma field, and metrics tracker together.

    Returns:
        True once every assertion has passed; a failed assertion raises
        AssertionError instead of returning.
    """
    print("\n=== Test 5: Phase 5 Complete Integration ===")

    # Router is built with the memory hook explicitly empty
    # (a real deployment would load memory from disk).
    router = AdapterRouter(
        available_adapters=["newton", "davinci", "empathy", "philosophy"],
        memory_weighting=None,  # Phase 5 but no memory loaded
    )
    print("[OK] Router created with Phase 5 integration ready")

    gamma = CoherenceFieldGamma()
    print("[OK] Gamma stabilization field initialized")

    routing_metrics = RoutingMetrics()
    print("[OK] Routing metrics tracker initialized")

    # Route one query and log the decision with the metrics tracker.
    query = "How should society balance freedom and security?"
    route = router.route(query, strategy="keyword", max_adapters=2)
    routing_metrics.record_route(
        RoutingMetrics.create_record(
            query=query,
            primary_adapter=route.primary,
            secondary_adapters=route.secondary,
            strategy=route.strategy,
            confidence_before_boost=0.7,
            confidence_after_boost=0.7,
            memory_boost_applied=False,
        )
    )

    class MockConflict:
        """Minimal conflict stand-in: two agent names plus a fixed strength."""

        def __init__(self, agent_a, agent_b):
            self.agent_a = agent_a
            self.agent_b = agent_b
            self.strength = 0.15

    # Simulated debate: one conflict between two of the three responders.
    conflicts = [MockConflict("newton", "philosophy")]
    responses = {
        "newton": "Mathematical security metrics",
        "philosophy": "Ethical freedom considerations",
        "davinci": "Innovative balance approaches",
    }

    health = gamma.compute_health(conflicts, responses)
    # Label the state from the is_* predicates; "stable" is the fallback.
    status = (
        "collapsing"
        if health.is_collapsing()
        else "groupthinking"
        if health.is_groupthinking()
        else "stable"
    )
    print(f"[OK] Health computed: Gamma = {health.gamma:.3f} ({status})")

    # Pull the aggregate views from both observability components.
    summary = routing_metrics.get_summary()
    gamma_data = gamma.export_metrics()

    assert summary["total_queries"] == 1, "Should have recorded 1 query"
    assert "health_history" in gamma_data, "Should export health history"
    print("[OK] All Phase 5 components working together")

    return True


def main():
    """Run every Phase 5 test in order and print a pass/fail tally.

    Each test counts as passed when it returns a truthy value and as failed
    when it returns a falsy value or raises; exceptions are reported with a
    traceback and do not stop the remaining tests.

    Returns:
        0 when no test failed, 1 otherwise (suitable as a process exit code).
    """
    rule = "=" * 70
    print(rule)
    print("PHASE 5 END-TO-END INTEGRATION TESTS")
    print(rule)

    suite = [
        ("Reinforcement Config", test_reinforcement_config),
        ("AdapterRouter Memory", test_adapter_router_with_memory),
        ("Gamma Health Monitoring", test_gamma_health_monitoring),
        ("Routing Metrics", test_routing_metrics),
        ("Phase 5 Integration", test_phase5_integration),
    ]

    passed = 0
    failed = 0

    for label, runner in suite:
        try:
            outcome = runner()
            if not outcome:
                failed += 1
                print(f"\n[FAIL] {label} FAILED")
            else:
                passed += 1
                print(f"\n[PASS] {label} PASSED")
        except Exception as exc:
            # Top-level boundary: report the error and move on to the next test.
            failed += 1
            print(f"\n[FAIL] {label} ERROR: {exc}")
            import traceback
            traceback.print_exc()

    print("\n" + rule)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print(rule)

    return 1 if failed else 0


# Script entry point: propagate main()'s result as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())