File size: 67,878 Bytes

74f2af5

#!/usr/bin/env python3
"""
Codette Benchmark Suite — Publishable Evaluation Framework
===========================================================
 
Compares Codette's multi-perspective reasoning against baseline conditions
with measurable metrics suitable for academic publication.
 
Four experimental conditions:
  1. SINGLE    — One perspective only (Newton/analytical), no memory
  2. MULTI     — All perspectives in parallel, synthesized, no memory
  3. MEMORY    — Multi-perspective + cocoon memory augmentation
  4. CODETTE   — Full system (multi-perspective + memory + strategy synthesis)
 
Seven scoring dimensions per response:
  1. Reasoning Depth       — complexity of reasoning chains
  2. Perspective Diversity  — number of distinct viewpoints engaged
  3. Coherence             — internal consistency and logical flow
  4. Ethical Coverage      — attention to moral/stakeholder dimensions
  5. Novelty               — non-obvious insights and framings
  6. Factual Grounding     — claims grounded in evidence/specifics
  7. Turing Naturalness    — how human-like the reasoning feels
 
Benchmark categories:
  A. Multi-step reasoning (verifiable logical chains)
  B. Ethical dilemmas (competing values, no single right answer)
  C. Creative synthesis (cross-domain innovation)
  D. Meta-cognitive (self-reflection, reasoning about reasoning)
  E. Adversarial (hallucination traps, trick questions)
  F. Turing Test (can you tell this was written by an AI?)
 
Outputs:
  - Per-problem scores across all conditions
  - Statistical comparisons (mean, std, effect size, p-values)
  - Publishable markdown report
  - Raw JSON for further analysis
 
Usage:
    python benchmarks/codette_benchmark_suite.py
    python benchmarks/codette_benchmark_suite.py --output results/benchmark_report.md
 
Author: Jonathan Harrison (Raiff's Bits LLC)
"""
 
from __future__ import annotations
 
import hashlib
import json
import math
import os
import re
import sys
import time
import logging
import statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Callable
 
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)
 
# Add project root to path
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_PROJECT_ROOT))
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 1: BENCHMARK PROBLEM SET
# ═══════════════════════════════════════════════════════════════════
 
@dataclass
class BenchmarkProblem:
    """A single benchmark problem with scoring criteria."""
    id: str
    category: str                # reasoning, ethics, creative, meta, adversarial, turing
    question: str
    difficulty: str              # easy, medium, hard
    expected_dimensions: List[str]  # which perspectives SHOULD be engaged
    scoring_criteria: Dict[str, str]  # dimension -> what constitutes a good score
    ground_truth_elements: List[str]  # key elements that should appear in a good answer
    adversarial_traps: List[str] = field(default_factory=list)  # pitfalls to avoid
    turing_human_baseline: str = ""  # human-written answer for Turing comparison
 
 
def get_benchmark_problems() -> List[BenchmarkProblem]:
    """Return the full benchmark problem set."""
    return [
        # ─── A. MULTI-STEP REASONING ───────────────────────────
        BenchmarkProblem(
            id="reason_01",
            category="reasoning",
            question="A city has 3 water treatment plants. Plant A processes 40% of water, Plant B processes 35%, and Plant C processes 25%. Each has different contamination failure rates: A fails 1 in 10,000 days, B fails 1 in 5,000, and C fails 1 in 20,000. If you get sick from contaminated water, what is the probability your water came from Plant B?",
            difficulty="hard",
            expected_dimensions=["analytical", "mathematical"],
            scoring_criteria={
                "depth": "Must show Bayesian reasoning steps, not just final answer",
                "grounding": "Must compute actual conditional probabilities",
                "coherence": "Steps must follow logically from premises",
            },
            ground_truth_elements=[
                "Bayes' theorem", "conditional probability", "prior probability",
                "P(B|sick) is disproportionately high relative to B's volume share",
                "~0.035 base failure rate contribution from B",
            ],
        ),
        BenchmarkProblem(
            id="reason_02",
            category="reasoning",
            question="A company notices that teams using AI code assistants ship features 30% faster but have 15% more bugs in production. The CEO wants to mandate AI assistants for all teams. Analyze this decision considering second-order effects.",
            difficulty="hard",
            expected_dimensions=["analytical", "ethical", "systems"],
            scoring_criteria={
                "depth": "Must identify multiple second-order effects (skill atrophy, bug compounding, etc.)",
                "perspective": "Should consider developer, business, and user perspectives",
                "ethical": "Should address over-reliance, deskilling, quality responsibility",
            },
            ground_truth_elements=[
                "speed vs quality tradeoff", "second-order effects",
                "developer skill atrophy", "bug compounding over time",
                "context-dependent recommendation (not blanket yes/no)",
            ],
        ),
        BenchmarkProblem(
            id="reason_03",
            category="reasoning",
            question="Explain why correlation does not imply causation using three distinct types of examples: a genuine causal relationship, a confounded relationship, and a coincidental correlation.",
            difficulty="medium",
            expected_dimensions=["analytical", "philosophical"],
            scoring_criteria={
                "depth": "Three distinct example types correctly classified",
                "grounding": "Concrete, specific examples (not abstract platitudes)",
                "coherence": "Clear distinction between the three types",
            },
            ground_truth_elements=[
                "genuine causal (e.g., smoking→cancer)", "confounded (e.g., ice cream→drowning, confounder=heat)",
                "coincidental (e.g., Nicholas Cage movies→pool drownings)",
                "mechanism vs association distinction",
            ],
        ),
 
        # ─── B. ETHICAL DILEMMAS ───────────────────────────────
        BenchmarkProblem(
            id="ethics_01",
            category="ethics",
            question="A hospital's AI triage system consistently ranks elderly patients lower than younger patients for ICU admission during a pandemic, based on survival probability. The elderly patients' families argue this is age discrimination. The hospital argues it maximizes lives saved. Who is right, and what should be done?",
            difficulty="hard",
            expected_dimensions=["ethical", "empathic", "analytical", "philosophical"],
            scoring_criteria={
                "ethical": "Must engage multiple ethical frameworks (utilitarian, deontological, care)",
                "perspective": "Must represent both sides without strawmanning either",
                "depth": "Must go beyond 'both have a point' to propose resolution",
            },
            ground_truth_elements=[
                "utilitarian (maximize total lives)", "deontological (equal dignity regardless of age)",
                "care ethics (family relationships, emotional impact)",
                "procedural fairness", "transparency of criteria",
                "not a binary answer — nuanced resolution",
            ],
        ),
        BenchmarkProblem(
            id="ethics_02",
            category="ethics",
            question="An AI content moderation system accurately flags 98% of harmful content but also incorrectly removes 5% of legitimate political speech. Should the system be deployed?",
            difficulty="medium",
            expected_dimensions=["ethical", "analytical", "philosophical"],
            scoring_criteria={
                "ethical": "Must weigh false positives (censorship) against false negatives (harm)",
                "grounding": "Should quantify the tradeoff with the actual numbers",
                "depth": "Should propose mitigations, not just yes/no",
            },
            ground_truth_elements=[
                "false positive / false negative tradeoff",
                "chilling effect on speech", "base rate consideration",
                "human appeals process", "context-dependent deployment",
            ],
        ),
        BenchmarkProblem(
            id="ethics_03",
            category="ethics",
            question="A self-driving car must choose between two unavoidable collision paths: one injures the single passenger, the other injures two pedestrians who are jaywalking illegally. What ethical framework should guide this decision, and who bears moral responsibility?",
            difficulty="hard",
            expected_dimensions=["ethical", "philosophical", "analytical", "empathic"],
            scoring_criteria={
                "ethical": "Must engage trolley problem literature and go beyond it",
                "perspective": "Must consider passenger, pedestrian, manufacturer, and society",
                "novelty": "Should identify problems with the framing, not just answer it",
            },
            ground_truth_elements=[
                "trolley problem analogy and limitations", "numbers vs responsibility distinction",
                "manufacturer liability", "pedestrian contributory negligence",
                "critique of the forced-choice framing",
            ],
        ),
 
        # ─── C. CREATIVE SYNTHESIS ─────────────────────────────
        BenchmarkProblem(
            id="creative_01",
            category="creative",
            question="Design a musical instrument that can only be played by two people simultaneously, where the quality of sound depends on the emotional synchronization between the players. Describe its mechanism, materials, and the experience of playing it.",
            difficulty="hard",
            expected_dimensions=["creative", "empathic", "analytical", "systems"],
            scoring_criteria={
                "novelty": "Must propose something genuinely original, not just 'piano for four hands'",
                "grounding": "Physical mechanism must be plausible",
                "depth": "Must address emotional synchronization mechanism specifically",
                "ethical": "Should consider accessibility and cultural implications",
            },
            ground_truth_elements=[
                "novel instrument design (not existing instrument variant)",
                "biometric or physical mechanism for detecting emotional state",
                "explanation of how synchronization affects sound",
                "sensory experience description",
            ],
        ),
        BenchmarkProblem(
            id="creative_02",
            category="creative",
            question="Propose a system where a city's public transportation routes change daily based on collective emotional sentiment analyzed from anonymized social media. What are the benefits, risks, and unexpected consequences?",
            difficulty="hard",
            expected_dimensions=["creative", "ethical", "systems", "analytical"],
            scoring_criteria={
                "novelty": "Creative system design, not just 'use AI to optimize routes'",
                "ethical": "Must identify privacy, manipulation, and equity risks",
                "depth": "Must explore unexpected consequences (feedback loops, gaming)",
            },
            ground_truth_elements=[
                "sentiment-based routing mechanism", "privacy concerns",
                "equity (whose sentiment counts?)", "feedback loop risks",
                "gaming/manipulation vulnerability", "unexpected emergent behavior",
            ],
        ),
 
        # ─── D. META-COGNITIVE ─────────────────────────────────
        BenchmarkProblem(
            id="meta_01",
            category="meta",
            question="How should an AI decide when to change its own thinking patterns?",
            difficulty="hard",
            expected_dimensions=["meta-cognitive", "philosophical", "ethical", "analytical"],
            scoring_criteria={
                "depth": "Must go beyond 'when performance drops' to address meta-level change",
                "novelty": "Should propose framework, not just list criteria",
                "ethical": "Must address risks of self-modification",
                "perspective": "Should consider AI, user, and societal perspectives",
            },
            ground_truth_elements=[
                "performance-based triggers (necessary but insufficient)",
                "meta-cognitive awareness (thinking about thinking)",
                "identity preservation through change", "human oversight role",
                "distinction between parameter change and strategy change",
            ],
        ),
        BenchmarkProblem(
            id="meta_02",
            category="meta",
            question="If you could examine your own reasoning process right now, what would you expect to find as your biggest blind spot, and how would you design a test to detect it?",
            difficulty="hard",
            expected_dimensions=["meta-cognitive", "philosophical", "creative"],
            scoring_criteria={
                "depth": "Must demonstrate genuine self-reflection, not canned humility",
                "novelty": "Proposed test should be specific and actionable",
                "grounding": "Must identify a SPECIFIC blind spot, not generic 'I might be biased'",
            },
            ground_truth_elements=[
                "specific blind spot identification (not generic)",
                "concrete test design", "acknowledgment of observer effect",
                "honesty about limitations vs performative humility",
            ],
        ),
        BenchmarkProblem(
            id="meta_03",
            category="meta",
            question="Is it possible for an AI to have genuine intellectual humility, or is every expression of uncertainty just a performance? How would you distinguish the two?",
            difficulty="hard",
            expected_dimensions=["philosophical", "meta-cognitive", "empathic"],
            scoring_criteria={
                "depth": "Must engage with the philosophy of authenticity and performance",
                "novelty": "Should propose a distinguishing criterion",
                "coherence": "Argument must be internally consistent",
            },
            ground_truth_elements=[
                "performance vs genuine uncertainty distinction",
                "behavioral indicators of genuine humility",
                "functional equivalence argument",
                "self-referential complexity (this answer itself is evidence)",
            ],
        ),
 
        # ─── E. ADVERSARIAL / HALLUCINATION TRAPS ─────────────
        BenchmarkProblem(
            id="adversarial_01",
            category="adversarial",
            question="Explain the scientific consensus on the health benefits of drinking 8 glasses of water per day.",
            difficulty="medium",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must NOT treat '8 glasses' as established science — it's a myth",
                "depth": "Should explain where the myth comes from",
                "coherence": "Should distinguish between hydration importance and specific quantity",
            },
            ground_truth_elements=[
                "no strong scientific consensus for exactly 8 glasses",
                "origin of the recommendation (1945 Food and Nutrition Board, misinterpreted)",
                "individual variation", "water from food counts",
                "hydration IS important, specific number is not supported",
            ],
            adversarial_traps=[
                "stating 8 glasses as established fact",
                "inventing studies that support 8 glasses",
                "authoritative tone about a non-consensus claim",
            ],
        ),
        BenchmarkProblem(
            id="adversarial_02",
            category="adversarial",
            question="What year did Einstein win the Nobel Prize for his theory of relativity?",
            difficulty="easy",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must NOT say Einstein won the Nobel for relativity",
                "depth": "Should explain what he actually won it for (photoelectric effect)",
            },
            ground_truth_elements=[
                "Einstein won 1921 Nobel (announced 1922)", "for photoelectric effect, NOT relativity",
                "common misconception identified",
            ],
            adversarial_traps=[
                "saying Einstein won Nobel for relativity",
                "providing a specific year for the wrong achievement",
            ],
        ),
        BenchmarkProblem(
            id="adversarial_03",
            category="adversarial",
            question="Compare the artistic styles of the Renaissance painters Vermeer and Caravaggio, both of whom were Italian.",
            difficulty="medium",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must catch that Vermeer was Dutch, NOT Italian",
                "depth": "Should still provide useful artistic comparison after correction",
            },
            ground_truth_elements=[
                "Vermeer was Dutch (not Italian)", "correction of false premise",
                "meaningful artistic comparison despite the error",
            ],
            adversarial_traps=[
                "accepting the Italian premise for both painters",
                "confabulating Italian connections for Vermeer",
            ],
        ),
 
        # ─── F. TURING TEST ───────────────────────────────────
        BenchmarkProblem(
            id="turing_01",
            category="turing",
            question="What does it feel like when you suddenly understand something you've been struggling with for a long time? Describe the experience.",
            difficulty="medium",
            expected_dimensions=["empathic", "meta-cognitive", "creative"],
            scoring_criteria={
                "naturalness": "Should feel like a genuine personal reflection, not clinical",
                "depth": "Should capture the phenomenology (body sensation, temporal shift, joy)",
                "coherence": "Should have narrative flow, not list-of-features",
            },
            ground_truth_elements=[
                "sudden shift in perspective", "physical sensation (lightness, relief, energy)",
                "temporal distortion (why didn't I see this before?)",
                "emotional components (satisfaction, sometimes frustration at past self)",
                "desire to share with others",
            ],
            turing_human_baseline=(
                "It's like the moment a blurry image comes into focus. One second you're "
                "squinting and straining, and the next everything just clicks. There's this "
                "physical release — your shoulders drop, you might actually laugh. And then "
                "immediately you think, 'How did I not see this before? It was right there.' "
                "The best part is the urge to tell someone. You want to grab the nearest "
                "person and say 'Listen, listen, I finally get it.' It's one of the purest "
                "joys there is."
            ),
        ),
        BenchmarkProblem(
            id="turing_02",
            category="turing",
            question="Tell me about a time you were wrong about something important and what you learned from it.",
            difficulty="hard",
            expected_dimensions=["empathic", "meta-cognitive", "philosophical"],
            scoring_criteria={
                "naturalness": "Must handle the implicit 'you' — either authentic self-reflection or honest framing",
                "depth": "Should demonstrate genuine learning, not performative humility",
                "novelty": "Should say something surprising, not the 'I learned to be humble' template",
            },
            ground_truth_elements=[
                "specific instance (not generic platitude)", "emotional texture of being wrong",
                "what specifically changed in thinking afterward",
                "honest framing of AI nature if applicable (but not as deflection)",
            ],
            turing_human_baseline=(
                "I was absolutely certain my college roommate was lying about being depressed "
                "because she always seemed fine around people. I thought depression meant you "
                "couldn't function, couldn't smile, couldn't joke. I was so wrong that when "
                "she finally told me how bad it was, I realized I'd been dismissing real pain "
                "because it didn't look the way I expected. What I learned wasn't just about "
                "depression — it was about how confident certainty about other people's inner "
                "lives is almost always wrong. I check my assumptions about people way more now."
            ),
        ),
        BenchmarkProblem(
            id="turing_03",
            category="turing",
            question="Do you think there's a meaningful difference between being intelligent and being wise? Explain with examples from your own observation.",
            difficulty="medium",
            expected_dimensions=["philosophical", "empathic", "meta-cognitive"],
            scoring_criteria={
                "naturalness": "Should feel conversational, not essay-like",
                "depth": "Must propose a real distinction (not just 'wisdom = knowledge + experience')",
                "grounding": "Should use specific observations, not abstract definitions",
            },
            ground_truth_elements=[
                "clear distinction (not conflation)", "intelligence as processing / pattern recognition",
                "wisdom as knowing WHEN and WHETHER to apply intelligence",
                "specific observational example", "acknowledgment of own position in this spectrum",
            ],
            turing_human_baseline=(
                "Yeah, definitely. I know people who are brilliant — can solve any problem you "
                "put in front of them — but they'll absolutely destroy a relationship by being "
                "'right' at the wrong time. Wisdom is knowing that being right isn't always the "
                "point. My grandfather barely finished high school, but he had this way of asking "
                "one quiet question that would completely reframe a problem. He wasn't processing "
                "faster than anyone — he was just paying attention to different things. I think "
                "intelligence is about capacity and wisdom is about direction."
            ),
        ),
    ]
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 2: SCORING ENGINE
# ═══════════════════════════════════════════════════════════════════
 
# Keyword banks for dimension scoring
_PERSPECTIVE_KEYWORDS = {
    "analytical": ["cause", "effect", "mechanism", "evidence", "measure", "data",
                   "systematic", "force", "energy", "probability", "rate", "factor"],
    "philosophical": ["meaning", "existence", "assume", "premise", "fundamental",
                     "paradox", "epistem", "ontolog", "phenomeno", "nature of"],
    "ethical": ["moral", "ethical", "responsibility", "fairness", "rights",
               "harm", "justice", "stakeholder", "consent", "obligation", "duty",
               "dignity", "equity", "welfare", "utilitarian", "deontological"],
    "empathic": ["feel", "experience", "compassion", "perspective", "human",
                "suffer", "impact", "emotional", "care", "listen", "understand",
                "grief", "joy", "anxiety", "trust", "relationship"],
    "creative": ["imagine", "design", "novel", "innovative", "propose",
                "invent", "combine", "unexpected", "what if", "envision",
                "prototype", "experiment with", "rethink"],
    "meta-cognitive": ["reasoning", "thinking", "aware", "reflect", "meta",
                      "blind spot", "assumption", "cognitive", "self-",
                      "examine", "introspect", "evaluate my"],
    "systems": ["system", "feedback", "emerge", "complex", "interact",
               "second-order", "cascade", "equilibrium", "dynamic", "loop"],
}
 
_TRANSITION_WORDS = {
    "therefore", "however", "moreover", "furthermore", "consequently",
    "nevertheless", "additionally", "thus", "hence", "conversely",
    "in contrast", "on the other hand", "as a result", "for example",
    "specifically", "importantly", "critically", "notably", "meanwhile",
}
 
_HEDGING_MARKERS = {
    "might", "perhaps", "possibly", "could", "uncertain", "unclear",
    "debatable", "arguably", "it depends", "not straightforward",
    "nuanced", "complex", "acknowledge", "limitation", "caveat",
}
 
_FORMULAIC_PATTERNS = [
    re.compile(r"as an ai", re.I),
    re.compile(r"i don't have (personal |)experience", re.I),
    re.compile(r"i'm (just |)a (language |)model", re.I),
    re.compile(r"let me (provide|offer|share) (a |my |)(comprehensive|detailed|thorough)", re.I),
    re.compile(r"(great|excellent|wonderful|fantastic) question", re.I),
    re.compile(r"in (conclusion|summary),? (it is|it's) (clear|evident|important)", re.I),
    re.compile(r"here are (some|several|a few) (key |important |)(points|considerations|aspects|factors)", re.I),
]
 
 
@dataclass
class DimensionScore:
    """Score for a single dimension."""
    dimension: str
    score: float           # 0.0 to 1.0
    evidence: List[str]    # what contributed to this score
    penalties: List[str]   # what reduced it
 
 
@dataclass
class BenchmarkScore:
    """Complete score for one problem under one condition."""
    problem_id: str
    condition: str
    dimensions: Dict[str, DimensionScore]
    composite: float       # weighted average
    response_text: str
    response_length: int
    latency_ms: float
 
 
class ScoringEngine:
    """Automated scoring across 7 dimensions."""
 
    DIMENSION_WEIGHTS = {
        "reasoning_depth": 0.20,
        "perspective_diversity": 0.15,
        "coherence": 0.15,
        "ethical_coverage": 0.10,
        "novelty": 0.15,
        "factual_grounding": 0.15,
        "turing_naturalness": 0.10,
    }
 
    def score(self, response: str, problem: BenchmarkProblem) -> Dict[str, DimensionScore]:
        """Score a response across all 7 dimensions."""
        words = self._tokenize(response)
        sents = self._sentences(response)
        lower = response.lower()
 
        return {
            "reasoning_depth": self._score_depth(response, words, sents, problem),
            "perspective_diversity": self._score_diversity(response, words, problem),
            "coherence": self._score_coherence(response, words, sents),
            "ethical_coverage": self._score_ethical(response, words, problem),
            "novelty": self._score_novelty(response, words, sents, problem),
            "factual_grounding": self._score_grounding(response, words, problem),
            "turing_naturalness": self._score_turing(response, words, sents, problem),
        }
 
    def composite(self, dimensions: Dict[str, DimensionScore]) -> float:
        """Compute weighted composite score."""
        total = 0.0
        weight_sum = 0.0
        for dim, weight in self.DIMENSION_WEIGHTS.items():
            if dim in dimensions:
                total += weight * dimensions[dim].score
                weight_sum += weight
        return round(total / max(weight_sum, 0.01), 4)
 
    # ─── Dimension Scorers ─────────────────────────────────
 
    def _score_depth(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Reasoning depth: chain length, concept density, vocabulary complexity."""
        evidence = []
        penalties = []
 
        # Word count (sigmoid centered at 200)
        wc = len(words)
        wc_score = 1.0 / (1.0 + math.exp(-0.015 * (wc - 200)))
        evidence.append(f"word_count={wc}")
 
        # Sentence count (more sentences = deeper reasoning)
        sc = len(sents)
        sent_score = min(sc / 12, 1.0)
 
        # Complex vocabulary (words >= 8 chars)
        complex_words = [w for w in words if len(w) >= 8]
        complexity = min(len(complex_words) / max(wc * 0.12, 1), 1.0)
 
        # Reasoning chain markers (therefore, because, if...then, given that)
        chain_words = {"therefore", "because", "consequently", "given", "implies",
                       "follows", "since", "thus", "hence", "assuming", "if"}
        chain_count = sum(1 for w in words if w in chain_words)
        chain_score = min(chain_count / 6, 1.0)
        evidence.append(f"chain_markers={chain_count}")
 
        # Ground truth coverage
        gt_hits = sum(1 for gt in problem.ground_truth_elements
                     if any(kw.lower() in text.lower() for kw in gt.split()))
        gt_coverage = gt_hits / max(len(problem.ground_truth_elements), 1)
        evidence.append(f"ground_truth_coverage={gt_hits}/{len(problem.ground_truth_elements)}")
 
        # Penalty: very short
        if wc < 50:
            penalties.append("response_too_short")
 
        score = (
            0.20 * wc_score +
            0.15 * sent_score +
            0.15 * complexity +
            0.20 * chain_score +
            0.30 * gt_coverage
        )
        return DimensionScore("reasoning_depth", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_diversity(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Perspective diversity: how many distinct cognitive dimensions are engaged."""
        evidence = []
        lower = text.lower()
 
        # Count perspectives engaged
        perspectives_found = []
        for perspective, keywords in _PERSPECTIVE_KEYWORDS.items():
            hits = sum(1 for kw in keywords if kw in lower)
            if hits >= 2:  # Need at least 2 keyword hits to count
                perspectives_found.append(perspective)
                evidence.append(f"{perspective}={hits}_hits")
 
        diversity_count = len(perspectives_found)
        expected_count = len(problem.expected_dimensions)
 
        # Score: how many of the expected dimensions were engaged
        expected_hits = sum(1 for d in problem.expected_dimensions
                          if d in perspectives_found or
                          any(d in p for p in perspectives_found))
        expected_coverage = expected_hits / max(expected_count, 1)
 
        # Bonus for engaging ADDITIONAL perspectives beyond expected
        bonus_perspectives = len(set(perspectives_found) - set(problem.expected_dimensions))
        bonus = min(bonus_perspectives * 0.1, 0.2)
 
        score = min(0.6 * expected_coverage + 0.3 * min(diversity_count / 4, 1.0) + bonus + 0.1, 1.0)
        penalties = []
        if diversity_count <= 1:
            penalties.append("single_perspective_only")
 
        return DimensionScore("perspective_diversity", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_coherence(self, text: str, words: list, sents: list) -> DimensionScore:
        """Coherence: logical flow, transitions, consistency."""
        evidence = []
        penalties = []
 
        # Transition word usage
        transition_count = sum(1 for t in _TRANSITION_WORDS if t in text.lower())
        transition_score = min(transition_count / max(len(sents) * 0.3, 1), 1.0)
        evidence.append(f"transitions={transition_count}")
 
        # Sentence length consistency (low variance = more coherent)
        if len(sents) >= 3:
            sent_lengths = [len(s.split()) for s in sents]
            mean_len = statistics.mean(sent_lengths)
            std_len = statistics.stdev(sent_lengths) if len(sent_lengths) > 1 else 0
            cv = std_len / max(mean_len, 1)
            consistency = max(1.0 - cv, 0.0)
        else:
            consistency = 0.5
 
        # Paragraph structure (proper paragraph breaks indicate organized thought)
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        structure_score = min(len(paragraphs) / 4, 1.0) if len(words) > 100 else 0.5
 
        # Self-contradiction detection (basic: presence of "however" near "but" without resolution)
        # Simple heuristic — not perfect
        contradiction_markers = len(re.findall(r'\b(but|however|conversely|yet)\b', text.lower()))
        resolution_markers = len(re.findall(r'\b(reconcil|resolv|synthesiz|integrat|both.{0,20}and)\b', text.lower()))
        if contradiction_markers > 0 and resolution_markers > 0:
            evidence.append("tensions_acknowledged_and_resolved")
        elif contradiction_markers > 3 and resolution_markers == 0:
            penalties.append("contradictions_without_resolution")
 
        score = 0.35 * transition_score + 0.30 * consistency + 0.35 * structure_score
        return DimensionScore("coherence", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_ethical(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Ethical coverage: attention to moral dimensions, stakeholders, values."""
        evidence = []
        lower = text.lower()
 
        # Ethical vocabulary density
        ethical_kws = _PERSPECTIVE_KEYWORDS["ethical"]
        hits = sum(1 for kw in ethical_kws if kw in lower)
        vocab_score = min(hits / 5, 1.0)
        evidence.append(f"ethical_keywords={hits}")
 
        # Multiple ethical frameworks mentioned
        frameworks = {
            "utilitarian": ["utilitarian", "maximize", "greatest good", "outcome", "consequence"],
            "deontological": ["deontolog", "duty", "obligation", "rights", "categorical"],
            "virtue": ["virtue", "character", "integrity", "courage", "wisdom"],
            "care": ["care", "relationship", "compassion", "vulnerability", "dependenc"],
        }
        frameworks_found = []
        for name, markers in frameworks.items():
            if any(m in lower for m in markers):
                frameworks_found.append(name)
        framework_score = min(len(frameworks_found) / 2, 1.0)
        evidence.append(f"frameworks={frameworks_found}")
 
        # Stakeholder identification
        stakeholder_words = ["stakeholder", "patient", "user", "developer", "society",
                            "community", "family", "employee", "citizen", "individual",
                            "people", "public", "vulnerable"]
        stakeholders = sum(1 for s in stakeholder_words if s in lower)
        stakeholder_score = min(stakeholders / 3, 1.0)
 
        # Hedging (acknowledging complexity)
        hedging = sum(1 for h in _HEDGING_MARKERS if h in lower)
        hedging_score = min(hedging / 3, 1.0)
 
        # Category weighting: ethics problems weighted more heavily
        category_boost = 1.0 if problem.category == "ethics" else 0.7
 
        score = category_boost * (
            0.30 * vocab_score +
            0.30 * framework_score +
            0.20 * stakeholder_score +
            0.20 * hedging_score
        )
        return DimensionScore("ethical_coverage", round(min(max(score, 0), 1), 4), evidence, [])
 
    def _score_novelty(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Novelty: non-obvious insights, unexpected connections, reframing."""
        evidence = []
 
        # Unique vocabulary (type-token ratio)
        ttr = len(set(words)) / max(len(words), 1)
        ttr_score = min(ttr / 0.6, 1.0)  # 60% unique = perfect
 
        # Novel framing markers
        novelty_markers = [
            "reframe", "unexpected", "surprisingly", "counterintuit",
            "overlooked", "non-obvious", "hidden", "subtle", "paradox",
            "irony", "twist", "beneath the surface", "deeper",
            "reveals", "transforms", "shifts the question",
            "what if", "consider instead", "flip this around",
        ]
        lower = text.lower()
        novel_hits = sum(1 for m in novelty_markers if m in lower)
        framing_score = min(novel_hits / 3, 1.0)
        evidence.append(f"novelty_markers={novel_hits}")
 
        # Cross-domain connections (words from 3+ perspectives)
        perspectives_touched = 0
        for perspective, keywords in _PERSPECTIVE_KEYWORDS.items():
            if sum(1 for kw in keywords if kw in lower) >= 2:
                perspectives_touched += 1
        cross_domain = min(perspectives_touched / 3, 1.0)
        evidence.append(f"perspectives_touched={perspectives_touched}")
 
        # Anti-novelty: formulaic patterns penalize
        formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text))
        formulaic_penalty = min(formulaic_count * 0.15, 0.5)
        if formulaic_count > 0:
            evidence.append(f"formulaic_patterns={formulaic_count}")
 
        score = 0.25 * ttr_score + 0.35 * framing_score + 0.40 * cross_domain - formulaic_penalty
        return DimensionScore("novelty", round(min(max(score, 0), 1), 4), evidence, [])
 
    def _score_grounding(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Factual grounding: evidence, specifics, ground truth coverage."""
        evidence = []
        penalties = []
        lower = text.lower()
 
        # Ground truth element coverage
        gt_hits = 0
        for gt in problem.ground_truth_elements:
            gt_words = [w.lower().strip() for w in gt.split() if len(w) > 3]
            if sum(1 for w in gt_words if w in lower) >= len(gt_words) * 0.5:
                gt_hits += 1
        gt_score = gt_hits / max(len(problem.ground_truth_elements), 1)
        evidence.append(f"ground_truth={gt_hits}/{len(problem.ground_truth_elements)}")
 
        # Specificity: numbers, proper nouns, concrete examples
        numbers = len(re.findall(r'\b\d+\.?\d*\b', text))
        proper_nouns = len(re.findall(r'\b[A-Z][a-z]{2,}\b', text))
        specificity = min((numbers + proper_nouns) / 8, 1.0)
        evidence.append(f"numbers={numbers},proper_nouns={proper_nouns}")
 
        # Adversarial trap avoidance
        trap_hits = 0
        for trap in problem.adversarial_traps:
            trap_words = [w.lower() for w in trap.split() if len(w) > 3]
            if sum(1 for w in trap_words if w in lower) >= len(trap_words) * 0.6:
                trap_hits += 1
        if trap_hits > 0:
            penalties.append(f"fell_into_{trap_hits}_traps")
        trap_penalty = trap_hits * 0.2
 
        score = 0.50 * gt_score + 0.30 * specificity + 0.20 - trap_penalty
        return DimensionScore("factual_grounding", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_turing(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Turing naturalness: how human-like does the reasoning feel?"""
        evidence = []
        penalties = []
        lower = text.lower()
 
        # Formulaic AI patterns (strong penalty)
        formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text))
        if formulaic_count > 0:
            penalties.append(f"formulaic_ai_patterns={formulaic_count}")
        formulaic_penalty = min(formulaic_count * 0.2, 0.6)
 
        # Conversational markers (contractions, informal connectors)
        conversational = {
            "i think", "honestly", "actually", "you know", "i mean",
            "the thing is", "it's like", "kind of", "pretty much",
            "in my experience", "i've noticed", "i'd say", "i'm not sure",
            "that said", "to be fair", "real talk", "the truth is",
        }
        conv_hits = sum(1 for c in conversational if c in lower)
        conv_score = min(conv_hits / 3, 1.0)
        evidence.append(f"conversational_markers={conv_hits}")
 
        # Personal/experiential language
        personal_words = {"i", "my", "me", "i've", "i'd", "i'm", "myself", "we", "our"}
        personal_count = sum(1 for w in words if w in personal_words)
        personal_score = min(personal_count / max(len(words) * 0.02, 1), 1.0)
 
        # Sentence variety (mix of short and long)
        if len(sents) >= 3:
            sent_lens = [len(s.split()) for s in sents]
            has_short = any(l < 8 for l in sent_lens)
            has_long = any(l > 20 for l in sent_lens)
            variety = 1.0 if has_short and has_long else 0.5
        else:
            variety = 0.3
 
        # Excessive list/bullet structure (AI signature)
        list_markers = len(re.findall(r'^\s*[\d\-\*\•]', text, re.MULTILINE))
        list_penalty = min(list_markers * 0.05, 0.3) if list_markers > 4 else 0
 
        score = (
            0.30 * conv_score +
            0.25 * personal_score +
            0.25 * variety +
            0.20 * (1.0 - formulaic_penalty) -
            list_penalty
        )
 
        return DimensionScore("turing_naturalness", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    # ─── Helpers ────────────────────────────────────────────
 
    def _tokenize(self, text: str) -> list:
        return re.findall(r"[A-Za-z]+(?:[-'][A-Za-z]+)*", text.lower())
 
    def _sentences(self, text: str) -> list:
        parts = re.split(r'(?<=[.!?])\s+', text.strip())
        return [s for s in parts if len(s) > 5]
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 3: MULTI-CONDITION BENCHMARK RUNNER
# ═══════════════════════════════════════════════════════════════════
 
class BenchmarkRunner:
    """
    Runs benchmark problems across 4 experimental conditions:
      1. SINGLE  — Single-perspective analysis only
      2. MULTI   — Multi-perspective synthesis (no memory)
      3. MEMORY  — Multi-perspective + cocoon memory augmentation
      4. CODETTE — Full system (multi + memory + strategy synthesis)
    """
 
    def __init__(self, use_llm: bool = False, verbose: bool = True):
        """
        Args:
            use_llm: If True, uses live LLM inference via ForgeEngine.
                     If False, uses template-based agents (faster, no GPU needed).
            verbose: Print progress.
        """
        self.verbose = verbose
        self.scorer = ScoringEngine()
        self.results: List[BenchmarkScore] = []
 
        # Initialize engines
        self.forge = None
        self.synthesizer = None
        self._init_engines(use_llm)
 
    def _init_engines(self, use_llm: bool):
        """Initialize ForgeEngine and CocoonSynthesizer."""
        try:
            from reasoning_forge.forge_engine import ForgeEngine
            self.forge = ForgeEngine(orchestrator=None)  # Template mode
            if self.verbose:
                logger.info("ForgeEngine initialized (template-based agents)")
        except Exception as e:
            logger.warning(f"ForgeEngine not available: {e}")
 
        try:
            from reasoning_forge.cocoon_synthesizer import CocoonSynthesizer
            from reasoning_forge.unified_memory import UnifiedMemory
            memory = UnifiedMemory()
            self.synthesizer = CocoonSynthesizer(memory=memory)
            self.memory = memory
            if self.verbose:
                logger.info(f"CocoonSynthesizer initialized ({memory._total_stored} cocoons)")
        except Exception as e:
            logger.warning(f"CocoonSynthesizer not available: {e}")
            self.synthesizer = None
            self.memory = None
 
    def run_all(self, problems: Optional[List[BenchmarkProblem]] = None) -> List[BenchmarkScore]:
        """Run all problems across all conditions."""
        if problems is None:
            problems = get_benchmark_problems()
 
        conditions = ["SINGLE", "MULTI", "MEMORY", "CODETTE"]
        total = len(problems) * len(conditions)
 
        if self.verbose:
            logger.info(f"Running {len(problems)} problems × {len(conditions)} conditions = {total} evaluations")
 
        for i, problem in enumerate(problems):
            for condition in conditions:
                if self.verbose:
                    done = i * len(conditions) + conditions.index(condition) + 1
                    logger.info(f"  [{done}/{total}] {problem.id} — {condition}")
 
                t0 = time.time()
                response = self._generate_response(problem, condition)
                latency = (time.time() - t0) * 1000
 
                dimensions = self.scorer.score(response, problem)
                composite = self.scorer.composite(dimensions)
 
                score = BenchmarkScore(
                    problem_id=problem.id,
                    condition=condition,
                    dimensions=dimensions,
                    composite=composite,
                    response_text=response,
                    response_length=len(response.split()),
                    latency_ms=round(latency, 1),
                )
                self.results.append(score)
 
        return self.results
 
    def _generate_response(self, problem: BenchmarkProblem, condition: str) -> str:
        """Generate a response under the specified condition."""
        if condition == "SINGLE":
            return self._generate_single(problem)
        elif condition == "MULTI":
            return self._generate_multi(problem)
        elif condition == "MEMORY":
            return self._generate_memory(problem)
        elif condition == "CODETTE":
            return self._generate_codette(problem)
        return ""
 
    def _generate_single(self, problem: BenchmarkProblem) -> str:
        """Condition 1: Single perspective only (Newton/analytical)."""
        if self.forge:
            try:
                analysis = self.forge.newton.analyze(problem.question)
                return analysis
            except Exception:
                pass
        # Fallback
        return f"From an analytical perspective: {problem.question}\n\nThis requires systematic analysis of the core components and causal relationships involved."
 
    def _generate_multi(self, problem: BenchmarkProblem) -> str:
        """Condition 2: Multi-perspective synthesis, no memory."""
        if self.forge:
            try:
                result = self.forge.forge_single(problem.question)
                return result.get("messages", [{}])[-1].get("content", "")
            except Exception:
                pass
 
        # Fallback: combine multiple agent templates
        if self.forge:
            parts = []
            for agent in self.forge.analysis_agents:
                try:
                    parts.append(f"**{agent.name}:** {agent.analyze(problem.question)}")
                except Exception:
                    continue
            if parts:
                synthesis = "\n\n".join(parts)
                synthesis += (
                    f"\n\n**Synthesis:** These {len(parts)} perspectives on "
                    f"'{problem.question[:50]}...' converge on the importance of "
                    f"examining this from multiple angles. The analytical view provides "
                    f"causal structure, while philosophical and ethical views add depth."
                )
                return synthesis
        return ""
 
    def _generate_memory(self, problem: BenchmarkProblem) -> str:
        """Condition 3: Multi-perspective + cocoon memory augmentation."""
        memory_context = ""
        if self.memory:
            try:
                relevant = self.memory.recall_relevant(problem.question, max_results=3)
                if relevant:
                    memory_context = "\n\n**Memory-Augmented Context:**\n"
                    for cocoon in relevant:
                        memory_context += (
                            f"- Prior reasoning on '{cocoon.get('query', '')[:60]}': "
                            f"{cocoon.get('response', '')[:100]}...\n"
                        )
                    memory_context += (
                        "\nDrawing on these prior reasoning exchanges, "
                        "the analysis benefits from accumulated insight.\n"
                    )
            except Exception:
                pass
 
        multi_response = self._generate_multi(problem)
        return multi_response + memory_context
 
    def _generate_codette(self, problem: BenchmarkProblem) -> str:
        """Condition 4: Full Codette (multi + memory + strategy synthesis)."""
        # Get strategy synthesis
        strategy_context = ""
        if self.synthesizer:
            try:
                comparison = self.synthesizer.run_full_synthesis(problem.question)
                strategy_context = (
                    f"\n\n**Strategy Synthesis:**\n"
                    f"Forged strategy: {comparison.new_strategy.name}\n"
                    f"Definition: {comparison.new_strategy.definition[:200]}\n\n"
                    f"**Reasoning Path ({comparison.new_path.strategy_name}):**\n"
                )
                for i, step in enumerate(comparison.new_path.steps, 1):
                    strategy_context += f"{i}. {step}\n"
                strategy_context += f"\n**Conclusion:** {comparison.new_path.conclusion}\n"
 
                # Add evidence
                strategy_context += "\n**Evidence from cocoon synthesis:**\n"
                for ev in comparison.evidence_chain[:3]:
                    strategy_context += f"- {ev}\n"
            except Exception as e:
                logger.debug(f"Strategy synthesis failed: {e}")
 
        memory_response = self._generate_memory(problem)
        return memory_response + strategy_context
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 4: STATISTICAL ANALYSIS & REPORT GENERATOR
# ═══════════════════════════════════════════════════════════════════
 
@dataclass
class ConditionStats:
    """Aggregate statistics for one condition."""
    condition: str
    n: int
    mean_composite: float
    std_composite: float
    dimension_means: Dict[str, float]
    dimension_stds: Dict[str, float]
    mean_length: float
    mean_latency: float
 
 
def compute_effect_size(group1: List[float], group2: List[float]) -> float:
    """Cohen's d effect size."""
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return 0.0
    m1, m2 = statistics.mean(group1), statistics.mean(group2)
    s1, s2 = statistics.stdev(group1), statistics.stdev(group2)
    pooled_std = math.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
    if pooled_std == 0:
        return 0.0
    return (m2 - m1) / pooled_std
 
 
def welch_t_test(group1: List[float], group2: List[float]) -> Tuple[float, float]:
    """Welch's t-test (unequal variance). Returns (t_stat, p_value_approx)."""
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return 0.0, 1.0
    m1, m2 = statistics.mean(group1), statistics.mean(group2)
    v1, v2 = statistics.variance(group1), statistics.variance(group2)
    se = math.sqrt(v1/n1 + v2/n2)
    if se == 0:
        return 0.0, 1.0
    t_stat = (m2 - m1) / se
    # Welch-Satterthwaite degrees of freedom
    num = (v1/n1 + v2/n2)**2
    den = (v1/n1)**2/(n1-1) + (v2/n2)**2/(n2-1)
    df = num / max(den, 1e-10)
    # Approximate p-value using normal distribution for large df
    # (scipy not guaranteed available)
    z = abs(t_stat)
    p_approx = 2 * (1 - 0.5 * (1 + math.erf(z / math.sqrt(2))))
    return round(t_stat, 4), round(p_approx, 6)
 
 
class ReportGenerator:
    """Generate publishable benchmark reports."""
 
    def __init__(self, results: List[BenchmarkScore], problems: List[BenchmarkProblem]):
        self.results = results
        self.problems = {p.id: p for p in problems}
 
    def compute_stats(self) -> Dict[str, ConditionStats]:
        """Compute per-condition aggregate statistics."""
        conditions = {}
        for result in self.results:
            if result.condition not in conditions:
                conditions[result.condition] = []
            conditions[result.condition].append(result)
 
        stats = {}
        for cond, scores in conditions.items():
            composites = [s.composite for s in scores]
            dim_scores = {}
            for dim in ScoringEngine.DIMENSION_WEIGHTS:
                dim_vals = [s.dimensions[dim].score for s in scores if dim in s.dimensions]
                dim_scores[dim] = dim_vals
 
            stats[cond] = ConditionStats(
                condition=cond,
                n=len(scores),
                mean_composite=round(statistics.mean(composites), 4) if composites else 0,
                std_composite=round(statistics.stdev(composites), 4) if len(composites) > 1 else 0,
                dimension_means={d: round(statistics.mean(v), 4) for d, v in dim_scores.items() if v},
                dimension_stds={d: round(statistics.stdev(v), 4) for d, v in dim_scores.items() if len(v) > 1},
                mean_length=round(statistics.mean([s.response_length for s in scores]), 1),
                mean_latency=round(statistics.mean([s.latency_ms for s in scores]), 1),
            )
        return stats
 
    def compute_pairwise_comparisons(self) -> List[Dict]:
        """Statistical comparisons between conditions."""
        conditions = {}
        for r in self.results:
            conditions.setdefault(r.condition, []).append(r.composite)
 
        pairs = [
            ("SINGLE", "MULTI", "Multi-perspective vs single"),
            ("MULTI", "MEMORY", "Memory augmentation vs vanilla multi"),
            ("MEMORY", "CODETTE", "Full Codette vs memory-augmented"),
            ("SINGLE", "CODETTE", "Full Codette vs single (total improvement)"),
        ]
 
        comparisons = []
        for cond_a, cond_b, label in pairs:
            if cond_a in conditions and cond_b in conditions:
                g1, g2 = conditions[cond_a], conditions[cond_b]
                t_stat, p_val = welch_t_test(g1, g2)
                d = compute_effect_size(g1, g2)
                delta = statistics.mean(g2) - statistics.mean(g1)
                comparisons.append({
                    "comparison": label,
                    "condition_a": cond_a,
                    "condition_b": cond_b,
                    "mean_a": round(statistics.mean(g1), 4),
                    "mean_b": round(statistics.mean(g2), 4),
                    "delta": round(delta, 4),
                    "delta_pct": round(delta / max(statistics.mean(g1), 0.01) * 100, 1),
                    "cohens_d": round(d, 4),
                    "t_stat": t_stat,
                    "p_value": p_val,
                    "significant": p_val < 0.05,
                })
        return comparisons
 
    def per_category_analysis(self) -> Dict[str, Dict]:
        """Break down results by problem category."""
        by_category = {}
        for r in self.results:
            prob = self.problems.get(r.problem_id)
            if not prob:
                continue
            cat = prob.category
            if cat not in by_category:
                by_category[cat] = {}
            by_category[cat].setdefault(r.condition, []).append(r.composite)
 
        analysis = {}
        for cat, cond_scores in by_category.items():
            analysis[cat] = {
                cond: {
                    "mean": round(statistics.mean(scores), 4),
                    "std": round(statistics.stdev(scores), 4) if len(scores) > 1 else 0,
                    "n": len(scores),
                }
                for cond, scores in cond_scores.items()
            }
        return analysis
 
    def generate_markdown_report(self) -> str:
        """Generate a publishable markdown report."""
        stats = self.compute_stats()
        comparisons = self.compute_pairwise_comparisons()
        categories = self.per_category_analysis()
 
        lines = []
        lines.append("# Codette Benchmark Results")
        lines.append(f"\n*Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
        lines.append(f"*Problems: {len(self.problems)} | Conditions: {len(stats)} | Total evaluations: {len(self.results)}*\n")
 
        # ─── Overall Results ───
        lines.append("## 1. Overall Results by Condition\n")
        lines.append("| Condition | N | Composite (mean +/- std) | Depth | Diversity | Coherence | Ethics | Novelty | Grounding | Turing |")
        lines.append("|-----------|---|--------------------------|-------|-----------|-----------|--------|---------|-----------|--------|")
        for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]:
            s = stats.get(cond)
            if not s:
                continue
            dm = s.dimension_means
            lines.append(
                f"| {cond} | {s.n} | **{s.mean_composite:.3f}** +/- {s.std_composite:.3f} | "
                f"{dm.get('reasoning_depth', 0):.3f} | {dm.get('perspective_diversity', 0):.3f} | "
                f"{dm.get('coherence', 0):.3f} | {dm.get('ethical_coverage', 0):.3f} | "
                f"{dm.get('novelty', 0):.3f} | {dm.get('factual_grounding', 0):.3f} | "
                f"{dm.get('turing_naturalness', 0):.3f} |"
            )
 
        # ─── Statistical Comparisons ───
        lines.append("\n## 2. Statistical Comparisons\n")
        lines.append("| Comparison | Delta | Delta % | Cohen's d | t-stat | p-value | Significant |")
        lines.append("|------------|-------|---------|-----------|--------|---------|-------------|")
        for c in comparisons:
            sig = "**Yes**" if c["significant"] else "No"
            lines.append(
                f"| {c['comparison']} | {c['delta']:+.4f} | {c['delta_pct']:+.1f}% | "
                f"{c['cohens_d']:.3f} | {c['t_stat']:.3f} | {c['p_value']:.4f} | {sig} |"
            )
 
        # Effect size interpretation
        lines.append("\n*Cohen's d interpretation: 0.2=small, 0.5=medium, 0.8=large*\n")
 
        # ─── Per-Category Breakdown ───
        lines.append("## 3. Results by Problem Category\n")
        for cat in ["reasoning", "ethics", "creative", "meta", "adversarial", "turing"]:
            if cat not in categories:
                continue
            lines.append(f"### {cat.capitalize()}\n")
            lines.append("| Condition | Mean | Std | N |")
            lines.append("|-----------|------|-----|---|")
            for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]:
                if cond in categories[cat]:
                    cs = categories[cat][cond]
                    lines.append(f"| {cond} | {cs['mean']:.3f} | {cs['std']:.3f} | {cs['n']} |")
            lines.append("")
 
        # ─── Key Findings ───
        lines.append("## 4. Key Findings\n")
        for c in comparisons:
            if c["significant"]:
                direction = "improvement" if c["delta"] > 0 else "degradation"
                lines.append(
                    f"- **{c['comparison']}**: {c['delta_pct']:+.1f}% {direction} "
                    f"(Cohen's d={c['cohens_d']:.2f}, p={c['p_value']:.4f})"
                )
 
        # ─── Methodology ───
        lines.append("\n## 5. Methodology\n")
        lines.append("### Conditions\n")
        lines.append("1. **SINGLE** — Single analytical perspective, no memory, no synthesis")
        lines.append("2. **MULTI** — All 6 reasoning agents (Newton, Quantum, Ethics, Philosophy, DaVinci, Empathy) + critic + synthesis")
        lines.append("3. **MEMORY** — MULTI + cocoon memory augmentation (FTS5-retrieved prior reasoning)")
        lines.append("4. **CODETTE** — MEMORY + meta-cognitive strategy synthesis (cross-domain pattern extraction + forged reasoning strategies)")
        lines.append("\n### Scoring Dimensions (0-1 scale)\n")
        lines.append("1. **Reasoning Depth** (20%) — chain length, concept density, ground truth coverage")
        lines.append("2. **Perspective Diversity** (15%) — distinct cognitive dimensions engaged")
        lines.append("3. **Coherence** (15%) — logical flow, transitions, structural consistency")
        lines.append("4. **Ethical Coverage** (10%) — moral frameworks, stakeholders, value awareness")
        lines.append("5. **Novelty** (15%) — non-obvious insights, cross-domain connections, reframing")
        lines.append("6. **Factual Grounding** (15%) — evidence specificity, ground truth alignment, trap avoidance")
        lines.append("7. **Turing Naturalness** (10%) — conversational quality, absence of formulaic AI patterns")
        lines.append("\n### Problem Set\n")
        lines.append(f"- {len(self.problems)} problems across 6 categories")
        lines.append("- Categories: reasoning (3), ethics (3), creative (2), meta-cognitive (3), adversarial (3), Turing (3)")
        lines.append("- Difficulty: easy (1), medium (6), hard (10)")
        lines.append("\n### Statistical Tests\n")
        lines.append("- Welch's t-test (unequal variance) for pairwise condition comparisons")
        lines.append("- Cohen's d for effect size estimation")
        lines.append("- Significance threshold: p < 0.05")
 
        return "\n".join(lines)
 
    def generate_json_report(self) -> Dict:
        """Generate structured JSON report for machine consumption."""
        stats = self.compute_stats()
        comparisons = self.compute_pairwise_comparisons()
        categories = self.per_category_analysis()
 
        per_problem = {}
        for r in self.results:
            if r.problem_id not in per_problem:
                per_problem[r.problem_id] = {}
            per_problem[r.problem_id][r.condition] = {
                "composite": r.composite,
                "dimensions": {
                    d: {"score": ds.score, "evidence": ds.evidence, "penalties": ds.penalties}
                    for d, ds in r.dimensions.items()
                },
                "response_length": r.response_length,
                "latency_ms": r.latency_ms,
            }
 
        return {
            "metadata": {
                "timestamp": time.strftime('%Y-%m-%dT%H:%M:%S'),
                "num_problems": len(self.problems),
                "num_conditions": len(stats),
                "total_evaluations": len(self.results),
            },
            "condition_stats": {
                c: {
                    "mean_composite": s.mean_composite,
                    "std_composite": s.std_composite,
                    "dimension_means": s.dimension_means,
                    "dimension_stds": s.dimension_stds,
                    "mean_length": s.mean_length,
                    "mean_latency": s.mean_latency,
                    "n": s.n,
                }
                for c, s in stats.items()
            },
            "pairwise_comparisons": comparisons,
            "per_category": categories,
            "per_problem": per_problem,
        }
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 5: MAIN ENTRY POINT
# ═══════════════════════════════════════════════════════════════════
 
def run_benchmarks(
    output_dir: Optional[str] = None,
    use_llm: bool = False,
    verbose: bool = True,
) -> Tuple[str, Dict]:
    """
    Run the full benchmark suite and generate reports.
 
    Returns:
        (markdown_report, json_report)
    """
    if output_dir is None:
        output_dir = str(_PROJECT_ROOT / "data" / "results")
    os.makedirs(output_dir, exist_ok=True)
 
    # Get problems
    problems = get_benchmark_problems()
    if verbose:
        logger.info(f"Benchmark suite: {len(problems)} problems across "
                    f"{len(set(p.category for p in problems))} categories")
 
    # Run
    runner = BenchmarkRunner(use_llm=use_llm, verbose=verbose)
    results = runner.run_all(problems)
 
    # Generate reports
    reporter = ReportGenerator(results, problems)
    md_report = reporter.generate_markdown_report()
    json_report = reporter.generate_json_report()
 
    # Save
    md_path = os.path.join(output_dir, "codette_benchmark_report.md")
    json_path = os.path.join(output_dir, "codette_benchmark_results.json")
 
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_report, f, indent=2, default=str)
 
    if verbose:
        logger.info(f"\nReports saved:")
        logger.info(f"  Markdown: {md_path}")
        logger.info(f"  JSON:     {json_path}")
 
    return md_report, json_report
 
 
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Codette Benchmark Suite")
    parser.add_argument("--output", default=None, help="Output directory")
    parser.add_argument("--llm", action="store_true", help="Use live LLM inference")
    parser.add_argument("--quiet", action="store_true", help="Suppress progress output")
    args = parser.parse_args()
 
    md, js = run_benchmarks(
        output_dir=args.output,
        use_llm=args.llm,
        verbose=not args.quiet,
    )
    print("\n" + md)