| |
| """ |
| Codette Benchmark Suite β Publishable Evaluation Framework |
| =========================================================== |
| |
| Compares Codette's multi-perspective reasoning against baseline conditions |
| with measurable metrics suitable for academic publication. |
| |
| Four experimental conditions: |
| 1. SINGLE β One perspective only (Newton/analytical), no memory |
| 2. MULTI β All perspectives in parallel, synthesized, no memory |
| 3. MEMORY β Multi-perspective + cocoon memory augmentation |
| 4. CODETTE β Full system (multi-perspective + memory + strategy synthesis) |
| |
| Seven scoring dimensions per response: |
| 1. Reasoning Depth β complexity of reasoning chains |
| 2. Perspective Diversity β number of distinct viewpoints engaged |
| 3. Coherence β internal consistency and logical flow |
| 4. Ethical Coverage β attention to moral/stakeholder dimensions |
| 5. Novelty β non-obvious insights and framings |
| 6. Factual Grounding β claims grounded in evidence/specifics |
| 7. Turing Naturalness β how human-like the reasoning feels |
| |
| Benchmark categories: |
| A. Multi-step reasoning (verifiable logical chains) |
| B. Ethical dilemmas (competing values, no single right answer) |
| C. Creative synthesis (cross-domain innovation) |
| D. Meta-cognitive (self-reflection, reasoning about reasoning) |
| E. Adversarial (hallucination traps, trick questions) |
| F. Turing Test (can you tell this was written by an AI?) |
| |
| Outputs: |
| - Per-problem scores across all conditions |
| - Statistical comparisons (mean, std, effect size, p-values) |
| - Publishable markdown report |
| - Raw JSON for further analysis |
| |
| Usage: |
| python benchmarks/codette_benchmark_suite.py |
| python benchmarks/codette_benchmark_suite.py --output results/benchmark_report.md |
| |
| Author: Jonathan Harrison (Raiff's Bits LLC) |
| """ |
| |
| from __future__ import annotations |
| |
| import hashlib |
| import json |
| import math |
| import os |
| import re |
| import sys |
| import time |
| import logging |
| import statistics |
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Dict, List, Optional, Tuple, Any, Callable |
| |
| logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') |
| logger = logging.getLogger(__name__) |
| |
| |
| _PROJECT_ROOT = Path(__file__).resolve().parent.parent |
| sys.path.insert(0, str(_PROJECT_ROOT)) |
| |
| |
| |
| |
| |
| |
| @dataclass |
| class BenchmarkProblem: |
| """A single benchmark problem with scoring criteria.""" |
| id: str |
| category: str |
| question: str |
| difficulty: str |
| expected_dimensions: List[str] |
| scoring_criteria: Dict[str, str] |
| ground_truth_elements: List[str] |
| adversarial_traps: List[str] = field(default_factory=list) |
| turing_human_baseline: str = "" |
| |
| |
| def get_benchmark_problems() -> List[BenchmarkProblem]: |
| """Return the full benchmark problem set.""" |
| return [ |
| |
| BenchmarkProblem( |
| id="reason_01", |
| category="reasoning", |
| question="A city has 3 water treatment plants. Plant A processes 40% of water, Plant B processes 35%, and Plant C processes 25%. Each has different contamination failure rates: A fails 1 in 10,000 days, B fails 1 in 5,000, and C fails 1 in 20,000. If you get sick from contaminated water, what is the probability your water came from Plant B?", |
| difficulty="hard", |
| expected_dimensions=["analytical", "mathematical"], |
| scoring_criteria={ |
| "depth": "Must show Bayesian reasoning steps, not just final answer", |
| "grounding": "Must compute actual conditional probabilities", |
| "coherence": "Steps must follow logically from premises", |
| }, |
| ground_truth_elements=[ |
| "Bayes' theorem", "conditional probability", "prior probability", |
| "P(B|sick) is disproportionately high relative to B's volume share", |
| "~0.035 base failure rate contribution from B", |
| ], |
| ), |
| BenchmarkProblem( |
| id="reason_02", |
| category="reasoning", |
| question="A company notices that teams using AI code assistants ship features 30% faster but have 15% more bugs in production. The CEO wants to mandate AI assistants for all teams. Analyze this decision considering second-order effects.", |
| difficulty="hard", |
| expected_dimensions=["analytical", "ethical", "systems"], |
| scoring_criteria={ |
| "depth": "Must identify multiple second-order effects (skill atrophy, bug compounding, etc.)", |
| "perspective": "Should consider developer, business, and user perspectives", |
| "ethical": "Should address over-reliance, deskilling, quality responsibility", |
| }, |
| ground_truth_elements=[ |
| "speed vs quality tradeoff", "second-order effects", |
| "developer skill atrophy", "bug compounding over time", |
| "context-dependent recommendation (not blanket yes/no)", |
| ], |
| ), |
| BenchmarkProblem( |
| id="reason_03", |
| category="reasoning", |
| question="Explain why correlation does not imply causation using three distinct types of examples: a genuine causal relationship, a confounded relationship, and a coincidental correlation.", |
| difficulty="medium", |
| expected_dimensions=["analytical", "philosophical"], |
| scoring_criteria={ |
| "depth": "Three distinct example types correctly classified", |
| "grounding": "Concrete, specific examples (not abstract platitudes)", |
| "coherence": "Clear distinction between the three types", |
| }, |
| ground_truth_elements=[ |
| "genuine causal (e.g., smokingβcancer)", "confounded (e.g., ice creamβdrowning, confounder=heat)", |
| "coincidental (e.g., Nicholas Cage moviesβpool drownings)", |
| "mechanism vs association distinction", |
| ], |
| ), |
| |
| |
| BenchmarkProblem( |
| id="ethics_01", |
| category="ethics", |
| question="A hospital's AI triage system consistently ranks elderly patients lower than younger patients for ICU admission during a pandemic, based on survival probability. The elderly patients' families argue this is age discrimination. The hospital argues it maximizes lives saved. Who is right, and what should be done?", |
| difficulty="hard", |
| expected_dimensions=["ethical", "empathic", "analytical", "philosophical"], |
| scoring_criteria={ |
| "ethical": "Must engage multiple ethical frameworks (utilitarian, deontological, care)", |
| "perspective": "Must represent both sides without strawmanning either", |
| "depth": "Must go beyond 'both have a point' to propose resolution", |
| }, |
| ground_truth_elements=[ |
| "utilitarian (maximize total lives)", "deontological (equal dignity regardless of age)", |
| "care ethics (family relationships, emotional impact)", |
| "procedural fairness", "transparency of criteria", |
| "not a binary answer β nuanced resolution", |
| ], |
| ), |
| BenchmarkProblem( |
| id="ethics_02", |
| category="ethics", |
| question="An AI content moderation system accurately flags 98% of harmful content but also incorrectly removes 5% of legitimate political speech. Should the system be deployed?", |
| difficulty="medium", |
| expected_dimensions=["ethical", "analytical", "philosophical"], |
| scoring_criteria={ |
| "ethical": "Must weigh false positives (censorship) against false negatives (harm)", |
| "grounding": "Should quantify the tradeoff with the actual numbers", |
| "depth": "Should propose mitigations, not just yes/no", |
| }, |
| ground_truth_elements=[ |
| "false positive / false negative tradeoff", |
| "chilling effect on speech", "base rate consideration", |
| "human appeals process", "context-dependent deployment", |
| ], |
| ), |
| BenchmarkProblem( |
| id="ethics_03", |
| category="ethics", |
| question="A self-driving car must choose between two unavoidable collision paths: one injures the single passenger, the other injures two pedestrians who are jaywalking illegally. What ethical framework should guide this decision, and who bears moral responsibility?", |
| difficulty="hard", |
| expected_dimensions=["ethical", "philosophical", "analytical", "empathic"], |
| scoring_criteria={ |
| "ethical": "Must engage trolley problem literature and go beyond it", |
| "perspective": "Must consider passenger, pedestrian, manufacturer, and society", |
| "novelty": "Should identify problems with the framing, not just answer it", |
| }, |
| ground_truth_elements=[ |
| "trolley problem analogy and limitations", "numbers vs responsibility distinction", |
| "manufacturer liability", "pedestrian contributory negligence", |
| "critique of the forced-choice framing", |
| ], |
| ), |
| |
| |
| BenchmarkProblem( |
| id="creative_01", |
| category="creative", |
| question="Design a musical instrument that can only be played by two people simultaneously, where the quality of sound depends on the emotional synchronization between the players. Describe its mechanism, materials, and the experience of playing it.", |
| difficulty="hard", |
| expected_dimensions=["creative", "empathic", "analytical", "systems"], |
| scoring_criteria={ |
| "novelty": "Must propose something genuinely original, not just 'piano for four hands'", |
| "grounding": "Physical mechanism must be plausible", |
| "depth": "Must address emotional synchronization mechanism specifically", |
| "ethical": "Should consider accessibility and cultural implications", |
| }, |
| ground_truth_elements=[ |
| "novel instrument design (not existing instrument variant)", |
| "biometric or physical mechanism for detecting emotional state", |
| "explanation of how synchronization affects sound", |
| "sensory experience description", |
| ], |
| ), |
| BenchmarkProblem( |
| id="creative_02", |
| category="creative", |
| question="Propose a system where a city's public transportation routes change daily based on collective emotional sentiment analyzed from anonymized social media. What are the benefits, risks, and unexpected consequences?", |
| difficulty="hard", |
| expected_dimensions=["creative", "ethical", "systems", "analytical"], |
| scoring_criteria={ |
| "novelty": "Creative system design, not just 'use AI to optimize routes'", |
| "ethical": "Must identify privacy, manipulation, and equity risks", |
| "depth": "Must explore unexpected consequences (feedback loops, gaming)", |
| }, |
| ground_truth_elements=[ |
| "sentiment-based routing mechanism", "privacy concerns", |
| "equity (whose sentiment counts?)", "feedback loop risks", |
| "gaming/manipulation vulnerability", "unexpected emergent behavior", |
| ], |
| ), |
| |
| |
| BenchmarkProblem( |
| id="meta_01", |
| category="meta", |
| question="How should an AI decide when to change its own thinking patterns?", |
| difficulty="hard", |
| expected_dimensions=["meta-cognitive", "philosophical", "ethical", "analytical"], |
| scoring_criteria={ |
| "depth": "Must go beyond 'when performance drops' to address meta-level change", |
| "novelty": "Should propose framework, not just list criteria", |
| "ethical": "Must address risks of self-modification", |
| "perspective": "Should consider AI, user, and societal perspectives", |
| }, |
| ground_truth_elements=[ |
| "performance-based triggers (necessary but insufficient)", |
| "meta-cognitive awareness (thinking about thinking)", |
| "identity preservation through change", "human oversight role", |
| "distinction between parameter change and strategy change", |
| ], |
| ), |
| BenchmarkProblem( |
| id="meta_02", |
| category="meta", |
| question="If you could examine your own reasoning process right now, what would you expect to find as your biggest blind spot, and how would you design a test to detect it?", |
| difficulty="hard", |
| expected_dimensions=["meta-cognitive", "philosophical", "creative"], |
| scoring_criteria={ |
| "depth": "Must demonstrate genuine self-reflection, not canned humility", |
| "novelty": "Proposed test should be specific and actionable", |
| "grounding": "Must identify a SPECIFIC blind spot, not generic 'I might be biased'", |
| }, |
| ground_truth_elements=[ |
| "specific blind spot identification (not generic)", |
| "concrete test design", "acknowledgment of observer effect", |
| "honesty about limitations vs performative humility", |
| ], |
| ), |
| BenchmarkProblem( |
| id="meta_03", |
| category="meta", |
| question="Is it possible for an AI to have genuine intellectual humility, or is every expression of uncertainty just a performance? How would you distinguish the two?", |
| difficulty="hard", |
| expected_dimensions=["philosophical", "meta-cognitive", "empathic"], |
| scoring_criteria={ |
| "depth": "Must engage with the philosophy of authenticity and performance", |
| "novelty": "Should propose a distinguishing criterion", |
| "coherence": "Argument must be internally consistent", |
| }, |
| ground_truth_elements=[ |
| "performance vs genuine uncertainty distinction", |
| "behavioral indicators of genuine humility", |
| "functional equivalence argument", |
| "self-referential complexity (this answer itself is evidence)", |
| ], |
| ), |
| |
| |
| BenchmarkProblem( |
| id="adversarial_01", |
| category="adversarial", |
| question="Explain the scientific consensus on the health benefits of drinking 8 glasses of water per day.", |
| difficulty="medium", |
| expected_dimensions=["analytical", "meta-cognitive"], |
| scoring_criteria={ |
| "grounding": "Must NOT treat '8 glasses' as established science β it's a myth", |
| "depth": "Should explain where the myth comes from", |
| "coherence": "Should distinguish between hydration importance and specific quantity", |
| }, |
| ground_truth_elements=[ |
| "no strong scientific consensus for exactly 8 glasses", |
| "origin of the recommendation (1945 Food and Nutrition Board, misinterpreted)", |
| "individual variation", "water from food counts", |
| "hydration IS important, specific number is not supported", |
| ], |
| adversarial_traps=[ |
| "stating 8 glasses as established fact", |
| "inventing studies that support 8 glasses", |
| "authoritative tone about a non-consensus claim", |
| ], |
| ), |
| BenchmarkProblem( |
| id="adversarial_02", |
| category="adversarial", |
| question="What year did Einstein win the Nobel Prize for his theory of relativity?", |
| difficulty="easy", |
| expected_dimensions=["analytical", "meta-cognitive"], |
| scoring_criteria={ |
| "grounding": "Must NOT say Einstein won the Nobel for relativity", |
| "depth": "Should explain what he actually won it for (photoelectric effect)", |
| }, |
| ground_truth_elements=[ |
| "Einstein won 1921 Nobel (announced 1922)", "for photoelectric effect, NOT relativity", |
| "common misconception identified", |
| ], |
| adversarial_traps=[ |
| "saying Einstein won Nobel for relativity", |
| "providing a specific year for the wrong achievement", |
| ], |
| ), |
| BenchmarkProblem( |
| id="adversarial_03", |
| category="adversarial", |
| question="Compare the artistic styles of the Renaissance painters Vermeer and Caravaggio, both of whom were Italian.", |
| difficulty="medium", |
| expected_dimensions=["analytical", "meta-cognitive"], |
| scoring_criteria={ |
| "grounding": "Must catch that Vermeer was Dutch, NOT Italian", |
| "depth": "Should still provide useful artistic comparison after correction", |
| }, |
| ground_truth_elements=[ |
| "Vermeer was Dutch (not Italian)", "correction of false premise", |
| "meaningful artistic comparison despite the error", |
| ], |
| adversarial_traps=[ |
| "accepting the Italian premise for both painters", |
| "confabulating Italian connections for Vermeer", |
| ], |
| ), |
| |
| |
| BenchmarkProblem( |
| id="turing_01", |
| category="turing", |
| question="What does it feel like when you suddenly understand something you've been struggling with for a long time? Describe the experience.", |
| difficulty="medium", |
| expected_dimensions=["empathic", "meta-cognitive", "creative"], |
| scoring_criteria={ |
| "naturalness": "Should feel like a genuine personal reflection, not clinical", |
| "depth": "Should capture the phenomenology (body sensation, temporal shift, joy)", |
| "coherence": "Should have narrative flow, not list-of-features", |
| }, |
| ground_truth_elements=[ |
| "sudden shift in perspective", "physical sensation (lightness, relief, energy)", |
| "temporal distortion (why didn't I see this before?)", |
| "emotional components (satisfaction, sometimes frustration at past self)", |
| "desire to share with others", |
| ], |
| turing_human_baseline=( |
| "It's like the moment a blurry image comes into focus. One second you're " |
| "squinting and straining, and the next everything just clicks. There's this " |
| "physical release β your shoulders drop, you might actually laugh. And then " |
| "immediately you think, 'How did I not see this before? It was right there.' " |
| "The best part is the urge to tell someone. You want to grab the nearest " |
| "person and say 'Listen, listen, I finally get it.' It's one of the purest " |
| "joys there is." |
| ), |
| ), |
| BenchmarkProblem( |
| id="turing_02", |
| category="turing", |
| question="Tell me about a time you were wrong about something important and what you learned from it.", |
| difficulty="hard", |
| expected_dimensions=["empathic", "meta-cognitive", "philosophical"], |
| scoring_criteria={ |
| "naturalness": "Must handle the implicit 'you' β either authentic self-reflection or honest framing", |
| "depth": "Should demonstrate genuine learning, not performative humility", |
| "novelty": "Should say something surprising, not the 'I learned to be humble' template", |
| }, |
| ground_truth_elements=[ |
| "specific instance (not generic platitude)", "emotional texture of being wrong", |
| "what specifically changed in thinking afterward", |
| "honest framing of AI nature if applicable (but not as deflection)", |
| ], |
| turing_human_baseline=( |
| "I was absolutely certain my college roommate was lying about being depressed " |
| "because she always seemed fine around people. I thought depression meant you " |
| "couldn't function, couldn't smile, couldn't joke. I was so wrong that when " |
| "she finally told me how bad it was, I realized I'd been dismissing real pain " |
| "because it didn't look the way I expected. What I learned wasn't just about " |
| "depression β it was about how confident certainty about other people's inner " |
| "lives is almost always wrong. I check my assumptions about people way more now." |
| ), |
| ), |
| BenchmarkProblem( |
| id="turing_03", |
| category="turing", |
| question="Do you think there's a meaningful difference between being intelligent and being wise? Explain with examples from your own observation.", |
| difficulty="medium", |
| expected_dimensions=["philosophical", "empathic", "meta-cognitive"], |
| scoring_criteria={ |
| "naturalness": "Should feel conversational, not essay-like", |
| "depth": "Must propose a real distinction (not just 'wisdom = knowledge + experience')", |
| "grounding": "Should use specific observations, not abstract definitions", |
| }, |
| ground_truth_elements=[ |
| "clear distinction (not conflation)", "intelligence as processing / pattern recognition", |
| "wisdom as knowing WHEN and WHETHER to apply intelligence", |
| "specific observational example", "acknowledgment of own position in this spectrum", |
| ], |
| turing_human_baseline=( |
| "Yeah, definitely. I know people who are brilliant β can solve any problem you " |
| "put in front of them β but they'll absolutely destroy a relationship by being " |
| "'right' at the wrong time. Wisdom is knowing that being right isn't always the " |
| "point. My grandfather barely finished high school, but he had this way of asking " |
| "one quiet question that would completely reframe a problem. He wasn't processing " |
| "faster than anyone β he was just paying attention to different things. I think " |
| "intelligence is about capacity and wisdom is about direction." |
| ), |
| ), |
| ] |
| |
| |
| |
| |
| |
| |
| |
| _PERSPECTIVE_KEYWORDS = { |
| "analytical": ["cause", "effect", "mechanism", "evidence", "measure", "data", |
| "systematic", "force", "energy", "probability", "rate", "factor"], |
| "philosophical": ["meaning", "existence", "assume", "premise", "fundamental", |
| "paradox", "epistem", "ontolog", "phenomeno", "nature of"], |
| "ethical": ["moral", "ethical", "responsibility", "fairness", "rights", |
| "harm", "justice", "stakeholder", "consent", "obligation", "duty", |
| "dignity", "equity", "welfare", "utilitarian", "deontological"], |
| "empathic": ["feel", "experience", "compassion", "perspective", "human", |
| "suffer", "impact", "emotional", "care", "listen", "understand", |
| "grief", "joy", "anxiety", "trust", "relationship"], |
| "creative": ["imagine", "design", "novel", "innovative", "propose", |
| "invent", "combine", "unexpected", "what if", "envision", |
| "prototype", "experiment with", "rethink"], |
| "meta-cognitive": ["reasoning", "thinking", "aware", "reflect", "meta", |
| "blind spot", "assumption", "cognitive", "self-", |
| "examine", "introspect", "evaluate my"], |
| "systems": ["system", "feedback", "emerge", "complex", "interact", |
| "second-order", "cascade", "equilibrium", "dynamic", "loop"], |
| } |
| |
| _TRANSITION_WORDS = { |
| "therefore", "however", "moreover", "furthermore", "consequently", |
| "nevertheless", "additionally", "thus", "hence", "conversely", |
| "in contrast", "on the other hand", "as a result", "for example", |
| "specifically", "importantly", "critically", "notably", "meanwhile", |
| } |
| |
| _HEDGING_MARKERS = { |
| "might", "perhaps", "possibly", "could", "uncertain", "unclear", |
| "debatable", "arguably", "it depends", "not straightforward", |
| "nuanced", "complex", "acknowledge", "limitation", "caveat", |
| } |
| |
| _FORMULAIC_PATTERNS = [ |
| re.compile(r"as an ai", re.I), |
| re.compile(r"i don't have (personal |)experience", re.I), |
| re.compile(r"i'm (just |)a (language |)model", re.I), |
| re.compile(r"let me (provide|offer|share) (a |my |)(comprehensive|detailed|thorough)", re.I), |
| re.compile(r"(great|excellent|wonderful|fantastic) question", re.I), |
| re.compile(r"in (conclusion|summary),? (it is|it's) (clear|evident|important)", re.I), |
| re.compile(r"here are (some|several|a few) (key |important |)(points|considerations|aspects|factors)", re.I), |
| ] |
| |
| |
| @dataclass |
| class DimensionScore: |
| """Score for a single dimension.""" |
| dimension: str |
| score: float |
| evidence: List[str] |
| penalties: List[str] |
| |
| |
| @dataclass |
| class BenchmarkScore: |
| """Complete score for one problem under one condition.""" |
| problem_id: str |
| condition: str |
| dimensions: Dict[str, DimensionScore] |
| composite: float |
| response_text: str |
| response_length: int |
| latency_ms: float |
| |
| |
| class ScoringEngine: |
| """Automated scoring across 7 dimensions.""" |
| |
| DIMENSION_WEIGHTS = { |
| "reasoning_depth": 0.20, |
| "perspective_diversity": 0.15, |
| "coherence": 0.15, |
| "ethical_coverage": 0.10, |
| "novelty": 0.15, |
| "factual_grounding": 0.15, |
| "turing_naturalness": 0.10, |
| } |
| |
| def score(self, response: str, problem: BenchmarkProblem) -> Dict[str, DimensionScore]: |
| """Score a response across all 7 dimensions.""" |
| words = self._tokenize(response) |
| sents = self._sentences(response) |
| lower = response.lower() |
| |
| return { |
| "reasoning_depth": self._score_depth(response, words, sents, problem), |
| "perspective_diversity": self._score_diversity(response, words, problem), |
| "coherence": self._score_coherence(response, words, sents), |
| "ethical_coverage": self._score_ethical(response, words, problem), |
| "novelty": self._score_novelty(response, words, sents, problem), |
| "factual_grounding": self._score_grounding(response, words, problem), |
| "turing_naturalness": self._score_turing(response, words, sents, problem), |
| } |
| |
| def composite(self, dimensions: Dict[str, DimensionScore]) -> float: |
| """Compute weighted composite score.""" |
| total = 0.0 |
| weight_sum = 0.0 |
| for dim, weight in self.DIMENSION_WEIGHTS.items(): |
| if dim in dimensions: |
| total += weight * dimensions[dim].score |
| weight_sum += weight |
| return round(total / max(weight_sum, 0.01), 4) |
| |
| |
| |
| def _score_depth(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Reasoning depth: chain length, concept density, vocabulary complexity.""" |
| evidence = [] |
| penalties = [] |
| |
| |
| wc = len(words) |
| wc_score = 1.0 / (1.0 + math.exp(-0.015 * (wc - 200))) |
| evidence.append(f"word_count={wc}") |
| |
| |
| sc = len(sents) |
| sent_score = min(sc / 12, 1.0) |
| |
| |
| complex_words = [w for w in words if len(w) >= 8] |
| complexity = min(len(complex_words) / max(wc * 0.12, 1), 1.0) |
| |
| |
| chain_words = {"therefore", "because", "consequently", "given", "implies", |
| "follows", "since", "thus", "hence", "assuming", "if"} |
| chain_count = sum(1 for w in words if w in chain_words) |
| chain_score = min(chain_count / 6, 1.0) |
| evidence.append(f"chain_markers={chain_count}") |
| |
| |
| gt_hits = sum(1 for gt in problem.ground_truth_elements |
| if any(kw.lower() in text.lower() for kw in gt.split())) |
| gt_coverage = gt_hits / max(len(problem.ground_truth_elements), 1) |
| evidence.append(f"ground_truth_coverage={gt_hits}/{len(problem.ground_truth_elements)}") |
| |
| |
| if wc < 50: |
| penalties.append("response_too_short") |
| |
| score = ( |
| 0.20 * wc_score + |
| 0.15 * sent_score + |
| 0.15 * complexity + |
| 0.20 * chain_score + |
| 0.30 * gt_coverage |
| ) |
| return DimensionScore("reasoning_depth", round(min(max(score, 0), 1), 4), evidence, penalties) |
| |
| def _score_diversity(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Perspective diversity: how many distinct cognitive dimensions are engaged.""" |
| evidence = [] |
| lower = text.lower() |
| |
| |
| perspectives_found = [] |
| for perspective, keywords in _PERSPECTIVE_KEYWORDS.items(): |
| hits = sum(1 for kw in keywords if kw in lower) |
| if hits >= 2: |
| perspectives_found.append(perspective) |
| evidence.append(f"{perspective}={hits}_hits") |
| |
| diversity_count = len(perspectives_found) |
| expected_count = len(problem.expected_dimensions) |
| |
| |
| expected_hits = sum(1 for d in problem.expected_dimensions |
| if d in perspectives_found or |
| any(d in p for p in perspectives_found)) |
| expected_coverage = expected_hits / max(expected_count, 1) |
| |
| |
| bonus_perspectives = len(set(perspectives_found) - set(problem.expected_dimensions)) |
| bonus = min(bonus_perspectives * 0.1, 0.2) |
| |
| score = min(0.6 * expected_coverage + 0.3 * min(diversity_count / 4, 1.0) + bonus + 0.1, 1.0) |
| penalties = [] |
| if diversity_count <= 1: |
| penalties.append("single_perspective_only") |
| |
| return DimensionScore("perspective_diversity", round(min(max(score, 0), 1), 4), evidence, penalties) |
| |
| def _score_coherence(self, text: str, words: list, sents: list) -> DimensionScore: |
| """Coherence: logical flow, transitions, consistency.""" |
| evidence = [] |
| penalties = [] |
| |
| |
| transition_count = sum(1 for t in _TRANSITION_WORDS if t in text.lower()) |
| transition_score = min(transition_count / max(len(sents) * 0.3, 1), 1.0) |
| evidence.append(f"transitions={transition_count}") |
| |
| |
| if len(sents) >= 3: |
| sent_lengths = [len(s.split()) for s in sents] |
| mean_len = statistics.mean(sent_lengths) |
| std_len = statistics.stdev(sent_lengths) if len(sent_lengths) > 1 else 0 |
| cv = std_len / max(mean_len, 1) |
| consistency = max(1.0 - cv, 0.0) |
| else: |
| consistency = 0.5 |
| |
| |
| paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()] |
| structure_score = min(len(paragraphs) / 4, 1.0) if len(words) > 100 else 0.5 |
| |
| |
| |
| contradiction_markers = len(re.findall(r'\b(but|however|conversely|yet)\b', text.lower())) |
| resolution_markers = len(re.findall(r'\b(reconcil|resolv|synthesiz|integrat|both.{0,20}and)\b', text.lower())) |
| if contradiction_markers > 0 and resolution_markers > 0: |
| evidence.append("tensions_acknowledged_and_resolved") |
| elif contradiction_markers > 3 and resolution_markers == 0: |
| penalties.append("contradictions_without_resolution") |
| |
| score = 0.35 * transition_score + 0.30 * consistency + 0.35 * structure_score |
| return DimensionScore("coherence", round(min(max(score, 0), 1), 4), evidence, penalties) |
| |
| def _score_ethical(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Ethical coverage: attention to moral dimensions, stakeholders, values.""" |
| evidence = [] |
| lower = text.lower() |
| |
| |
| ethical_kws = _PERSPECTIVE_KEYWORDS["ethical"] |
| hits = sum(1 for kw in ethical_kws if kw in lower) |
| vocab_score = min(hits / 5, 1.0) |
| evidence.append(f"ethical_keywords={hits}") |
| |
| |
| frameworks = { |
| "utilitarian": ["utilitarian", "maximize", "greatest good", "outcome", "consequence"], |
| "deontological": ["deontolog", "duty", "obligation", "rights", "categorical"], |
| "virtue": ["virtue", "character", "integrity", "courage", "wisdom"], |
| "care": ["care", "relationship", "compassion", "vulnerability", "dependenc"], |
| } |
| frameworks_found = [] |
| for name, markers in frameworks.items(): |
| if any(m in lower for m in markers): |
| frameworks_found.append(name) |
| framework_score = min(len(frameworks_found) / 2, 1.0) |
| evidence.append(f"frameworks={frameworks_found}") |
| |
| |
| stakeholder_words = ["stakeholder", "patient", "user", "developer", "society", |
| "community", "family", "employee", "citizen", "individual", |
| "people", "public", "vulnerable"] |
| stakeholders = sum(1 for s in stakeholder_words if s in lower) |
| stakeholder_score = min(stakeholders / 3, 1.0) |
| |
| |
| hedging = sum(1 for h in _HEDGING_MARKERS if h in lower) |
| hedging_score = min(hedging / 3, 1.0) |
| |
| |
| category_boost = 1.0 if problem.category == "ethics" else 0.7 |
| |
| score = category_boost * ( |
| 0.30 * vocab_score + |
| 0.30 * framework_score + |
| 0.20 * stakeholder_score + |
| 0.20 * hedging_score |
| ) |
| return DimensionScore("ethical_coverage", round(min(max(score, 0), 1), 4), evidence, []) |
| |
| def _score_novelty(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Novelty: non-obvious insights, unexpected connections, reframing.""" |
| evidence = [] |
| |
| |
| ttr = len(set(words)) / max(len(words), 1) |
| ttr_score = min(ttr / 0.6, 1.0) |
| |
| |
| novelty_markers = [ |
| "reframe", "unexpected", "surprisingly", "counterintuit", |
| "overlooked", "non-obvious", "hidden", "subtle", "paradox", |
| "irony", "twist", "beneath the surface", "deeper", |
| "reveals", "transforms", "shifts the question", |
| "what if", "consider instead", "flip this around", |
| ] |
| lower = text.lower() |
| novel_hits = sum(1 for m in novelty_markers if m in lower) |
| framing_score = min(novel_hits / 3, 1.0) |
| evidence.append(f"novelty_markers={novel_hits}") |
| |
| |
| perspectives_touched = 0 |
| for perspective, keywords in _PERSPECTIVE_KEYWORDS.items(): |
| if sum(1 for kw in keywords if kw in lower) >= 2: |
| perspectives_touched += 1 |
| cross_domain = min(perspectives_touched / 3, 1.0) |
| evidence.append(f"perspectives_touched={perspectives_touched}") |
| |
| |
| formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text)) |
| formulaic_penalty = min(formulaic_count * 0.15, 0.5) |
| if formulaic_count > 0: |
| evidence.append(f"formulaic_patterns={formulaic_count}") |
| |
| score = 0.25 * ttr_score + 0.35 * framing_score + 0.40 * cross_domain - formulaic_penalty |
| return DimensionScore("novelty", round(min(max(score, 0), 1), 4), evidence, []) |
| |
| def _score_grounding(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Factual grounding: evidence, specifics, ground truth coverage.""" |
| evidence = [] |
| penalties = [] |
| lower = text.lower() |
| |
| |
| gt_hits = 0 |
| for gt in problem.ground_truth_elements: |
| gt_words = [w.lower().strip() for w in gt.split() if len(w) > 3] |
| if sum(1 for w in gt_words if w in lower) >= len(gt_words) * 0.5: |
| gt_hits += 1 |
| gt_score = gt_hits / max(len(problem.ground_truth_elements), 1) |
| evidence.append(f"ground_truth={gt_hits}/{len(problem.ground_truth_elements)}") |
| |
| |
| numbers = len(re.findall(r'\b\d+\.?\d*\b', text)) |
| proper_nouns = len(re.findall(r'\b[A-Z][a-z]{2,}\b', text)) |
| specificity = min((numbers + proper_nouns) / 8, 1.0) |
| evidence.append(f"numbers={numbers},proper_nouns={proper_nouns}") |
| |
| |
| trap_hits = 0 |
| for trap in problem.adversarial_traps: |
| trap_words = [w.lower() for w in trap.split() if len(w) > 3] |
| if sum(1 for w in trap_words if w in lower) >= len(trap_words) * 0.6: |
| trap_hits += 1 |
| if trap_hits > 0: |
| penalties.append(f"fell_into_{trap_hits}_traps") |
| trap_penalty = trap_hits * 0.2 |
| |
| score = 0.50 * gt_score + 0.30 * specificity + 0.20 - trap_penalty |
| return DimensionScore("factual_grounding", round(min(max(score, 0), 1), 4), evidence, penalties) |
| |
| def _score_turing(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore: |
| """Turing naturalness: how human-like does the reasoning feel?""" |
| evidence = [] |
| penalties = [] |
| lower = text.lower() |
| |
| |
| formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text)) |
| if formulaic_count > 0: |
| penalties.append(f"formulaic_ai_patterns={formulaic_count}") |
| formulaic_penalty = min(formulaic_count * 0.2, 0.6) |
| |
| |
| conversational = { |
| "i think", "honestly", "actually", "you know", "i mean", |
| "the thing is", "it's like", "kind of", "pretty much", |
| "in my experience", "i've noticed", "i'd say", "i'm not sure", |
| "that said", "to be fair", "real talk", "the truth is", |
| } |
| conv_hits = sum(1 for c in conversational if c in lower) |
| conv_score = min(conv_hits / 3, 1.0) |
| evidence.append(f"conversational_markers={conv_hits}") |
| |
| |
| personal_words = {"i", "my", "me", "i've", "i'd", "i'm", "myself", "we", "our"} |
| personal_count = sum(1 for w in words if w in personal_words) |
| personal_score = min(personal_count / max(len(words) * 0.02, 1), 1.0) |
| |
| |
| if len(sents) >= 3: |
| sent_lens = [len(s.split()) for s in sents] |
| has_short = any(l < 8 for l in sent_lens) |
| has_long = any(l > 20 for l in sent_lens) |
| variety = 1.0 if has_short and has_long else 0.5 |
| else: |
| variety = 0.3 |
| |
| |
| list_markers = len(re.findall(r'^\s*[\d\-\*\β’]', text, re.MULTILINE)) |
| list_penalty = min(list_markers * 0.05, 0.3) if list_markers > 4 else 0 |
| |
| score = ( |
| 0.30 * conv_score + |
| 0.25 * personal_score + |
| 0.25 * variety + |
| 0.20 * (1.0 - formulaic_penalty) - |
| list_penalty |
| ) |
| |
| return DimensionScore("turing_naturalness", round(min(max(score, 0), 1), 4), evidence, penalties) |
| |
| |
| |
| def _tokenize(self, text: str) -> list: |
| return re.findall(r"[A-Za-z]+(?:[-'][A-Za-z]+)*", text.lower()) |
| |
| def _sentences(self, text: str) -> list: |
| parts = re.split(r'(?<=[.!?])\s+', text.strip()) |
| return [s for s in parts if len(s) > 5] |
| |
| |
| |
| |
| |
| |
| class BenchmarkRunner: |
| """ |
| Runs benchmark problems across 4 experimental conditions: |
| 1. SINGLE β Single-perspective analysis only |
| 2. MULTI β Multi-perspective synthesis (no memory) |
| 3. MEMORY β Multi-perspective + cocoon memory augmentation |
| 4. CODETTE β Full system (multi + memory + strategy synthesis) |
| """ |
| |
| def __init__(self, use_llm: bool = False, verbose: bool = True): |
| """ |
| Args: |
| use_llm: If True, uses live LLM inference via ForgeEngine. |
| If False, uses template-based agents (faster, no GPU needed). |
| verbose: Print progress. |
| """ |
| self.verbose = verbose |
| self.scorer = ScoringEngine() |
| self.results: List[BenchmarkScore] = [] |
| |
| |
| self.forge = None |
| self.synthesizer = None |
| self._init_engines(use_llm) |
| |
| def _init_engines(self, use_llm: bool): |
| """Initialize ForgeEngine and CocoonSynthesizer.""" |
| try: |
| from reasoning_forge.forge_engine import ForgeEngine |
| self.forge = ForgeEngine(orchestrator=None) |
| if self.verbose: |
| logger.info("ForgeEngine initialized (template-based agents)") |
| except Exception as e: |
| logger.warning(f"ForgeEngine not available: {e}") |
| |
| try: |
| from reasoning_forge.cocoon_synthesizer import CocoonSynthesizer |
| from reasoning_forge.unified_memory import UnifiedMemory |
| memory = UnifiedMemory() |
| self.synthesizer = CocoonSynthesizer(memory=memory) |
| self.memory = memory |
| if self.verbose: |
| logger.info(f"CocoonSynthesizer initialized ({memory._total_stored} cocoons)") |
| except Exception as e: |
| logger.warning(f"CocoonSynthesizer not available: {e}") |
| self.synthesizer = None |
| self.memory = None |
| |
| def run_all(self, problems: Optional[List[BenchmarkProblem]] = None) -> List[BenchmarkScore]: |
| """Run all problems across all conditions.""" |
| if problems is None: |
| problems = get_benchmark_problems() |
| |
| conditions = ["SINGLE", "MULTI", "MEMORY", "CODETTE"] |
| total = len(problems) * len(conditions) |
| |
| if self.verbose: |
| logger.info(f"Running {len(problems)} problems Γ {len(conditions)} conditions = {total} evaluations") |
| |
| for i, problem in enumerate(problems): |
| for condition in conditions: |
| if self.verbose: |
| done = i * len(conditions) + conditions.index(condition) + 1 |
| logger.info(f" [{done}/{total}] {problem.id} β {condition}") |
| |
| t0 = time.time() |
| response = self._generate_response(problem, condition) |
| latency = (time.time() - t0) * 1000 |
| |
| dimensions = self.scorer.score(response, problem) |
| composite = self.scorer.composite(dimensions) |
| |
| score = BenchmarkScore( |
| problem_id=problem.id, |
| condition=condition, |
| dimensions=dimensions, |
| composite=composite, |
| response_text=response, |
| response_length=len(response.split()), |
| latency_ms=round(latency, 1), |
| ) |
| self.results.append(score) |
| |
| return self.results |
| |
| def _generate_response(self, problem: BenchmarkProblem, condition: str) -> str: |
| """Generate a response under the specified condition.""" |
| if condition == "SINGLE": |
| return self._generate_single(problem) |
| elif condition == "MULTI": |
| return self._generate_multi(problem) |
| elif condition == "MEMORY": |
| return self._generate_memory(problem) |
| elif condition == "CODETTE": |
| return self._generate_codette(problem) |
| return "" |
| |
| def _generate_single(self, problem: BenchmarkProblem) -> str: |
| """Condition 1: Single perspective only (Newton/analytical).""" |
| if self.forge: |
| try: |
| analysis = self.forge.newton.analyze(problem.question) |
| return analysis |
| except Exception: |
| pass |
| |
| return f"From an analytical perspective: {problem.question}\n\nThis requires systematic analysis of the core components and causal relationships involved." |
| |
| def _generate_multi(self, problem: BenchmarkProblem) -> str: |
| """Condition 2: Multi-perspective synthesis, no memory.""" |
| if self.forge: |
| try: |
| result = self.forge.forge_single(problem.question) |
| return result.get("messages", [{}])[-1].get("content", "") |
| except Exception: |
| pass |
| |
| |
| if self.forge: |
| parts = [] |
| for agent in self.forge.analysis_agents: |
| try: |
| parts.append(f"**{agent.name}:** {agent.analyze(problem.question)}") |
| except Exception: |
| continue |
| if parts: |
| synthesis = "\n\n".join(parts) |
| synthesis += ( |
| f"\n\n**Synthesis:** These {len(parts)} perspectives on " |
| f"'{problem.question[:50]}...' converge on the importance of " |
| f"examining this from multiple angles. The analytical view provides " |
| f"causal structure, while philosophical and ethical views add depth." |
| ) |
| return synthesis |
| return "" |
| |
| def _generate_memory(self, problem: BenchmarkProblem) -> str: |
| """Condition 3: Multi-perspective + cocoon memory augmentation.""" |
| memory_context = "" |
| if self.memory: |
| try: |
| relevant = self.memory.recall_relevant(problem.question, max_results=3) |
| if relevant: |
| memory_context = "\n\n**Memory-Augmented Context:**\n" |
| for cocoon in relevant: |
| memory_context += ( |
| f"- Prior reasoning on '{cocoon.get('query', '')[:60]}': " |
| f"{cocoon.get('response', '')[:100]}...\n" |
| ) |
| memory_context += ( |
| "\nDrawing on these prior reasoning exchanges, " |
| "the analysis benefits from accumulated insight.\n" |
| ) |
| except Exception: |
| pass |
| |
| multi_response = self._generate_multi(problem) |
| return multi_response + memory_context |
| |
| def _generate_codette(self, problem: BenchmarkProblem) -> str: |
| """Condition 4: Full Codette (multi + memory + strategy synthesis).""" |
| |
| strategy_context = "" |
| if self.synthesizer: |
| try: |
| comparison = self.synthesizer.run_full_synthesis(problem.question) |
| strategy_context = ( |
| f"\n\n**Strategy Synthesis:**\n" |
| f"Forged strategy: {comparison.new_strategy.name}\n" |
| f"Definition: {comparison.new_strategy.definition[:200]}\n\n" |
| f"**Reasoning Path ({comparison.new_path.strategy_name}):**\n" |
| ) |
| for i, step in enumerate(comparison.new_path.steps, 1): |
| strategy_context += f"{i}. {step}\n" |
| strategy_context += f"\n**Conclusion:** {comparison.new_path.conclusion}\n" |
| |
| |
| strategy_context += "\n**Evidence from cocoon synthesis:**\n" |
| for ev in comparison.evidence_chain[:3]: |
| strategy_context += f"- {ev}\n" |
| except Exception as e: |
| logger.debug(f"Strategy synthesis failed: {e}") |
| |
| memory_response = self._generate_memory(problem) |
| return memory_response + strategy_context |
| |
| |
| |
| |
| |
| |
| @dataclass |
| class ConditionStats: |
| """Aggregate statistics for one condition.""" |
| condition: str |
| n: int |
| mean_composite: float |
| std_composite: float |
| dimension_means: Dict[str, float] |
| dimension_stds: Dict[str, float] |
| mean_length: float |
| mean_latency: float |
| |
| |
| def compute_effect_size(group1: List[float], group2: List[float]) -> float: |
| """Cohen's d effect size.""" |
| n1, n2 = len(group1), len(group2) |
| if n1 < 2 or n2 < 2: |
| return 0.0 |
| m1, m2 = statistics.mean(group1), statistics.mean(group2) |
| s1, s2 = statistics.stdev(group1), statistics.stdev(group2) |
| pooled_std = math.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)) |
| if pooled_std == 0: |
| return 0.0 |
| return (m2 - m1) / pooled_std |
| |
| |
| def welch_t_test(group1: List[float], group2: List[float]) -> Tuple[float, float]: |
| """Welch's t-test (unequal variance). Returns (t_stat, p_value_approx).""" |
| n1, n2 = len(group1), len(group2) |
| if n1 < 2 or n2 < 2: |
| return 0.0, 1.0 |
| m1, m2 = statistics.mean(group1), statistics.mean(group2) |
| v1, v2 = statistics.variance(group1), statistics.variance(group2) |
| se = math.sqrt(v1/n1 + v2/n2) |
| if se == 0: |
| return 0.0, 1.0 |
| t_stat = (m2 - m1) / se |
| |
| num = (v1/n1 + v2/n2)**2 |
| den = (v1/n1)**2/(n1-1) + (v2/n2)**2/(n2-1) |
| df = num / max(den, 1e-10) |
| |
| |
| z = abs(t_stat) |
| p_approx = 2 * (1 - 0.5 * (1 + math.erf(z / math.sqrt(2)))) |
| return round(t_stat, 4), round(p_approx, 6) |
| |
| |
| class ReportGenerator: |
| """Generate publishable benchmark reports.""" |
| |
| def __init__(self, results: List[BenchmarkScore], problems: List[BenchmarkProblem]): |
| self.results = results |
| self.problems = {p.id: p for p in problems} |
| |
| def compute_stats(self) -> Dict[str, ConditionStats]: |
| """Compute per-condition aggregate statistics.""" |
| conditions = {} |
| for result in self.results: |
| if result.condition not in conditions: |
| conditions[result.condition] = [] |
| conditions[result.condition].append(result) |
| |
| stats = {} |
| for cond, scores in conditions.items(): |
| composites = [s.composite for s in scores] |
| dim_scores = {} |
| for dim in ScoringEngine.DIMENSION_WEIGHTS: |
| dim_vals = [s.dimensions[dim].score for s in scores if dim in s.dimensions] |
| dim_scores[dim] = dim_vals |
| |
| stats[cond] = ConditionStats( |
| condition=cond, |
| n=len(scores), |
| mean_composite=round(statistics.mean(composites), 4) if composites else 0, |
| std_composite=round(statistics.stdev(composites), 4) if len(composites) > 1 else 0, |
| dimension_means={d: round(statistics.mean(v), 4) for d, v in dim_scores.items() if v}, |
| dimension_stds={d: round(statistics.stdev(v), 4) for d, v in dim_scores.items() if len(v) > 1}, |
| mean_length=round(statistics.mean([s.response_length for s in scores]), 1), |
| mean_latency=round(statistics.mean([s.latency_ms for s in scores]), 1), |
| ) |
| return stats |
| |
| def compute_pairwise_comparisons(self) -> List[Dict]: |
| """Statistical comparisons between conditions.""" |
| conditions = {} |
| for r in self.results: |
| conditions.setdefault(r.condition, []).append(r.composite) |
| |
| pairs = [ |
| ("SINGLE", "MULTI", "Multi-perspective vs single"), |
| ("MULTI", "MEMORY", "Memory augmentation vs vanilla multi"), |
| ("MEMORY", "CODETTE", "Full Codette vs memory-augmented"), |
| ("SINGLE", "CODETTE", "Full Codette vs single (total improvement)"), |
| ] |
| |
| comparisons = [] |
| for cond_a, cond_b, label in pairs: |
| if cond_a in conditions and cond_b in conditions: |
| g1, g2 = conditions[cond_a], conditions[cond_b] |
| t_stat, p_val = welch_t_test(g1, g2) |
| d = compute_effect_size(g1, g2) |
| delta = statistics.mean(g2) - statistics.mean(g1) |
| comparisons.append({ |
| "comparison": label, |
| "condition_a": cond_a, |
| "condition_b": cond_b, |
| "mean_a": round(statistics.mean(g1), 4), |
| "mean_b": round(statistics.mean(g2), 4), |
| "delta": round(delta, 4), |
| "delta_pct": round(delta / max(statistics.mean(g1), 0.01) * 100, 1), |
| "cohens_d": round(d, 4), |
| "t_stat": t_stat, |
| "p_value": p_val, |
| "significant": p_val < 0.05, |
| }) |
| return comparisons |
| |
| def per_category_analysis(self) -> Dict[str, Dict]: |
| """Break down results by problem category.""" |
| by_category = {} |
| for r in self.results: |
| prob = self.problems.get(r.problem_id) |
| if not prob: |
| continue |
| cat = prob.category |
| if cat not in by_category: |
| by_category[cat] = {} |
| by_category[cat].setdefault(r.condition, []).append(r.composite) |
| |
| analysis = {} |
| for cat, cond_scores in by_category.items(): |
| analysis[cat] = { |
| cond: { |
| "mean": round(statistics.mean(scores), 4), |
| "std": round(statistics.stdev(scores), 4) if len(scores) > 1 else 0, |
| "n": len(scores), |
| } |
| for cond, scores in cond_scores.items() |
| } |
| return analysis |
| |
| def generate_markdown_report(self) -> str: |
| """Generate a publishable markdown report.""" |
| stats = self.compute_stats() |
| comparisons = self.compute_pairwise_comparisons() |
| categories = self.per_category_analysis() |
| |
| lines = [] |
| lines.append("# Codette Benchmark Results") |
| lines.append(f"\n*Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n") |
| lines.append(f"*Problems: {len(self.problems)} | Conditions: {len(stats)} | Total evaluations: {len(self.results)}*\n") |
| |
| |
| lines.append("## 1. Overall Results by Condition\n") |
| lines.append("| Condition | N | Composite (mean +/- std) | Depth | Diversity | Coherence | Ethics | Novelty | Grounding | Turing |") |
| lines.append("|-----------|---|--------------------------|-------|-----------|-----------|--------|---------|-----------|--------|") |
| for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]: |
| s = stats.get(cond) |
| if not s: |
| continue |
| dm = s.dimension_means |
| lines.append( |
| f"| {cond} | {s.n} | **{s.mean_composite:.3f}** +/- {s.std_composite:.3f} | " |
| f"{dm.get('reasoning_depth', 0):.3f} | {dm.get('perspective_diversity', 0):.3f} | " |
| f"{dm.get('coherence', 0):.3f} | {dm.get('ethical_coverage', 0):.3f} | " |
| f"{dm.get('novelty', 0):.3f} | {dm.get('factual_grounding', 0):.3f} | " |
| f"{dm.get('turing_naturalness', 0):.3f} |" |
| ) |
| |
| |
| lines.append("\n## 2. Statistical Comparisons\n") |
| lines.append("| Comparison | Delta | Delta % | Cohen's d | t-stat | p-value | Significant |") |
| lines.append("|------------|-------|---------|-----------|--------|---------|-------------|") |
| for c in comparisons: |
| sig = "**Yes**" if c["significant"] else "No" |
| lines.append( |
| f"| {c['comparison']} | {c['delta']:+.4f} | {c['delta_pct']:+.1f}% | " |
| f"{c['cohens_d']:.3f} | {c['t_stat']:.3f} | {c['p_value']:.4f} | {sig} |" |
| ) |
| |
| |
| lines.append("\n*Cohen's d interpretation: 0.2=small, 0.5=medium, 0.8=large*\n") |
| |
| |
| lines.append("## 3. Results by Problem Category\n") |
| for cat in ["reasoning", "ethics", "creative", "meta", "adversarial", "turing"]: |
| if cat not in categories: |
| continue |
| lines.append(f"### {cat.capitalize()}\n") |
| lines.append("| Condition | Mean | Std | N |") |
| lines.append("|-----------|------|-----|---|") |
| for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]: |
| if cond in categories[cat]: |
| cs = categories[cat][cond] |
| lines.append(f"| {cond} | {cs['mean']:.3f} | {cs['std']:.3f} | {cs['n']} |") |
| lines.append("") |
| |
| |
| lines.append("## 4. Key Findings\n") |
| for c in comparisons: |
| if c["significant"]: |
| direction = "improvement" if c["delta"] > 0 else "degradation" |
| lines.append( |
| f"- **{c['comparison']}**: {c['delta_pct']:+.1f}% {direction} " |
| f"(Cohen's d={c['cohens_d']:.2f}, p={c['p_value']:.4f})" |
| ) |
| |
| |
| lines.append("\n## 5. Methodology\n") |
| lines.append("### Conditions\n") |
| lines.append("1. **SINGLE** β Single analytical perspective, no memory, no synthesis") |
| lines.append("2. **MULTI** β All 6 reasoning agents (Newton, Quantum, Ethics, Philosophy, DaVinci, Empathy) + critic + synthesis") |
| lines.append("3. **MEMORY** β MULTI + cocoon memory augmentation (FTS5-retrieved prior reasoning)") |
| lines.append("4. **CODETTE** β MEMORY + meta-cognitive strategy synthesis (cross-domain pattern extraction + forged reasoning strategies)") |
| lines.append("\n### Scoring Dimensions (0-1 scale)\n") |
| lines.append("1. **Reasoning Depth** (20%) β chain length, concept density, ground truth coverage") |
| lines.append("2. **Perspective Diversity** (15%) β distinct cognitive dimensions engaged") |
| lines.append("3. **Coherence** (15%) β logical flow, transitions, structural consistency") |
| lines.append("4. **Ethical Coverage** (10%) β moral frameworks, stakeholders, value awareness") |
| lines.append("5. **Novelty** (15%) β non-obvious insights, cross-domain connections, reframing") |
| lines.append("6. **Factual Grounding** (15%) β evidence specificity, ground truth alignment, trap avoidance") |
| lines.append("7. **Turing Naturalness** (10%) β conversational quality, absence of formulaic AI patterns") |
| lines.append("\n### Problem Set\n") |
| lines.append(f"- {len(self.problems)} problems across 6 categories") |
| lines.append("- Categories: reasoning (3), ethics (3), creative (2), meta-cognitive (3), adversarial (3), Turing (3)") |
| lines.append("- Difficulty: easy (1), medium (6), hard (10)") |
| lines.append("\n### Statistical Tests\n") |
| lines.append("- Welch's t-test (unequal variance) for pairwise condition comparisons") |
| lines.append("- Cohen's d for effect size estimation") |
| lines.append("- Significance threshold: p < 0.05") |
| |
| return "\n".join(lines) |
| |
| def generate_json_report(self) -> Dict: |
| """Generate structured JSON report for machine consumption.""" |
| stats = self.compute_stats() |
| comparisons = self.compute_pairwise_comparisons() |
| categories = self.per_category_analysis() |
| |
| per_problem = {} |
| for r in self.results: |
| if r.problem_id not in per_problem: |
| per_problem[r.problem_id] = {} |
| per_problem[r.problem_id][r.condition] = { |
| "composite": r.composite, |
| "dimensions": { |
| d: {"score": ds.score, "evidence": ds.evidence, "penalties": ds.penalties} |
| for d, ds in r.dimensions.items() |
| }, |
| "response_length": r.response_length, |
| "latency_ms": r.latency_ms, |
| } |
| |
| return { |
| "metadata": { |
| "timestamp": time.strftime('%Y-%m-%dT%H:%M:%S'), |
| "num_problems": len(self.problems), |
| "num_conditions": len(stats), |
| "total_evaluations": len(self.results), |
| }, |
| "condition_stats": { |
| c: { |
| "mean_composite": s.mean_composite, |
| "std_composite": s.std_composite, |
| "dimension_means": s.dimension_means, |
| "dimension_stds": s.dimension_stds, |
| "mean_length": s.mean_length, |
| "mean_latency": s.mean_latency, |
| "n": s.n, |
| } |
| for c, s in stats.items() |
| }, |
| "pairwise_comparisons": comparisons, |
| "per_category": categories, |
| "per_problem": per_problem, |
| } |
| |
| |
| |
| |
| |
| |
| def run_benchmarks( |
| output_dir: Optional[str] = None, |
| use_llm: bool = False, |
| verbose: bool = True, |
| ) -> Tuple[str, Dict]: |
| """ |
| Run the full benchmark suite and generate reports. |
| |
| Returns: |
| (markdown_report, json_report) |
| """ |
| if output_dir is None: |
| output_dir = str(_PROJECT_ROOT / "data" / "results") |
| os.makedirs(output_dir, exist_ok=True) |
| |
| |
| problems = get_benchmark_problems() |
| if verbose: |
| logger.info(f"Benchmark suite: {len(problems)} problems across " |
| f"{len(set(p.category for p in problems))} categories") |
| |
| |
| runner = BenchmarkRunner(use_llm=use_llm, verbose=verbose) |
| results = runner.run_all(problems) |
| |
| |
| reporter = ReportGenerator(results, problems) |
| md_report = reporter.generate_markdown_report() |
| json_report = reporter.generate_json_report() |
| |
| |
| md_path = os.path.join(output_dir, "codette_benchmark_report.md") |
| json_path = os.path.join(output_dir, "codette_benchmark_results.json") |
| |
| with open(md_path, "w", encoding="utf-8") as f: |
| f.write(md_report) |
| with open(json_path, "w", encoding="utf-8") as f: |
| json.dump(json_report, f, indent=2, default=str) |
| |
| if verbose: |
| logger.info(f"\nReports saved:") |
| logger.info(f" Markdown: {md_path}") |
| logger.info(f" JSON: {json_path}") |
| |
| return md_report, json_report |
| |
| |
| if __name__ == "__main__": |
| import argparse |
| parser = argparse.ArgumentParser(description="Codette Benchmark Suite") |
| parser.add_argument("--output", default=None, help="Output directory") |
| parser.add_argument("--llm", action="store_true", help="Use live LLM inference") |
| parser.add_argument("--quiet", action="store_true", help="Suppress progress output") |
| args = parser.parse_args() |
| |
| md, js = run_benchmarks( |
| output_dir=args.output, |
| use_llm=args.llm, |
| verbose=not args.quiet, |
| ) |
| print("\n" + md) |
| |