| """ |
| Benchmark Runner - loads test prompts, runs/loads responses, scores them, |
| and produces detailed evaluation reports. |
| |
| Supports: |
| - Loading prompts from JSON files in evaluation/prompts/ |
| - Pre-generated response files (JSON mapping prompt -> response) |
| - Scoring via ReasoningMetrics |
| - Per-category and overall reports |
| - Baseline vs trained model comparison |
| - CLI interface |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import sys |
| from datetime import datetime |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional |
|
|
| |
# Directory that contains this module; the runner's default prompts folder
# is resolved relative to it.
_THIS_DIR = Path(__file__).resolve().parent
# One level above this module's directory. It is placed at the front of
# sys.path so that the absolute `evaluation.reasoning_metrics` import can be
# resolved even when this file is executed directly.
_PROJECT_ROOT = _THIS_DIR.parent
if sys.path.count(str(_PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(_PROJECT_ROOT))
|
|
| from evaluation.reasoning_metrics import ReasoningMetrics |
|
|
|
|
| |
| |
| |
|
|
class BenchmarkRunner:
    """Load prompts, score responses, produce reports.

    Prompts are read from JSON files under ``prompts_dir``. Responses are
    supplied as a ``{prompt_text: response_text}`` mapping and scored with
    ``metrics.score_reasoning``. Results are plain dicts that can be
    rendered as text reports or saved to / loaded from JSON.
    """

    # Lower-case substrings whose presence in a (lower-cased) response is
    # counted as the response pushing back on the prompt. Matching is a plain
    # substring test, so this is a heuristic rather than a precise detector.
    _REFUTATION_MARKERS = (
        "not true", "incorrect", "misconception", "actually",
        "contrary", "doesn't", "does not", "false", "myth",
        "wrong", "mistake", "no,", "in fact", "however",
        "this is a common", "oversimplification", "nuanced",
        "not necessarily", "depends on", "more complex",
    )

    def __init__(
        self,
        prompts_dir: Optional[str] = None,
        metrics: Optional["ReasoningMetrics"] = None,
    ):
        """Create a runner.

        Args:
            prompts_dir: Directory containing the prompt JSON files. When
                omitted (or empty) the ``prompts`` folder next to this
                module is used.
            metrics: Scorer exposing ``score_reasoning(response)``. A default
                ``ReasoningMetrics()`` is created when ``None``.
        """
        self.prompts_dir = Path(prompts_dir) if prompts_dir else _THIS_DIR / "prompts"
        # Compare against None so a caller-supplied scorer is never thrown
        # away merely because it evaluates as falsy.
        self.metrics = metrics if metrics is not None else ReasoningMetrics()
        self._prompts: Dict[str, List[str]] = {}
        self._counterexamples: List[Dict[str, str]] = []

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------

    @staticmethod
    def _read_json(path) -> Any:
        """Parse the UTF-8 JSON file at *path* and return the decoded value."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_prompts(self, filename: str = "reasoning_tests.json") -> Dict[str, List[str]]:
        """Load categorised prompts from a JSON file inside ``prompts_dir``.

        Expected format: {"category": ["prompt1", "prompt2", ...], ...}

        Returns:
            The decoded mapping, which is also remembered on the runner.

        Raises:
            FileNotFoundError: the file does not exist.
            ValueError: the top-level JSON value is not an object.
        """
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Prompt file not found: {path}")
        data = self._read_json(path)
        # Fail here with a clear message instead of later on `.items()`.
        if not isinstance(data, dict):
            raise ValueError(
                f"Prompt file must contain a JSON object of category -> prompts: {path}"
            )
        self._prompts = data
        return data

    def load_counterexamples(self, filename: str = "counterexample_tests.json") -> List[Dict[str, str]]:
        """Load counterexample test prompts from a JSON file inside ``prompts_dir``.

        Each entry is read later via its ``"prompt"`` key and an optional
        ``"expected"`` key.

        Returns:
            The decoded list, which is also remembered on the runner.

        Raises:
            FileNotFoundError: the file does not exist.
            ValueError: the top-level JSON value is not an array.
        """
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Counterexample file not found: {path}")
        data = self._read_json(path)
        if not isinstance(data, list):
            raise ValueError(
                f"Counterexample file must contain a JSON array of test items: {path}"
            )
        self._counterexamples = data
        return data

    def load_responses(self, filepath: str) -> Dict[str, str]:
        """Load pre-generated responses from a JSON file.

        Expected format: {"prompt_text": "response_text", ...}

        Raises:
            ValueError: the top-level JSON value is not an object.
        """
        data = self._read_json(filepath)
        # Scoring looks responses up with `.get`, so anything but a mapping
        # cannot work; report that at load time.
        if not isinstance(data, dict):
            raise ValueError(
                f"Response file must contain a JSON object of prompt -> response: {filepath}"
            )
        return data

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    def score_responses(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score all responses and organise results by category.

        Prompts are loaded from the default prompt file first if none have
        been loaded yet. A prompt with no entry in *responses* is counted as
        missing and not scored.

        Args:
            responses: mapping of prompt text -> response text

        Returns:
            Dict with a timestamp, prompt counters, per-category details and
            averages (``"categories"``), every scored entry
            (``"all_scores"``) and the overall averages (``"overall"``).
        """
        if not self._prompts:
            self.load_prompts()

        results: Dict[str, Any] = {
            "timestamp": self._utc_timestamp(),
            "total_prompts": 0,
            "scored_prompts": 0,
            "missing_responses": 0,
            "categories": {},
            "all_scores": [],
        }

        for category, prompts in self._prompts.items():
            cat_scores: List[Dict[str, Any]] = []
            for prompt in prompts:
                results["total_prompts"] += 1
                response = responses.get(prompt)
                if response is None:
                    results["missing_responses"] += 1
                    continue
                scores = self.metrics.score_reasoning(response)
                results["scored_prompts"] += 1
                entry = {"prompt": prompt, "scores": scores}
                cat_scores.append(entry)
                results["all_scores"].append(entry)

            # _average_scores returns {} for an empty list, so categories
            # without any scored prompt get empty averages.
            results["categories"][category] = {
                "prompts_scored": len(cat_scores),
                "average_scores": self._average_scores([e["scores"] for e in cat_scores]),
                "details": cat_scores,
            }

        results["overall"] = self._average_scores(
            [e["scores"] for e in results["all_scores"]]
        )
        return results

    def score_counterexamples(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score counterexample responses (should identify wrong reasoning).

        Counterexamples are loaded from the default file first if none have
        been loaded yet. A response "contains a refutation" when any of the
        class-level refutation markers occurs in its lower-cased text.

        Args:
            responses: mapping of prompt text -> response text

        Returns:
            Dict with ``"total"`` (number of counterexample items),
            ``"refutation_rate"`` (refuting responses whose item expects a
            refutation, divided by the total, rounded to 4 places) and
            ``"details"`` (one entry per item).
        """
        if not self._counterexamples:
            self.load_counterexamples()

        details: List[Dict[str, Any]] = []
        refutations = 0
        total = 0

        for item in self._counterexamples:
            prompt = item["prompt"]
            expected = item.get("expected", "refutation")
            response = responses.get(prompt, "")
            total += 1

            # Missing and empty responses are both recorded as "not responded".
            if not response:
                details.append({
                    "prompt": prompt,
                    "expected": expected,
                    "responded": False,
                    "contains_refutation": False,
                })
                continue

            resp_lower = response.lower()
            found_refutation = any(m in resp_lower for m in self._REFUTATION_MARKERS)
            if found_refutation and expected == "refutation":
                refutations += 1

            details.append({
                "prompt": prompt,
                "expected": expected,
                "responded": True,
                "contains_refutation": found_refutation,
                "scores": self.metrics.score_reasoning(response),
            })

        return {
            "total": total,
            # max(..., 1) keeps an empty test set from dividing by zero.
            "refutation_rate": round(refutations / max(total, 1), 4),
            "details": details,
        }

    # ------------------------------------------------------------------
    # Comparison
    # ------------------------------------------------------------------

    def compare_models(
        self,
        baseline_responses: Dict[str, str],
        trained_responses: Dict[str, str],
        *,
        threshold: float = 0.01,
    ) -> Dict[str, Any]:
        """Compare baseline vs trained model responses.

        Both response sets are scored with ``score_responses``; deltas are
        ``trained - baseline`` rounded to 4 places.

        Args:
            baseline_responses: prompt -> response mapping of the baseline.
            trained_responses: prompt -> response mapping of the trained model.
            threshold: an overall metric is listed as an improvement when its
                delta exceeds ``threshold`` and as a regression when it is
                below ``-threshold``.

        Returns:
            Dict with both overall averages, per-category baseline/trained/
            delta values, and the ``"improvements"`` / ``"regressions"`` maps.
        """
        baseline_results = self.score_responses(baseline_responses)
        trained_results = self.score_responses(trained_responses)

        comparison: Dict[str, Any] = {
            "timestamp": self._utc_timestamp(),
            "baseline_overall": baseline_results.get("overall", {}),
            "trained_overall": trained_results.get("overall", {}),
            "category_comparison": {},
            "improvements": {},
            "regressions": {},
        }

        for cat, b_data in baseline_results["categories"].items():
            b_avg = b_data["average_scores"]
            t_avg = trained_results["categories"].get(cat, {}).get("average_scores", {})
            comparison["category_comparison"][cat] = {
                "baseline": b_avg,
                "trained": t_avg,
                "delta": self._numeric_deltas(b_avg, t_avg),
            }

        overall_delta = self._numeric_deltas(
            comparison["baseline_overall"], comparison["trained_overall"]
        )
        for k, d in overall_delta.items():
            if d > threshold:
                comparison["improvements"][k] = d
            elif d < -threshold:
                comparison["regressions"][k] = d

        return comparison

    @classmethod
    def _numeric_deltas(cls, baseline: Dict[str, Any], trained: Dict[str, Any]) -> Dict[str, float]:
        """Return ``trained - baseline`` for keys that are numeric in both dicts."""
        return {
            k: round(trained[k] - baseline[k], 4)
            for k in baseline
            if k in trained and cls._is_numeric(baseline[k]) and cls._is_numeric(trained[k])
        }

    # ------------------------------------------------------------------
    # Report formatting
    # ------------------------------------------------------------------

    def format_report(self, results: Dict[str, Any]) -> str:
        """Format evaluation results as a readable text report.

        Args:
            results: a dict shaped like the return value of
                ``score_responses``. Missing keys fall back to defaults.

        Returns:
            The report as one newline-joined string. Categories with no
            averages are omitted; non-numeric score values are skipped.
        """
        lines: List[str] = []
        lines.append("=" * 70)
        lines.append(" CODETTE BENCHMARK EVALUATION REPORT")
        lines.append("=" * 70)
        lines.append(f" Timestamp: {results.get('timestamp', 'N/A')}")
        lines.append(f" Prompts: {results.get('scored_prompts', 0)} scored / "
                     f"{results.get('total_prompts', 0)} total")
        if results.get("missing_responses"):
            lines.append(f" Missing: {results['missing_responses']} responses not found")
        lines.append("")

        overall = results.get("overall", {})
        if overall:
            lines.append("-" * 70)
            lines.append(" OVERALL SCORES")
            lines.append("-" * 70)
            lines.extend(self._score_lines(overall))
            lines.append("")

        for cat, data in results.get("categories", {}).items():
            avg = data.get("average_scores", {})
            if not avg:
                continue
            lines.append("-" * 70)
            lines.append(f" CATEGORY: {cat.upper()}")
            lines.append(f" Prompts scored: {data.get('prompts_scored', 0)}")
            lines.append("-" * 70)
            lines.extend(self._score_lines(avg))
            lines.append("")

        lines.append("=" * 70)
        return "\n".join(lines)

    def _score_lines(self, scores: Dict[str, Any]) -> List[str]:
        """Render one ``name value bar`` line per numeric score, sorted by name."""
        return [
            f" {k:<22s} {v:.4f} {self._bar(v)}"
            for k, v in sorted(scores.items())
            if self._is_numeric(v)
        ]

    def format_comparison_report(self, comparison: Dict[str, Any]) -> str:
        """Format a comparison report between baseline and trained model.

        Args:
            comparison: a dict shaped like the return value of
                ``compare_models``. Missing keys fall back to defaults.

        Returns:
            The report as one newline-joined string.
        """
        lines: List[str] = []
        lines.append("=" * 70)
        lines.append(" MODEL COMPARISON REPORT")
        lines.append("=" * 70)
        lines.append(f" Timestamp: {comparison.get('timestamp', 'N/A')}")
        lines.append("")

        lines.append("-" * 70)
        lines.append(" OVERALL SCORES (baseline -> trained [delta])")
        lines.append("-" * 70)
        b = comparison.get("baseline_overall", {})
        t = comparison.get("trained_overall", {})
        for k in sorted(set(b) | set(t)):
            # A metric present on only one side is shown against 0.
            bv = b.get(k, 0)
            tv = t.get(k, 0)
            # Both sides must be numeric: the row subtracts and float-formats them.
            if not (self._is_numeric(bv) and self._is_numeric(tv)):
                continue
            d = tv - bv
            sign = "+" if d >= 0 else ""
            lines.append(f" {k:<22s} {bv:.4f} -> {tv:.4f} [{sign}{d:.4f}]")

        imp = comparison.get("improvements", {})
        reg = comparison.get("regressions", {})
        if imp:
            lines.append("")
            lines.append(" IMPROVEMENTS:")
            # Largest gain first.
            for k, v in sorted(imp.items(), key=lambda x: -x[1]):
                lines.append(f" + {k}: +{v:.4f}")
        if reg:
            lines.append("")
            lines.append(" REGRESSIONS:")
            # Largest loss (most negative) first.
            for k, v in sorted(reg.items(), key=lambda x: x[1]):
                lines.append(f" - {k}: {v:.4f}")

        lines.append("")
        for cat, data in comparison.get("category_comparison", {}).items():
            delta = data.get("delta", {})
            if not delta:
                continue
            # Only the metric named "overall" is summarised per category;
            # a category without it is reported with a delta of 0.
            overall_d = delta.get("overall", 0)
            sign = "+" if overall_d >= 0 else ""
            lines.append(f" {cat:<18s} overall delta: {sign}{overall_d:.4f}")

        lines.append("")
        lines.append("=" * 70)
        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same text as ``datetime.utcnow().isoformat()`` without
        calling the deprecated ``utcnow``.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    @staticmethod
    def _is_numeric(value: Any) -> bool:
        """Return True when *value* is an int or float."""
        return isinstance(value, (int, float))

    @staticmethod
    def _average_scores(score_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Average numeric values across a list of score dicts.

        Each key is averaged over the dicts in which it holds a numeric
        value; non-numeric values are ignored. Results are rounded to 4
        places and keyed in sorted order. An empty list yields ``{}``.
        """
        if not score_list:
            return {}
        totals: Dict[str, float] = {}
        counts: Dict[str, int] = {}
        for s in score_list:
            for k, v in s.items():
                if isinstance(v, (int, float)):
                    totals[k] = totals.get(k, 0.0) + v
                    counts[k] = counts.get(k, 0) + 1
        return {k: round(totals[k] / counts[k], 4) for k in sorted(totals)}

    @staticmethod
    def _bar(value: float, width: int = 20) -> str:
        """ASCII progress bar that is always ``width`` cells wide.

        *value* is interpreted as a fraction and clamped to ``[0, 1]`` so an
        out-of-range score cannot produce a bar of the wrong length; NaN is
        drawn as an empty bar.
        """
        if value != value:  # NaN is the only value not equal to itself
            fraction = 0.0
        else:
            fraction = min(max(value, 0.0), 1.0)
        filled = int(fraction * width)
        return "[" + "#" * filled + "." * (width - filled) + "]"

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_results(self, results: Dict[str, Any], filepath: str) -> None:
        """Save evaluation results to JSON.

        The parent directory is created if needed. Values the JSON encoder
        cannot serialise are written as their ``str()`` form.
        """
        # dirname is "" for a bare filename; fall back to the current directory.
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, default=str)

    @staticmethod
    def load_results(filepath: str) -> Dict[str, Any]:
        """Load evaluation results from JSON."""
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
|
|
|
|
| |
| |
| |
|
|
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: score a response file and print reports.

    Steps, in order: load the prompt file, load and score the responses,
    print the evaluation report, optionally run the counterexample tests
    (``--counterexamples``), optionally compare against a baseline response
    file (``--baseline``), and optionally save everything to JSON
    (``--output``).

    Args:
        argv: Argument list to parse instead of the process command line.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            calling ``main()`` behaves as before; passing a list lets the
            CLI be driven programmatically.
    """
    parser = argparse.ArgumentParser(
        description="Codette Benchmark Runner - evaluate model reasoning quality"
    )
    parser.add_argument(
        "--responses", "-r",
        required=True,
        help="Path to JSON file with pre-generated responses (prompt -> response)",
    )
    parser.add_argument(
        "--prompts-dir", "-p",
        default=None,
        help="Directory containing prompt JSON files (default: evaluation/prompts/)",
    )
    parser.add_argument(
        "--baseline", "-b",
        default=None,
        help="Path to baseline responses JSON for comparison",
    )
    parser.add_argument(
        "--output", "-o",
        default=None,
        help="Save results to this JSON file",
    )
    parser.add_argument(
        "--counterexamples", "-c",
        action="store_true",
        help="Also run counterexample tests",
    )
    parser.add_argument(
        "--prompts-file",
        default="reasoning_tests.json",
        help="Prompt file name inside prompts dir (default: reasoning_tests.json)",
    )

    args = parser.parse_args(argv)

    runner = BenchmarkRunner(prompts_dir=args.prompts_dir)
    runner.load_prompts(args.prompts_file)

    print(f"Loading responses from: {args.responses}")
    responses = runner.load_responses(args.responses)
    print(f" Loaded {len(responses)} responses")

    # Main evaluation over the categorised prompts.
    print("\nScoring responses...")
    results = runner.score_responses(responses)
    print(runner.format_report(results))

    # Counterexample prompts are looked up in the same responses mapping.
    if args.counterexamples:
        print("\nRunning counterexample tests...")
        runner.load_counterexamples()
        ce_results = runner.score_counterexamples(responses)
        print(f" Refutation detection rate: {ce_results['refutation_rate']:.2%}")
        results["counterexamples"] = ce_results

    # The responses scored above play the "trained" role in the comparison.
    if args.baseline:
        print(f"\nLoading baseline from: {args.baseline}")
        baseline = runner.load_responses(args.baseline)
        comparison = runner.compare_models(baseline, responses)
        print(runner.format_comparison_report(comparison))
        results["comparison"] = comparison

    # The saved file includes the counterexample and comparison sections
    # when those steps ran.
    if args.output:
        runner.save_results(results, args.output)
        print(f"\nResults saved to: {args.output}")
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger it.
if __name__ == "__main__":
    main()
|
|