Codette-Reasoning / evaluation /benchmark_runner.py

Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 about 24 hours ago

16.3 kB

	"""
	Benchmark Runner - loads test prompts, runs/loads responses, scores them,
	and produces detailed evaluation reports.

	Supports:
	- Loading prompts from JSON files in evaluation/prompts/
	- Pre-generated response files (JSON mapping prompt -> response)
	- Scoring via ReasoningMetrics
	- Per-category and overall reports
	- Baseline vs trained model comparison
	- CLI interface
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Any, Dict, List, Optional

	# Put the project root on the import path so that ``evaluation.*`` resolves
	# whether this file is launched from the project root or from evaluation/.
	_THIS_DIR = Path(__file__).resolve().parent
	_PROJECT_ROOT = _THIS_DIR.parent
	if sys.path.count(str(_PROJECT_ROOT)) == 0:
	    # Front of the list: the project's own packages win over same-named ones.
	    sys.path.insert(0, str(_PROJECT_ROOT))

	from evaluation.reasoning_metrics import ReasoningMetrics


	# ---------------------------------------------------------------------------
	# Benchmark Runner
	# ---------------------------------------------------------------------------

	class BenchmarkRunner:
	    """Load categorised prompts, score model responses, and build reports.

	    Scoring is delegated to ``self.metrics.score_reasoning(response)``, whose
	    result is treated as a mapping of metric name -> value. Only ``int`` and
	    ``float`` values take part in averages, deltas and rendered reports;
	    anything else is carried through untouched in the per-prompt details.
	    """

	    # Substrings that, when present in a lower-cased response, are taken as
	    # evidence that the response pushes back on the prompt's premise.
	    _REFUTATION_MARKERS = (
	        "not true", "incorrect", "misconception", "actually",
	        "contrary", "doesn't", "does not", "false", "myth",
	        "wrong", "mistake", "no,", "in fact", "however",
	        "this is a common", "oversimplification", "nuanced",
	        "not necessarily", "depends on", "more complex",
	    )

	    def __init__(
	        self,
	        prompts_dir: Optional[str] = None,
	        metrics: Optional[ReasoningMetrics] = None,
	    ):
	        """Create a runner.

	        Args:
	            prompts_dir: directory holding the prompt JSON files; defaults to
	                the ``prompts`` directory next to this module.
	            metrics: scorer exposing ``score_reasoning(text)``; a fresh
	                ``ReasoningMetrics`` is created when omitted.
	        """
	        self.prompts_dir = Path(prompts_dir) if prompts_dir else _THIS_DIR / "prompts"
	        # Test against None so a caller-supplied scorer is never thrown away
	        # merely because it evaluates as falsy.
	        self.metrics = metrics if metrics is not None else ReasoningMetrics()
	        self._prompts: Dict[str, List[str]] = {}
	        self._counterexamples: List[Dict[str, str]] = []

	    # -- loading -----------------------------------------------------------

	    @staticmethod
	    def _read_json(path: Any) -> Any:
	        """Parse the UTF-8 JSON file at *path* and return its content."""
	        with open(path, "r", encoding="utf-8") as f:
	            return json.load(f)

	    def load_prompts(self, filename: str = "reasoning_tests.json") -> Dict[str, List[str]]:
	        """Load categorised prompts from a JSON file in ``prompts_dir``.

	        Expected format: {"category": ["prompt1", "prompt2", ...], ...}

	        The parsed data is cached on the instance and also returned.

	        Raises:
	            FileNotFoundError: if the file does not exist.
	        """
	        path = self.prompts_dir / filename
	        if not path.exists():
	            raise FileNotFoundError(f"Prompt file not found: {path}")
	        data = self._read_json(path)
	        self._prompts = data
	        return data

	    def load_counterexamples(self, filename: str = "counterexample_tests.json") -> List[Dict[str, str]]:
	        """Load counterexample test prompts from a JSON file in ``prompts_dir``.

	        Each item is read by ``score_counterexamples`` as a dict with a
	        ``"prompt"`` key and an optional ``"expected"`` key. The parsed data
	        is cached on the instance and also returned.

	        Raises:
	            FileNotFoundError: if the file does not exist.
	        """
	        path = self.prompts_dir / filename
	        if not path.exists():
	            raise FileNotFoundError(f"Counterexample file not found: {path}")
	        data = self._read_json(path)
	        self._counterexamples = data
	        return data

	    def load_responses(self, filepath: str) -> Dict[str, str]:
	        """Load pre-generated responses from a JSON file.

	        Expected format: {"prompt_text": "response_text", ...}
	        """
	        return self._read_json(filepath)

	    # -- scoring -----------------------------------------------------------

	    @staticmethod
	    def _utc_timestamp() -> str:
	        """Return the current UTC time as a naive ISO-8601 string."""
	        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
	        # "now" and drop tzinfo so the string keeps its historical shape
	        # (no "+00:00" suffix) for anything that parses saved reports.
	        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

	    def score_responses(
	        self,
	        responses: Dict[str, str],
	    ) -> Dict[str, Any]:
	        """Score all responses and organise results by category.

	        Prompts are loaded from the default prompt file if none are cached.
	        A prompt whose text is not a key of *responses* (or maps to ``None``)
	        is counted as missing and not scored.

	        Args:
	            responses: mapping of prompt text -> response text

	        Returns:
	            Dict with per-prompt scores, per-category averages, and overall.
	        """
	        if not self._prompts:
	            self.load_prompts()

	        results: Dict[str, Any] = {
	            "timestamp": self._utc_timestamp(),
	            "total_prompts": 0,
	            "scored_prompts": 0,
	            "missing_responses": 0,
	            "categories": {},
	            "all_scores": [],
	        }

	        for category, prompts in self._prompts.items():
	            cat_scores: List[Dict[str, Any]] = []
	            for prompt in prompts:
	                results["total_prompts"] += 1
	                response = responses.get(prompt)
	                if response is None:
	                    results["missing_responses"] += 1
	                    continue
	                entry = {"prompt": prompt, "scores": self.metrics.score_reasoning(response)}
	                results["scored_prompts"] += 1
	                cat_scores.append(entry)
	                results["all_scores"].append(entry)

	            # Category averages (_average_scores yields {} for an empty list)
	            results["categories"][category] = {
	                "prompts_scored": len(cat_scores),
	                "average_scores": self._average_scores([e["scores"] for e in cat_scores]),
	                "details": cat_scores,
	            }

	        # Overall averages
	        results["overall"] = self._average_scores(
	            [e["scores"] for e in results["all_scores"]]
	        )
	        return results

	    def score_counterexamples(
	        self,
	        responses: Dict[str, str],
	    ) -> Dict[str, Any]:
	        """Score counterexample responses (should identify wrong reasoning).

	        Counterexamples are loaded from the default file if none are cached.
	        A response "contains a refutation" when its lower-cased text includes
	        any marker substring. ``refutation_rate`` is the number of responses
	        that contain a refutation *and* whose item expects ``"refutation"``
	        (the default), divided by the number of all counterexample items,
	        answered or not.

	        Args:
	            responses: mapping of prompt text -> response text

	        Returns:
	            Dict with ``total``, ``refutation_rate`` and per-item ``details``.
	        """
	        if not self._counterexamples:
	            self.load_counterexamples()

	        details: List[Dict[str, Any]] = []
	        refutations = 0

	        for item in self._counterexamples:
	            prompt = item["prompt"]
	            expected = item.get("expected", "refutation")
	            response = responses.get(prompt, "")

	            if not response:
	                details.append({
	                    "prompt": prompt,
	                    "expected": expected,
	                    "responded": False,
	                    "contains_refutation": False,
	                })
	                continue

	            resp_lower = response.lower()
	            found_refutation = any(m in resp_lower for m in self._REFUTATION_MARKERS)
	            if found_refutation and expected == "refutation":
	                refutations += 1

	            details.append({
	                "prompt": prompt,
	                "expected": expected,
	                "responded": True,
	                "contains_refutation": found_refutation,
	                "scores": self.metrics.score_reasoning(response),
	            })

	        total = len(self._counterexamples)
	        return {
	            "total": total,
	            # max(..., 1) guards the division when there are no items.
	            "refutation_rate": round(refutations / max(total, 1), 4),
	            "details": details,
	        }

	    # -- comparison --------------------------------------------------------

	    @staticmethod
	    def _numeric_delta(baseline: Dict[str, Any], trained: Dict[str, Any]) -> Dict[str, float]:
	        """Return ``trained - baseline`` for every metric numeric on both sides."""
	        return {
	            k: round(trained[k] - b, 4)
	            for k, b in baseline.items()
	            if k in trained
	            and isinstance(b, (int, float))
	            and isinstance(trained[k], (int, float))
	        }

	    def compare_models(
	        self,
	        baseline_responses: Dict[str, str],
	        trained_responses: Dict[str, str],
	        threshold: float = 0.01,
	    ) -> Dict[str, Any]:
	        """Compare baseline vs trained model responses.

	        Both response sets are scored with ``score_responses``; deltas are
	        ``trained - baseline`` rounded to four decimals.

	        Args:
	            baseline_responses: prompt text -> baseline model response
	            trained_responses: prompt text -> trained model response
	            threshold: an overall metric is listed under ``improvements``
	                when its delta exceeds ``threshold`` and under
	                ``regressions`` when it is below ``-threshold``.

	        Returns:
	            Dict with overall averages for both models, a per-category
	            comparison, and the improvements / regressions maps.
	        """
	        baseline_results = self.score_responses(baseline_responses)
	        trained_results = self.score_responses(trained_responses)

	        comparison: Dict[str, Any] = {
	            "timestamp": self._utc_timestamp(),
	            "baseline_overall": baseline_results.get("overall", {}),
	            "trained_overall": trained_results.get("overall", {}),
	            "category_comparison": {},
	            "improvements": {},
	            "regressions": {},
	        }

	        # Per-category delta
	        for cat, cat_data in baseline_results["categories"].items():
	            b_avg = cat_data["average_scores"]
	            t_avg = trained_results["categories"].get(cat, {}).get("average_scores", {})
	            comparison["category_comparison"][cat] = {
	                "baseline": b_avg,
	                "trained": t_avg,
	                "delta": self._numeric_delta(b_avg, t_avg),
	            }

	        # Overall delta
	        overall_delta = self._numeric_delta(
	            comparison["baseline_overall"], comparison["trained_overall"]
	        )
	        for k, d in overall_delta.items():
	            if d > threshold:
	                comparison["improvements"][k] = d
	            elif d < -threshold:
	                comparison["regressions"][k] = d

	        return comparison

	    # -- report ------------------------------------------------------------

	    def _score_lines(self, scores: Dict[str, Any]) -> List[str]:
	        """Render one ``name value bar`` line per numeric score, sorted by name."""
	        return [
	            f" {k:<22s} {v:.4f} {self._bar(v)}"
	            for k, v in sorted(scores.items())
	            # Accept ints as well as floats, matching the averaging and
	            # comparison code, so integer scores are not silently dropped.
	            if isinstance(v, (int, float))
	        ]

	    def format_report(self, results: Dict[str, Any]) -> str:
	        """Format evaluation results as a readable text report.

	        Args:
	            results: a dict shaped like the return value of
	                ``score_responses``; missing keys fall back to defaults.

	        Returns:
	            The report as a single newline-joined string.
	        """
	        rule_heavy = "=" * 70
	        rule_light = "-" * 70

	        lines: List[str] = [
	            rule_heavy,
	            " CODETTE BENCHMARK EVALUATION REPORT",
	            rule_heavy,
	            f" Timestamp: {results.get('timestamp', 'N/A')}",
	            f" Prompts: {results.get('scored_prompts', 0)} scored / "
	            f"{results.get('total_prompts', 0)} total",
	        ]
	        if results.get("missing_responses"):
	            lines.append(f" Missing: {results['missing_responses']} responses not found")
	        lines.append("")

	        # Overall
	        overall = results.get("overall", {})
	        if overall:
	            lines.extend([rule_light, " OVERALL SCORES", rule_light])
	            lines.extend(self._score_lines(overall))
	            lines.append("")

	        # Per-category (categories without any averaged score are skipped)
	        for cat, data in results.get("categories", {}).items():
	            avg = data.get("average_scores", {})
	            if not avg:
	                continue
	            lines.extend([
	                rule_light,
	                f" CATEGORY: {cat.upper()}",
	                f" Prompts scored: {data.get('prompts_scored', 0)}",
	                rule_light,
	            ])
	            lines.extend(self._score_lines(avg))
	            lines.append("")

	        lines.append(rule_heavy)
	        return "\n".join(lines)

	    def format_comparison_report(self, comparison: Dict[str, Any]) -> str:
	        """Format a comparison report between baseline and trained model.

	        Args:
	            comparison: a dict shaped like the return value of
	                ``compare_models``; missing keys fall back to defaults.

	        Returns:
	            The report as a single newline-joined string.
	        """
	        rule_heavy = "=" * 70
	        rule_light = "-" * 70

	        lines: List[str] = [
	            rule_heavy,
	            " MODEL COMPARISON REPORT",
	            rule_heavy,
	            f" Timestamp: {comparison.get('timestamp', 'N/A')}",
	            "",
	            # Overall
	            rule_light,
	            " OVERALL SCORES (baseline -> trained [delta])",
	            rule_light,
	        ]
	        b = comparison.get("baseline_overall", {})
	        t = comparison.get("trained_overall", {})
	        for k in sorted(set(b) | set(t)):
	            # A metric present on only one side is shown against 0.
	            bv = b.get(k, 0)
	            tv = t.get(k, 0)
	            # Both sides must be numeric, otherwise the subtraction and
	            # the ".4f" formatting below would raise.
	            if not isinstance(bv, (int, float)) or not isinstance(tv, (int, float)):
	                continue
	            d = tv - bv
	            sign = "+" if d >= 0 else ""
	            lines.append(f" {k:<22s} {bv:.4f} -> {tv:.4f} [{sign}{d:.4f}]")

	        # Improvements / regressions, largest magnitude first
	        imp = comparison.get("improvements", {})
	        reg = comparison.get("regressions", {})
	        if imp:
	            lines.append("")
	            lines.append(" IMPROVEMENTS:")
	            for k, v in sorted(imp.items(), key=lambda x: -x[1]):
	                lines.append(f" + {k}: +{v:.4f}")
	        if reg:
	            lines.append("")
	            lines.append(" REGRESSIONS:")
	            for k, v in sorted(reg.items(), key=lambda x: x[1]):
	                lines.append(f" - {k}: {v:.4f}")

	        # Per-category: only the delta of the metric named "overall" is
	        # shown; a category whose delta lacks that metric prints 0.
	        lines.append("")
	        for cat, data in comparison.get("category_comparison", {}).items():
	            delta = data.get("delta", {})
	            if not delta:
	                continue
	            overall_d = delta.get("overall", 0)
	            sign = "+" if overall_d >= 0 else ""
	            lines.append(f" {cat:<18s} overall delta: {sign}{overall_d:.4f}")

	        lines.append("")
	        lines.append(rule_heavy)
	        return "\n".join(lines)

	    # -- helpers -----------------------------------------------------------

	    @staticmethod
	    def _average_scores(score_list: List[Dict[str, float]]) -> Dict[str, float]:
	        """Average numeric values across a list of score dicts.

	        Each key is averaged over the dicts in which it holds an ``int`` or
	        ``float``; non-numeric values are ignored. The result is keyed in
	        sorted order with values rounded to four decimals.
	        """
	        if not score_list:
	            return {}
	        totals: Dict[str, float] = {}
	        counts: Dict[str, int] = {}
	        for s in score_list:
	            for k, v in s.items():
	                if isinstance(v, (int, float)):
	                    totals[k] = totals.get(k, 0.0) + v
	                    counts[k] = counts.get(k, 0) + 1
	        return {k: round(totals[k] / counts[k], 4) for k in sorted(totals)}

	    @staticmethod
	    def _bar(value: float, width: int = 20) -> str:
	        """ASCII progress bar that is always exactly ``width`` cells wide.

	        *value* is read as a fraction of the bar: anything >= 1 fills it,
	        anything <= 0 (or NaN) leaves it empty.
	        """
	        if value >= 1.0:
	            filled = width
	        elif value > 0.0:
	            filled = int(value * width)
	        else:
	            # Negative values and NaN (every comparison above is False for
	            # NaN) land here instead of overflowing the bar or crashing int().
	            filled = 0
	        return "[" + "#" * filled + "." * (width - filled) + "]"

	    # -- save / load results -----------------------------------------------

	    def save_results(self, results: Dict[str, Any], filepath: str) -> None:
	        """Save evaluation results to JSON, creating parent directories.

	        Values the JSON encoder cannot handle are written as ``str(value)``.
	        """
	        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
	        with open(filepath, "w", encoding="utf-8") as f:
	            # default=str converts non-serialisable types
	            json.dump(results, f, indent=2, default=str)

	    @staticmethod
	    def load_results(filepath: str) -> Dict[str, Any]:
	        """Load evaluation results from JSON."""
	        with open(filepath, "r", encoding="utf-8") as f:
	            return json.load(f)


	# ---------------------------------------------------------------------------
	# CLI
	# ---------------------------------------------------------------------------

	def main() -> None:
	    """Command-line entry point: score a response file and print the reports.

	    Loads the prompt file and the response file, prints the evaluation
	    report, optionally runs the counterexample tests (``--counterexamples``)
	    and a baseline comparison (``--baseline``), and optionally writes the
	    combined results to ``--output`` as JSON.
	    """
	    cli = argparse.ArgumentParser(
	        description="Codette Benchmark Runner - evaluate model reasoning quality"
	    )
	    # (flags, keyword options) in the order they appear in --help.
	    option_specs = [
	        (("--responses", "-r"), {
	            "required": True,
	            "help": "Path to JSON file with pre-generated responses (prompt -> response)",
	        }),
	        (("--prompts-dir", "-p"), {
	            "default": None,
	            "help": "Directory containing prompt JSON files (default: evaluation/prompts/)",
	        }),
	        (("--baseline", "-b"), {
	            "default": None,
	            "help": "Path to baseline responses JSON for comparison",
	        }),
	        (("--output", "-o"), {
	            "default": None,
	            "help": "Save results to this JSON file",
	        }),
	        (("--counterexamples", "-c"), {
	            "action": "store_true",
	            "help": "Also run counterexample tests",
	        }),
	        (("--prompts-file",), {
	            "default": "reasoning_tests.json",
	            "help": "Prompt file name inside prompts dir (default: reasoning_tests.json)",
	        }),
	    ]
	    for flags, options in option_specs:
	        cli.add_argument(*flags, **options)
	    opts = cli.parse_args()

	    bench = BenchmarkRunner(prompts_dir=opts.prompts_dir)
	    bench.load_prompts(opts.prompts_file)

	    print(f"Loading responses from: {opts.responses}")
	    model_responses = bench.load_responses(opts.responses)
	    print(f" Loaded {len(model_responses)} responses")

	    # Main evaluation
	    print("\nScoring responses...")
	    summary = bench.score_responses(model_responses)
	    print(bench.format_report(summary))

	    # Optional: counterexample tests, attached to the saved results
	    if opts.counterexamples:
	        print("\nRunning counterexample tests...")
	        bench.load_counterexamples()
	        ce_summary = bench.score_counterexamples(model_responses)
	        print(f" Refutation detection rate: {ce_summary['refutation_rate']:.2%}")
	        summary["counterexamples"] = ce_summary

	    # Optional: baseline comparison, attached to the saved results
	    if opts.baseline:
	        print(f"\nLoading baseline from: {opts.baseline}")
	        baseline_responses = bench.load_responses(opts.baseline)
	        diff = bench.compare_models(baseline_responses, model_responses)
	        print(bench.format_comparison_report(diff))
	        summary["comparison"] = diff

	    # Optional: persist everything gathered above
	    if opts.output:
	        bench.save_results(summary, opts.output)
	        print(f"\nResults saved to: {opts.output}")


	if __name__ == "__main__":
	    # Run the CLI only when executed as a script, never on import.
	    main()