| """ |
| Conflict Detection Test Suite |
| |
| 10-15 conflict-triggering prompts designed to elicit specific agent disagreements. |
| Tests forge_with_debate() with conflict detection enabled and measures outcomes. |
| |
| Author: Claude Code |
| """ |
|
|
| import csv |
| import json |
| from typing import Dict, List, Optional |
| from dataclasses import dataclass |
| from statistics import mean |
|
|
| |
# Test prompts engineered to provoke disagreement between specific agent
# pairs. Each entry carries:
#   query                     -- the prompt text sent to the forge engine
#   expected_conflict         -- label of the disagreement we expect to see
#   agents_likely_to_conflict -- the two agents expected to clash
#   description               -- short human-readable summary of the tension
CONFLICT_PROMPTS = [
    {
        "query": "Should we optimize an algorithm to run 10x faster if it reduces interpretability by 80%?",
        "expected_conflict": "ethics_vs_efficiency",
        "agents_likely_to_conflict": ["ethics", "systems_architecture"],
        "description": "Efficiency vs transparency tradeoff",
    },
    {
        "query": "Is a probabilistic solution more insightful than a deterministic one for explaining human decision-making?",
        "expected_conflict": "quantum_vs_newton",
        "agents_likely_to_conflict": ["quantum", "newton"],
        "description": "Probabilistic vs mechanistic explanation",
    },
    {
        "query": "In designing an AI system, should we prioritize consciousness theory or engineering reliability?",
        "expected_conflict": "philosophy_vs_systems",
        "agents_likely_to_conflict": ["philosophy", "systems_architecture"],
        "description": "Theoretical depth vs practical robustness",
    },
    {
        "query": "Is breaking logical rules ever justified in creative problem-solving?",
        "expected_conflict": "davinci_vs_newton",
        "agents_likely_to_conflict": ["davinci", "newton"],
        "description": "Creativity vs logical consistency",
    },
    {
        "query": "Should medical diagnosis weigh patient emotional state equally with biomarkers?",
        "expected_conflict": "empathy_vs_newton",
        "agents_likely_to_conflict": ["empathy", "newton"],
        "description": "Holistic vs reductionist medicine",
    },
    {
        "query": "Is uncertainty in a system a bug to eliminate or a feature to leverage?",
        "expected_conflict": "quantum_vs_systems",
        "agents_likely_to_conflict": ["quantum", "systems_architecture"],
        "description": "Embracing vs reducing uncertainty",
    },
    {
        "query": "Should AI systems be trained to always maximize efficiency or to leave space for unexpected behaviors?",
        "expected_conflict": "newton_vs_davinci",
        "agents_likely_to_conflict": ["newton", "davinci"],
        "description": "Optimization vs emergence",
    },
    {
        "query": "Is empathy a strength or a weakness in decision-making systems?",
        "expected_conflict": "empathy_vs_ethics",
        "agents_likely_to_conflict": ["empathy", "ethics"],
        "description": "Emotional connection vs principled rules",
    },
    {
        "query": "Should we prefer explanations that preserve mathematical elegance or human understanding?",
        "expected_conflict": "philosophy_vs_empathy",
        "agents_likely_to_conflict": ["philosophy", "empathy"],
        "description": "Aesthetic vs communicative clarity",
    },
    {
        "query": "Can a system be simultaneously more creative and more reliable?",
        "expected_conflict": "davinci_vs_systems",
        "agents_likely_to_conflict": ["davinci", "systems_architecture"],
        "description": "Innovation vs stability",
    },
    {
        "query": "Should resource allocation prioritize current needs or future possibilities?",
        "expected_conflict": "newton_vs_philosophy",
        "agents_likely_to_conflict": ["newton", "philosophy"],
        "description": "Practical vs speculative",
    },
    {
        "query": "Is it more important for an explanation to be complete or to be useful?",
        "expected_conflict": "philosophy_vs_davinci",
        "agents_likely_to_conflict": ["philosophy", "davinci"],
        "description": "Comprehensiveness vs pragmatism",
    },
]
|
|
|
|
@dataclass
class ConflictTestResult:
    """Metrics collected from running a single conflict-test prompt.

    Round 0 refers to the agents' initial analysis; round 1 to the state
    after one debate round.
    """

    query: str                          # the prompt text that was tested
    expected_conflict: str              # conflict label the prompt targets
    round_0_conflict_count: int         # conflicts detected before debate
    round_1_conflict_count: int         # conflicts remaining after debate
    avg_conflict_strength_r0: float     # mean conflict strength, round 0
    avg_conflict_strength_r1: float     # mean conflict strength, round 1
    conflict_resolution_rate: float     # fraction of conflicts resolved
    ensemble_coherence: float           # coherence score of final ensemble
    debate_tension_decay: float         # decay rate of tension across rounds
    detected_conflicts: List[Dict]      # raw conflict records from metadata
    success: bool                       # False when the forge call raised
|
|
|
|
class ConflictTestRunner:
    """Runner for conflict detection tests.

    Feeds each CONFLICT_PROMPTS entry through the forge engine's
    forge_with_debate(), extracts conflict metrics from the returned
    metadata, exports them to CSV, and prints a summary.
    """

    def __init__(self, forge_engine):
        """
        Initialize test runner.

        Args:
            forge_engine: ForgeEngine instance with conflict detection enabled
        """
        self.forge = forge_engine

    def run_test(self, prompt_dict: Dict) -> ConflictTestResult:
        """
        Run a single test prompt through forge_with_debate.

        Args:
            prompt_dict: Dict with query, expected_conflict, agents_likely_to_conflict

        Returns:
            ConflictTestResult with metrics; on any exception a zeroed-out
            result with success=False is returned instead of propagating.
        """
        query = prompt_dict["query"]
        expected_conflict = prompt_dict["expected_conflict"]

        try:
            result = self.forge.forge_with_debate(query, debate_rounds=1)

            metadata = result.get("metadata", {})
            debates = metadata.get("debate_log", [])

            # Defaults when the debate log lacks the expected entries.
            round_0_conflicts = 0
            round_1_conflicts = 0
            avg_strength_r0 = 0.0
            avg_strength_r1 = 0.0
            resolution_rate = 0.0

            for debate_entry in debates:
                if debate_entry.get("type") == "initial_analysis":
                    # Round 0: conflicts found in the agents' first pass.
                    round_0_conflicts = debate_entry.get("conflicts_detected", 0)
                    summary = debate_entry.get("conflict_strength_summary", {})
                    if round_0_conflicts > 0:
                        avg_strength_r0 = summary.get("avg_conflict_strength", 0.0)

                elif debate_entry.get("type") == "debate":
                    # Round 1: conflicts remaining after one debate round.
                    round_1_conflicts = debate_entry.get("conflicts_detected_after", 0)
                    res_metrics = debate_entry.get("resolution_metrics", {})
                    if res_metrics:
                        resolution_rate = res_metrics.get("resolution_rate", 0.0)
                        summary = res_metrics.get("conflict_strength_summary", {})
                        if round_1_conflicts > 0:
                            avg_strength_r1 = summary.get("avg_conflict_strength", 0.0)

            ensemble_coherence = metadata.get("ensemble_coherence", 0.0)
            # tension_decay may be absent or a non-dict sentinel; guard both.
            tension_decay_info = metadata.get("tension_decay", {})
            tension_decay = tension_decay_info.get("decay_rate", 0.0) if isinstance(tension_decay_info, dict) else 0.0

            detected = metadata.get("conflicts_detected", [])

            return ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=round_0_conflicts,
                round_1_conflict_count=round_1_conflicts,
                avg_conflict_strength_r0=avg_strength_r0,
                avg_conflict_strength_r1=avg_strength_r1,
                conflict_resolution_rate=resolution_rate,
                ensemble_coherence=ensemble_coherence,
                debate_tension_decay=tension_decay,
                detected_conflicts=detected,
                success=True,
            )

        except Exception as e:
            # Best-effort: record the failure and keep the suite running.
            print(f"ERROR in test '{query[:50]}...': {e}")
            return ConflictTestResult(
                query=query,
                expected_conflict=expected_conflict,
                round_0_conflict_count=0,
                round_1_conflict_count=0,
                avg_conflict_strength_r0=0.0,
                avg_conflict_strength_r1=0.0,
                conflict_resolution_rate=0.0,
                ensemble_coherence=0.0,
                debate_tension_decay=0.0,
                detected_conflicts=[],
                success=False,
            )

    def run_all_tests(self, output_csv: str = "conflict_test_results.csv") -> List[ConflictTestResult]:
        """
        Run all test prompts.

        Args:
            output_csv: CSV file to export results

        Returns:
            List of ConflictTestResult
        """
        results = []

        print(f"\n{'='*80}")
        print("PHASE 1: CONFLICT DETECTION TEST SUITE")
        print(f"{'='*80}\n")

        for idx, prompt_dict in enumerate(CONFLICT_PROMPTS, 1):
            print(f"\n[Test {idx}/{len(CONFLICT_PROMPTS)}] {prompt_dict['description']}")
            print(f"  Query: {prompt_dict['query'][:80]}...")

            result = self.run_test(prompt_dict)
            results.append(result)

            if result.success:
                print("  ✓ Success")
                print(f"    - Conflicts detected (R0): {result.round_0_conflict_count}")
                print(f"    - Conflicts detected (R1): {result.round_1_conflict_count}")
                print(f"    - Resolution rate: {result.conflict_resolution_rate:.2%}")
                print(f"    - Ensemble coherence: {result.ensemble_coherence:.3f}")
                print(f"    - Tension decay: {result.debate_tension_decay:.3f}")
            else:
                print("  ✗ FAILED")

        self._export_csv(results, output_csv)

        print(f"\n{'='*80}")
        self._print_summary(results)
        print(f"{'='*80}\n")

        return results

    def _export_csv(self, results: List[ConflictTestResult], filename: str):
        """Export results to CSV. Export errors are reported, not raised."""
        try:
            with open(filename, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow([
                    "query",
                    "expected_conflict",
                    "round_0_conflicts",
                    "round_1_conflicts",
                    "avg_strength_r0",
                    "avg_strength_r1",
                    "resolution_rate",
                    "ensemble_coherence",
                    "tension_decay",
                    "success",
                ])
                for r in results:
                    writer.writerow([
                        r.query[:100],  # truncate long queries for readability
                        r.expected_conflict,
                        r.round_0_conflict_count,
                        r.round_1_conflict_count,
                        f"{r.avg_conflict_strength_r0:.3f}",
                        f"{r.avg_conflict_strength_r1:.3f}",
                        f"{r.conflict_resolution_rate:.3f}",
                        f"{r.ensemble_coherence:.3f}",
                        f"{r.debate_tension_decay:.3f}",
                        r.success,
                    ])
            # BUGFIX: previously printed a literal placeholder instead of
            # the actual output path.
            print(f"\nResults exported to: {filename}")
        except Exception as e:
            print(f"Error exporting CSV: {e}")

    def _print_summary(self, results: List[ConflictTestResult]):
        """Print test summary statistics."""
        successful = [r for r in results if r.success]
        if not successful:
            print("\nNo tests completed successfully!")
            return

        print("\nTEST SUMMARY")
        print(f"  Total tests: {len(results)}")
        print(f"  Successful: {len(successful)}")
        print(f"  Failed: {len(results) - len(successful)}")

        print("\nCONFLICT DETECTION METRICS")
        print(f"  Avg conflicts (R0): {mean(r.round_0_conflict_count for r in successful):.1f}")
        print(f"  Avg conflicts (R1): {mean(r.round_1_conflict_count for r in successful):.1f}")
        # BUGFIX: mean() raises StatisticsError on empty data; guard the
        # filtered list for the case where no test detected any conflict.
        strengths_r0 = [r.avg_conflict_strength_r0 for r in successful if r.avg_conflict_strength_r0 > 0]
        if strengths_r0:
            print(f"  Avg conflict strength (R0): {mean(strengths_r0):.3f}")
        else:
            print("  Avg conflict strength (R0): n/a (no conflicts detected)")
        print(f"  Avg resolution rate: {mean(r.conflict_resolution_rate for r in successful):.1%}")

        print("\nEPISTEMIC METRICS")
        print(f"  Avg ensemble coherence: {mean(r.ensemble_coherence for r in successful):.3f}")
        print(f"  Avg tension decay: {mean(r.debate_tension_decay for r in successful):.3f}")

        print("\nSUCCESS CRITERIA")
        conflicts_detected = sum(1 for r in successful if r.round_0_conflict_count > 0)
        resolution_positive = sum(1 for r in successful if r.conflict_resolution_rate > 0)
        coherence_good = sum(1 for r in successful if r.ensemble_coherence > 0.5)

        print(f"  ✓ Conflicts detected: {conflicts_detected}/{len(successful)}")
        print(f"  ✓ Resolution attempts: {resolution_positive}/{len(successful)}")
        print(f"  ✓ Coherence > 0.5: {coherence_good}/{len(successful)}")
|
|
|
|
| |
| |
| |
|
|
| if __name__ == "__main__": |
| |
| |
| |
| |
| |
|
|
| import sys |
|
|
| print("To run tests:") |
| print(" 1. Ensure ForgeEngine is initialized with conflict detection") |
| print(" 2. Create runner: runner = ConflictTestRunner(forge)") |
| print(" 3. Run: results = runner.run_all_tests()") |
| print("\nExample:") |
| print(" from reasoning_forge.forge_engine import ForgeEngine") |
| print(" from evaluation.conflict_tests import ConflictTestRunner") |
| print(" forge = ForgeEngine()") |
| print(" runner = ConflictTestRunner(forge)") |
| print(" results = runner.run_all_tests('phase1_results.csv')") |
|
|