"""Verbose Evaluation Runner — See Real-Time Agent Thinking

Shows exactly what agents are thinking as they reason through each question.

Usage:
    python evaluation/run_evaluation_verbose.py --questions 1
"""

import logging
import os
import sys
from pathlib import Path

# Must be set before any project module is imported, so agents see the flag.
os.environ['CODETTE_VERBOSE'] = '1'

# DEBUG-level logging to stdout, so agent reasoning interleaves with prints.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(name)-20s | %(levelname)-8s | %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
)

# Make sibling project packages importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))

from evaluation.test_suite_evaluation import (
    EvaluationHarness,
    EVALUATION_TEST_SUITE,
)
|
|
|
|
def run_verbose_evaluation(num_questions: int = 1) -> bool:
    """Run evaluation with full real-time agent visibility.

    Loads the ForgeEngine, wraps it in an ``EvaluationHarness``, and runs the
    first *num_questions* questions of ``EVALUATION_TEST_SUITE`` while the
    agents' debate output streams to stdout (``CODETTE_VERBOSE`` is enabled at
    module import time).

    Args:
        num_questions: Number of questions from the suite to run (default 1).

    Returns:
        True when every stage completed; False on any error. Errors are
        printed (with traceback) instead of raised so the CLI wrapper can
        translate the result into an exit status.
    """
    print("\n" + "=" * 100)
    print("CODETTE VERBOSE EVALUATION — REAL-TIME AGENT THINKING")
    print("=" * 100)
    print(f"Questions: {num_questions}")
    print("Verbose mode: ON (see all agent reasoning)\n")

    # Stage 1: load the engine. The import is deferred into the try-block so
    # a missing/broken dependency is reported cleanly rather than crashing.
    print("[1/3] Loading ForgeEngine with real LLM agents...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine

        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)
        print("  ✓ ForgeEngine loaded")

        if forge.newton.orchestrator:
            print(f"  ✓ Orchestrator ready: {forge.newton.orchestrator.available_adapters}")
            print(f"  ✓ GPU acceleration: {forge.newton.orchestrator.n_gpu_layers} layers")

    except Exception as e:
        print(f"  ✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False

    # Stage 2: build the evaluation harness around the engine. The harness is
    # constructed for its validation side effects; the debate below drives the
    # engine directly.
    print("\n[2/3] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print("  ✓ Harness ready\n")
    except Exception as e:
        print(f"  ✗ ERROR: {e}")
        return False

    # Stage 3: run the selected questions with full debate output.
    print("[3/3] Running question with full real-time reasoning output...\n")
    print("=" * 100)

    try:
        test_questions = EVALUATION_TEST_SUITE[:num_questions]

        for i, question in enumerate(test_questions, start=1):
            print(f"\n{'=' * 100}")
            print(f"QUESTION {i}: {question.query}")
            print(f"Category: {question.category} | Difficulty: {question.difficulty}")
            print(f"Expected perspectives: {', '.join(question.expected_perspectives)}")
            print(f"{'=' * 100}\n")

            print("[RUNNING DEBATE]\n")

            result = forge.forge_with_debate(question.query)

            # The third message (index 2), when present, carries the final
            # synthesis text — TODO confirm against forge_with_debate's result
            # schema.
            synthesis = ""
            if "messages" in result and len(result["messages"]) >= 3:
                synthesis = result["messages"][2].get("content", "")

            print(f"\n{'=' * 100}")
            print(f"[FINAL SYNTHESIS] ({len(synthesis)} characters)\n")
            print(synthesis)
            print(f"{'=' * 100}\n")

            # Summarize debate metadata; defaults keep missing keys harmless.
            metadata = result.get("metadata", {})
            print("[METADATA]")
            print(f"  Conflicts detected: {len(metadata.get('conflicts', []))}")
            print(f"  Gamma (coherence): {metadata.get('gamma', 0.5):.3f}")
            print(f"  Debate rounds: {metadata.get('debate_round', 0)}")

    except Exception as e:
        print(f"\n✗ ERROR during evaluation: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
|
|
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Verbose evaluation with real-time agent thinking")
    parser.add_argument("--questions", type=int, default=1, help="Number of questions to run (default: 1)")
    args = parser.parse_args()

    # Exit status mirrors the run result so shell/CI callers can check it.
    success = run_verbose_evaluation(args.questions)
    sys.exit(0 if success else 1)
|
|