| import argparse |
| import os |
| import sys |
| import subprocess |
|
|
| |
# Make this script's own directory importable so the local `utils` package
# resolves regardless of the caller's working directory.
sys.path.insert(0, os.path.dirname(__file__))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

# Benchmark names to evaluate; derived from the calculators registry so the
# two can never drift out of sync.
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
|
|
def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the evaluation script for a single benchmark category.

    Expects to be invoked with the repository root as the current working
    directory (the script path is relative).

    Args:
        benchmark_name: Benchmark sub-directory name under
            evaluation/benchmarks/ that contains an eval.py.
        model_path: Model checkpoint path, forwarded verbatim to eval.py.

    Returns:
        The score printed by eval.py as a float, or None if the script is
        missing, exits non-zero, or prints something that is not a number.
    """
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")

    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None

    # Parse in a separate try block: `result` is guaranteed to be bound here,
    # and the stripped output is computed once for both parsing and reporting.
    raw_output = result.stdout.strip()
    try:
        return float(raw_output)
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{raw_output}'", file=sys.stderr)
        return None
|
|
def calculate_overall_score(benchmark_scores):
    """Combine per-benchmark scores into a single weighted overall score.

    Args:
        benchmark_scores: Mapping of benchmark name -> score (float), with
            None marking benchmarks that failed to produce a score.

    Returns:
        The weighted mean of all non-None scores, rounded to 3 decimal
        places, or None when no benchmark produced a score.
    """
    # Relative importance per benchmark; names not listed here default to 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1
    }

    weighted_sum = 0.0
    total_weight = 0.0
    for benchmark, score in benchmark_scores.items():
        if score is None:
            continue
        weight = weights.get(benchmark, 1.0)
        weighted_sum += score * weight
        total_weight += weight

    # total_weight == 0 already covers both an empty mapping and all-None
    # scores, so no separate up-front emptiness check is needed.
    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
|
|
def main():
    """
    Run comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score to stdout for compatibility with the
    existing evaluation system; per-benchmark scores go to stderr. Exits
    with status 1 if the model directory does not exist or no benchmark
    produces a usable score.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path BEFORE changing directory: a relative path
    # validated against the caller's cwd would point at the wrong location
    # once we chdir to the repository root below.
    model_path = os.path.abspath(args.model_path)

    # Benchmark scripts are addressed relative to the repository root (the
    # parent of this script's directory), so run the loop from there.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}
    try:
        for benchmark in BENCHMARK_CATEGORIES:
            score = run_benchmark_evaluation(benchmark, model_path)
            benchmark_scores[benchmark] = score
            if score is not None:
                print(f"{benchmark}: {score}", file=sys.stderr)

        overall_score = calculate_overall_score(benchmark_scores)
    finally:
        # Always restore the caller's working directory, even if a benchmark
        # run raises unexpectedly.
        os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    print(overall_score)
|
|
# Run only when executed as a script, so importing this module has no
# side effects beyond its definitions.
if __name__ == "__main__":
    main()