| import os |
| import torch |
| import pandas as pd |
| from typing import Optional, List, Literal, Dict, Any |
| from unsloth import FastLanguageModel |
| from datasets import load_dataset, Dataset |
| try: |
| from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig |
| except ImportError: |
| AgenticDataGenerator = None |
| AgenticDataConfig = None |
|
|
class QwenEvaluator:
    """Evaluate a Qwen-style chat model loaded via Unsloth.

    Generates responses for instruction datasets and provides judging /
    comparison helpers. NOTE(review): the LLM-as-a-judge scoring paths are
    placeholder scaffolding — they print and emit fixed scores; the real
    judge calls are not wired up yet.
    """

    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        """Store configuration; the model itself is loaded lazily by ``setup_model``.

        Args:
            model_id: Hugging Face model id or local path.
            max_seq_length: Maximum sequence length passed to Unsloth.
            load_in_4bit: Whether to load with 4-bit quantization.
        """
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None       # populated by setup_model()
        self.tokenizer = None   # populated by setup_model()

    def setup_model(self):
        """Load the model/tokenizer and switch the model to inference mode."""
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        # Enable Unsloth's faster inference path.
        FastLanguageModel.for_inference(self.model)

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10) -> pd.DataFrame:
        """Generate responses for up to ``num_samples`` examples of a dataset.

        Args:
            dataset_name: Hugging Face dataset identifier.
            split: Dataset split to evaluate.
            num_samples: Upper bound on examples evaluated (clamped to the
                dataset size).

        Returns:
            DataFrame with columns ``instruction``, ``ground_truth`` and
            ``model_response``.

        Raises:
            RuntimeError: If ``setup_model`` has not been called yet.
        """
        if self.model is None or self.tokenizer is None:
            # Fail early with a clear message instead of an opaque TypeError
            # from calling a None tokenizer.
            raise RuntimeError("Call setup_model() before evaluate_on_dataset().")

        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Clamp so a dataset smaller than num_samples does not make
        # .select(range(...)) raise an out-of-range error.
        num_samples = min(num_samples, len(dataset))
        dataset = dataset.select(range(num_samples))

        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i + 1}/{num_samples}")
            # Fall back across the common instruction-style column names.
            instruction = example.get("instruction") or example.get("prompt", example.get("input", ""))

            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")

            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]

            # Strip the echoed prompt and the end-of-turn marker from the decode.
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean
            })

        return pd.DataFrame(results)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses.

        Adds a ``judge_score`` column to ``df`` in place and returns it. When
        either optional judging dependency is unavailable, every score is 0.

        NOTE(review): the judge call itself is not implemented — every sample
        currently receives the placeholder score 3.
        """
        print(f"Judging model responses for task: {task_description}")

        if AgenticDataGenerator is None:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        # Instantiate only after BOTH dependency checks pass, so the
        # unavailable-dependency path does no wasted construction work.
        generator = AgenticDataGenerator()

        # Placeholder judge configuration — not yet used for real scoring.
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])

        scores = []
        # enumerate() gives a reliable positional counter; iterrows() yields
        # index LABELS, which are not positions for a non-default index.
        for pos, (_, row) in enumerate(df.iterrows()):
            print(f"Judging sample {pos + 1}...")
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # TODO: replace with a real judge call via `generator`/`builder`.
            scores.append(3)

        df["judge_score"] = scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using LLM-as-a-judge.

        NOTE(review): the judge is not wired up yet; the current heuristic
        counts identical responses as ties and otherwise prefers the longer
        response. Replace with a real pairwise judge call.

        Returns:
            Dict with win/tie counts and win rates over the compared pairs.
        """
        print("Comparing two models...")

        wins_a = 0
        wins_b = 0
        ties = 0

        # zip() stops at the shorter frame, so report only the pairs that
        # were actually compared (the original used len(model_a_results),
        # which disagreed with the counts when the frames differ in length).
        total = min(len(model_a_results), len(model_b_results))

        for pos, ((_, row_a), (_, row_b)) in enumerate(
            zip(model_a_results.iterrows(), model_b_results.iterrows())
        ):
            print(f"Comparing sample {pos + 1}...")
            resp_a = row_a['model_response']
            resp_b = row_b['model_response']
            if resp_a == resp_b:
                ties += 1
            elif len(resp_a) > len(resp_b):
                wins_a += 1
            else:
                wins_b += 1

        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
|
|
# Script entry point intentionally left empty: this module is meant to be
# imported and driven by external code (an evaluation pipeline or notebook).
if __name__ == "__main__":
    pass
|
|