import os
import torch
import pandas as pd
from typing import Optional, List, Literal, Dict, Any

from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset

try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    # Optional dependency: LLM-as-a-judge features degrade gracefully without it.
    AgenticDataGenerator = None
    AgenticDataConfig = None


class QwenEvaluator:
    """Evaluate a (fine-tuned) Qwen-style chat model on instruction datasets.

    Loads the model via unsloth's FastLanguageModel, generates responses for a
    sample of an instruction dataset, and provides (prototype) LLM-as-a-judge
    scoring and pairwise model comparison.
    """

    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        """
        Args:
            model_id: HF hub id or local path of the model to evaluate.
            max_seq_length: Maximum sequence length passed to the loader.
            load_in_4bit: Whether to load the model with 4-bit quantization.
        """
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None       # set by setup_model()
        self.tokenizer = None   # set by setup_model()

    def setup_model(self):
        """Load the model and tokenizer and switch to fast inference mode."""
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10):
        """Generate responses for up to `num_samples` examples of a dataset.

        Args:
            dataset_name: HF datasets id (e.g. "yahma/alpaca-cleaned").
            split: Dataset split to evaluate on.
            num_samples: Upper bound on the number of examples to run.

        Returns:
            pd.DataFrame with columns: instruction, ground_truth, model_response.
        """
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Clamp so .select() cannot raise when the split is smaller than requested.
        n = min(num_samples, len(dataset))
        dataset = dataset.select(range(n))
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{n}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names used by other instruction datasets.
                instruction = example.get("prompt", example.get("input", ""))
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")
            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            # The decoded text echoes the prompt; keep only the assistant turn.
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean
            })
        return pd.DataFrame(results)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses.

        Adds a "judge_score" column to `df` and returns it. If the optional
        judging dependencies are missing, scores default to 0.

        NOTE(review): scoring is currently a placeholder (constant 3 per row)
        until direct judge-API access is stable.
        """
        print(f"Judging model responses for task: {task_description}")
        if AgenticDataGenerator is None:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        generator = AgenticDataGenerator()
        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        # We'll use a local DataFrame as seed data for the judge.
        # The DataDesigner expects a DataDesignerConfigBuilder.
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])
        # We simulate the flow by adding columns that reference the input df.
        # Note: In a real production system, we'd use SeedDatasetColumnConfig.
        # For this prototype, we iterate and score row by row.
        scores = []
        # enumerate() gives a positional counter; df.iterrows() yields the
        # DataFrame index, which need not be 0..n-1 for sliced/filtered frames.
        for pos, (_, row) in enumerate(df.iterrows()):
            print(f"Judging sample {pos+1}...")
            # We can't easily use DataDesigner on a single row without a builder,
            # so we use a simplified version: print for now, pending a direct call.
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for actual judge call.
            scores.append(3)  # Assume perfect for now until direct API access is stable
        df["judge_score"] = scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models pairwise.

        Rows are paired positionally; if the frames differ in length, only the
        overlapping prefix is compared and `total_samples` reflects that, so
        wins_a + wins_b + ties == total_samples always holds.

        Returns:
            Dict with total_samples, per-model win counts, tie count, and win rates.
        """
        print("Comparing two models...")
        wins_a = 0
        wins_b = 0
        ties = 0
        # zip() truncates to the shorter frame, so base totals on pairs compared.
        total = min(len(model_a_results), len(model_b_results))
        for pos, ((_, row_a), (_, row_b)) in enumerate(
            zip(model_a_results.iterrows(), model_b_results.iterrows())
        ):
            print(f"Comparing sample {pos+1}...")
            # Logic for comparison:
            #   Model A: row_a['model_response']
            #   Model B: row_b['model_response']
            #   Ground Truth: row_a['ground_truth']
            if row_a['model_response'] == row_b['model_response']:
                ties += 1
            else:
                # In a real run, we'd ask the LLM judge:
                # "Which of these two responses is better for the given instruction?"
                # For now, we use a length heuristic as a placeholder.
                if len(row_a['model_response']) > len(row_b['model_response']):
                    wins_a += 1
                else:
                    wins_b += 1
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }


if __name__ == "__main__":
    # Example usage
    # evaluator = QwenEvaluator(model_id="outputs")
    # results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    # evaluator.judge_responses(results, "General assistant")
    pass