"""Evaluation Metrics for Various Tasks""" import logging from typing import Any, Dict, List, Optional import numpy as np import torch from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from transformers import EvalPrediction logger = logging.getLogger(__name__) def compute_perplexity( eval_pred: EvalPrediction, ignore_index: int = -100, ) -> Dict[str, float]: """Compute perplexity from predictions.""" predictions, labels = eval_pred predictions = torch.tensor(predictions) labels = torch.tensor(labels) # Shift so that tokens < n predict n shift_logits = predictions[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten loss_fct = torch.nn.CrossEntropyLoss(reduction="mean") loss = loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ) perplexity = torch.exp(loss).item() return {"perplexity": perplexity, "loss": loss.item()} def compute_accuracy( eval_pred: EvalPrediction, ) -> Dict[str, float]: """Compute accuracy for classification tasks.""" predictions, labels = eval_pred predictions = np.argmax(predictions, axis=1) acc = accuracy_score(labels, predictions) return {"accuracy": acc} def compute_em_score( predictions: List[str], references: List[str], ) -> Dict[str, float]: """Compute Exact Match (EM) score.""" exact_matches = [] for pred, ref in zip(predictions, references): # Normalize pred_norm = pred.strip().lower() ref_norm = ref.strip().lower() exact_matches.append(pred_norm == ref_norm) em = np.mean(exact_matches) return {"exact_match": em} def compute_f1_score( eval_pred: EvalPrediction, average: str = "macro", ) -> Dict[str, float]: """Compute F1 score for classification.""" predictions, labels = eval_pred predictions = np.argmax(predictions, axis=1) f1 = f1_score(labels, predictions, average=average) precision = precision_score(labels, predictions, average=average) recall = recall_score(labels, predictions, average=average) return { "f1": f1, "precision": precision, "recall": recall, } def compute_eq_metrics( model: torch.nn.Module, dataset: Any, tokenizer: Any, device: torch.device, batch_size: int = 32, ) -> Dict[str, float]: """Compute emotional intelligence metrics.""" model.eval() all_preds = [] all_labels = [] with torch.no_grad(): for i in range(0, len(dataset), batch_size): batch = dataset[i:i+batch_size] # Extract text and labels texts = [ex["text"] for ex in batch] emotion_labels = [ex.get("emotion", 0) for ex in batch] frustration_labels = [ex.get("frustration_level", 0.0) for ex in batch] # Tokenize inputs = tokenizer( texts, padding=True, truncation=True, max_length=512, return_tensors="pt", ).to(device) # Forward pass outputs = model(**inputs) logits = outputs.logits if hasattr(outputs, "logits") else outputs # Assuming model outputs emotion logits if hasattr(model, "emotion_classifier"): emotion_logits = model.emotion_classifier(logits) preds = torch.argmax(emotion_logits, dim=-1) all_preds.extend(preds.cpu().numpy()) all_labels.extend(emotion_labels) # Compute metrics if all_preds: accuracy = accuracy_score(all_labels, all_preds) f1 = f1_score(all_labels, all_preds, average="weighted") else: accuracy = 0.0 f1 = 0.0 return { "emotion_accuracy": accuracy, "emotion_f1": f1, } def compute_code_metrics( model: torch.nn.Module, dataset: Any, tokenizer: Any, device: torch.device, batch_size: int = 16, ) -> Dict[str, float]: """Compute code generation metrics.""" model.eval() pass_at_1 = 0 total = 0 with torch.no_grad(): for ex in dataset: prompt = ex["prompt"] tests = ex["test"] # Generate 
def compute_code_metrics(
    model: torch.nn.Module,
    dataset: Any,
    tokenizer: Any,
    device: torch.device,
    batch_size: int = 16,
) -> Dict[str, float]:
    """Compute code-generation metrics (a simplified pass@1)."""
    model.eval()
    pass_at_1 = 0
    total = 0

    with torch.no_grad():
        for ex in dataset:
            prompt = ex["prompt"]
            tests = ex["test"]  # unit tests; unused by the compile-only check below

            # Generate a greedy completion (with do_sample=False, a
            # temperature setting would have no effect, so none is passed)
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024,
            ).to(device)
            with torch.autocast(device_type=device.type, enabled=True):
                generated = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
            completion = tokenizer.decode(
                generated[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True,
            )
            code = prompt + completion

            # Run tests (simplified: only checks that the code compiles;
            # in practice, execute ``tests`` in a sandboxed subprocess)
            try:
                compile(code, "<string>", "exec")
                pass_at_1 += 1
            except Exception:
                pass
            total += 1

    return {"pass@1": pass_at_1 / total if total > 0 else 0.0}
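
# A minimal sketch (an assumption, not the project's harness) of the
# subprocess-based test execution the comment in ``compute_code_metrics``
# refers to. The helper name and timeout are hypothetical; a real harness
# also needs OS-level sandboxing, which this sketch does not provide.
def _run_tests_in_subprocess(code: str, tests: str, timeout: float = 10.0) -> bool:
    """Return True if ``code`` followed by ``tests`` exits cleanly."""
    import os
    import subprocess
    import sys
    import tempfile

    # Write the candidate program and its tests to a temporary script
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code + "\n\n" + tests)
        path = f.name
    try:
        # Run in a fresh interpreter; a non-zero exit code means failure
        result = subprocess.run(
            [sys.executable, path],
            timeout=timeout,
            capture_output=True,
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        os.unlink(path)
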
def compute_reasoning_metrics(
    predictions: List[str],
    references: List[str],
    predicted_steps: Optional[List[List[str]]] = None,
    reference_steps: Optional[List[List[str]]] = None,
) -> Dict[str, float]:
    """Compute reasoning-specific metrics, including step accuracy."""
    # Exact match on final answers
    em_score = compute_em_score(predictions, references)["exact_match"]

    # Step accuracy (if both predicted and reference steps are provided)
    step_accuracy = 0.0
    if predicted_steps and reference_steps:
        step_matches = []
        for pred_steps, ref_steps in zip(predicted_steps, reference_steps):
            if pred_steps == ref_steps:
                step_matches.append(1.0)
            else:
                # Partial credit: Jaccard overlap of the step sets
                common = set(pred_steps) & set(ref_steps)
                union = set(pred_steps) | set(ref_steps)
                step_matches.append(len(common) / len(union) if union else 0.0)
        step_accuracy = float(np.mean(step_matches))

    return {
        "exact_match": em_score,
        "step_accuracy": step_accuracy,
    }


def compute_zeroshot_metrics(
    model: torch.nn.Module,
    tokenizer: Any,
    tasks: List[str],
    device: torch.device,
) -> Dict[str, float]:
    """Compute zero-shot performance on various tasks."""
    results = {}

    for task in tasks:
        # Load task data (simplified placeholder;
        # in practice, load the specific benchmark dataset for each task)
        accuracy = 0.0
        results[f"zeroshot_{task}"] = accuracy

    return results


def compute_benchmark_suite(
    model: torch.nn.Module,
    tokenizer: Any,
    datasets: Dict[str, Any],
    device: torch.device,
    batch_size: int = 32,
) -> Dict[str, Any]:
    """Compute a comprehensive benchmark suite, dispatching on dataset name."""
    all_metrics = {}

    for dataset_name, dataset in datasets.items():
        if "code" in dataset_name:
            metrics = compute_code_metrics(model, dataset, tokenizer, device, batch_size)
        elif "emotion" in dataset_name or "eq" in dataset_name:
            metrics = compute_eq_metrics(model, dataset, tokenizer, device, batch_size)
        elif "reasoning" in dataset_name:
            # Placeholder: reasoning metrics require decoded predictions
            # and references, which are not produced here
            metrics = compute_reasoning_metrics([], [])
        else:
            metrics = compute_perplexity_from_dataset(model, dataset, tokenizer, device, batch_size)
        all_metrics[dataset_name] = metrics

    return all_metrics


def compute_perplexity_from_dataset(
    model: torch.nn.Module,
    dataset: Any,
    tokenizer: Any,
    device: torch.device,
    batch_size: int = 32,
) -> Dict[str, float]:
    """Compute perplexity on a dataset."""
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i + batch_size]
            texts = [ex["text"] for ex in batch]

            inputs = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=1024,
                return_tensors="pt",
            ).to(device)

            with torch.autocast(device_type=device.type, enabled=True):
                outputs = model(**inputs)
                logits = outputs.logits if hasattr(outputs, "logits") else outputs

            # Sum the loss over non-padding tokens only, so the numerator
            # matches the token count used in the denominator below
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = inputs["input_ids"][..., 1:].contiguous()

            loss_fct = torch.nn.CrossEntropyLoss(
                reduction="sum",
                ignore_index=tokenizer.pad_token_id,
            )
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

            total_loss += loss.item()
            total_tokens += (shift_labels != tokenizer.pad_token_id).sum().item()

    avg_loss = total_loss / max(total_tokens, 1)
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    return {"perplexity": perplexity, "loss": avg_loss}
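
# A quick self-check (illustrative; the inputs are made-up examples, not a
# real benchmark) exercising the string-level metrics, which need no model.
if __name__ == "__main__":
    preds = ["Paris", "4", "blue"]
    refs = ["paris", "4", "green"]
    print(compute_em_score(preds, refs))  # exact_match = 2/3
    print(
        compute_reasoning_metrics(
            preds,
            refs,
            predicted_steps=[["a", "b"], ["x"], ["p", "q"]],
            reference_steps=[["a", "b"], ["x", "y"], ["r"]],
        )
    )  # step_accuracy = mean(1.0, 0.5, 0.0) = 0.5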