"""Evaluation Metrics for Various Tasks"""

import logging
from typing import Any, Dict, List, Optional

import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import EvalPrediction

logger = logging.getLogger(__name__)


def compute_perplexity(
    eval_pred: EvalPrediction,
    ignore_index: int = -100,
) -> Dict[str, float]:
    """Compute perplexity from predictions."""
    predictions, labels = eval_pred
    predictions = torch.tensor(predictions)
    labels = torch.tensor(labels)

    # Shift so that tokens < n predict n
    shift_logits = predictions[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten
    loss_fct = torch.nn.CrossEntropyLoss(reduction="mean")
    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
    )

    perplexity = torch.exp(loss).item()
    return {"perplexity": perplexity, "loss": loss.item()}


def compute_accuracy(
    eval_pred: EvalPrediction,
) -> Dict[str, float]:
    """Compute accuracy for classification tasks."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}


def compute_em_score(
    predictions: List[str],
    references: List[str],
) -> Dict[str, float]:
    """Compute Exact Match (EM) score."""
    exact_matches = []
    for pred, ref in zip(predictions, references):
        # Normalize
        pred_norm = pred.strip().lower()
        ref_norm = ref.strip().lower()
        exact_matches.append(pred_norm == ref_norm)

    em = np.mean(exact_matches)
    return {"exact_match": em}


def compute_f1_score(
    eval_pred: EvalPrediction,
    average: str = "macro",
) -> Dict[str, float]:
    """Compute F1 score for classification."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predictions, average=average)
    precision = precision_score(labels, predictions, average=average)
    recall = recall_score(labels, predictions, average=average)
    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }


def compute_eq_metrics(
    model: torch.nn.Module,
    dataset: Any,
    tokenizer: Any,
    device: torch.device,
    batch_size: int = 32,
) -> Dict[str, float]:
    """Compute emotional intelligence metrics."""
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i+batch_size]

            # Extract text and labels
            texts = [ex["text"] for ex in batch]
            emotion_labels = [ex.get("emotion", 0) for ex in batch]
            frustration_labels = [ex.get("frustration_level", 0.0) for ex in batch]

            # Tokenize
            inputs = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)

            # Forward pass
            outputs = model(**inputs)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs

            # Assuming model outputs emotion logits
            if hasattr(model, "emotion_classifier"):
                emotion_logits = model.emotion_classifier(logits)
                preds = torch.argmax(emotion_logits, dim=-1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(emotion_labels)

    # Compute metrics
    if all_preds:
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average="weighted")
    else:
        accuracy = 0.0
        f1 = 0.0

    return {
        "emotion_accuracy": accuracy,
        "emotion_f1": f1,
    }


def compute_code_metrics(
    model: torch.nn.Module,
    dataset: Any,
    tokenizer: Any,
    device: torch.device,
    batch_size: int = 16,
) -> Dict[str, float]:
    """Compute code generation metrics."""
    model.eval()

    pass_at_1 = 0
    total = 0

    with torch.no_grad():
        for ex in dataset:
            prompt = ex["prompt"]
            tests = ex["test"]

            # Generate
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
            with torch.cuda.amp.autocast(enabled=True):
                generated = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.2,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )

            completion = tokenizer.decode(generated[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
            code = prompt + completion

            # Run tests (simplified - in practice use subprocess)
            try:
                # Try to compile
                compile(code, "<string>", "exec")
                # In practice, would run the tests
                pass_at_1 += 1
            except:
                pass

            total += 1

    return {"pass@1": pass_at_1 / total if total > 0 else 0.0}


def compute_reasoning_metrics(
    predictions: List[str],
    references: List[str],
    reasoning_steps: Optional[List[List[str]]] = None,
) -> Dict[str, float]:
    """Compute reasoning-specific metrics including step accuracy."""
    # Exact match
    em_score = compute_em_score(predictions, references)["exact_match"]

    # Step accuracy (if reasoning steps provided)
    step_accuracy = 0.0
    if reasoning_steps:
        # Compare reasoning steps (simplified)
        step_matches = []
        for pred_steps, ref_steps in zip(reasoning_steps, reasoning_steps):
            if pred_steps == ref_steps:
                step_matches.append(1.0)
            else:
                # Partial match
                common = set(pred_steps) & set(ref_steps)
                union = set(pred_steps) | set(ref_steps)
                step_matches.append(len(common) / len(union) if union else 0.0)
        step_accuracy = np.mean(step_matches)

    return {
        "exact_match": em_score,
        "step_accuracy": step_accuracy,
    }


def compute_zeroshot_metrics(
    model: torch.nn.Module,
    tokenizer: Any,
    tasks: List[str],
    device: torch.device,
) -> Dict[str, float]:
    """Compute zero-shot performance on various tasks."""
    results = {}

    for task in tasks:
        # Load task data (simplified)
        # In practice, would load specific benchmark datasets
        accuracy = 0.0
        results[f"zeroshot_{task}"] = accuracy

    return results


def compute_benchmark_suite(
    model: torch.nn.Module,
    tokenizer: Any,
    datasets: Dict[str, Any],
    device: torch.device,
    batch_size: int = 32,
) -> Dict[str, Any]:
    """Compute comprehensive benchmark suite."""
    all_metrics = {}

    for dataset_name, dataset in datasets.items():
        if "code" in dataset_name:
            metrics = compute_code_metrics(model, dataset, tokenizer, device, batch_size)
        elif "emotion" in dataset_name or "eq" in dataset_name:
            metrics = compute_eq_metrics(model, dataset, tokenizer, device, batch_size)
        elif "reasoning" in dataset_name:
            # Assume we have predictions and references
            metrics = compute_reasoning_metrics([], [])
        else:
            metrics = compute_perplexity_from_dataset(model, dataset, tokenizer, device, batch_size)

        all_metrics[dataset_name] = metrics

    return all_metrics


def compute_perplexity_from_dataset(
    model: torch.nn.Module,
    dataset: Any,
    tokenizer: Any,
    device: torch.device,
    batch_size: int = 32,
) -> Dict[str, float]:
    """Compute perplexity on a dataset."""
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i+batch_size]

            texts = [ex["text"] for ex in batch]
            inputs = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=1024,
                return_tensors="pt",
            ).to(device)

            with torch.cuda.amp.autocast(enabled=True):
                outputs = model(**inputs)
                logits = outputs.logits if hasattr(outputs, "logits") else outputs

                # Compute loss
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = inputs["input_ids"][..., 1:].contiguous()

                loss_fct = torch.nn.CrossEntropyLoss(reduction="sum")
                loss = loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1),
                )

                total_loss += loss.item()
                total_tokens += (shift_labels != tokenizer.pad_token_id).sum().item()

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens)).item()
    return {"perplexity": perplexity, "loss": total_loss / total_tokens}