"""
Navigator — DQN-based Next Resource Recommender
================================================
Loads the pre-trained DQN model from ../navigators/dqn_model.pth (resolved
relative to this file's directory) and uses it to recommend the next best
resource for a student to visit.

Model architecture (inferred from .pth weights):
    fc1: Linear(18, 128)   — input is an 18-dim state vector
    fc2: Linear(128, 128)  — hidden layer
    fc3: Linear(128, 18)   — output is Q-values over 18 topic modules
    (ReLU activations between layers)

State vector (18-dim): one value per topic module, representing the student's
    assimilation score (from the polyline) for that module.

Output: the module with the best combined score (normalized DQN Q-value blended
    with a sequential-progression score) among modules that still have
    unvisited resources.
    The unvisited resource from that module with the highest reward is returned.
"""

import os
import numpy as np

# ──────────────────────────────────────────────
# Model Definition — must match training architecture
# ──────────────────────────────────────────────
# Checkpoint location: <directory of this file>/../navigators/dqn_model.pth
_MODEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'navigators', 'dqn_model.pth')

# Loaded network; stays None when torch is missing or the checkpoint fails to load.
_dqn_net = None
# Load status. The initial "unavailable" value is overwritten by the try/except
# below with "dqn" (loaded) or "fallback" (any Exception during import/load).
_dqn_mode = "unavailable"

# Best-effort load at import time: any failure (torch not installed, missing
# file, mismatched weights) is reported on stdout and leaves _dqn_net as None.
try:
    import torch
    import torch.nn as nn

    class DQNNet(nn.Module):
        """Three-layer fully connected Q-network: fc1 -> ReLU -> fc2 -> ReLU -> fc3."""

        def __init__(self, input_dim=18, hidden_dim=128, output_dim=18):
            """Create the three linear layers with the given layer widths."""
            super().__init__()
            self.fc1 = nn.Linear(input_dim, hidden_dim)
            self.fc2 = nn.Linear(hidden_dim, hidden_dim)
            self.fc3 = nn.Linear(hidden_dim, output_dim)

        def forward(self, x):
            """Return the raw fc3 output (no activation on the last layer)."""
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            return self.fc3(x)

    # Layer sizes must match the saved weights or load_state_dict() raises.
    _net = DQNNet(input_dim=18, hidden_dim=128, output_dim=18)
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, which can execute code from the file. The result is passed
    # straight to load_state_dict(), so weights_only=True is presumably
    # sufficient — confirm against the actual checkpoint before switching.
    state_dict = torch.load(_MODEL_PATH, map_location='cpu', weights_only=False)
    _net.load_state_dict(state_dict)
    _net.eval()  # evaluation mode; the network is only used for inference here
    _dqn_net = _net
    _dqn_mode = "dqn"
    print("DQN Navigator loaded successfully")

except Exception as e:
    # Deliberate catch-all: the module must stay importable without the model.
    print(f"DQN Navigator fallback mode (could not load model): {e}")
    _dqn_mode = "fallback"


# ──────────────────────────────────────────────
# Topic-to-module index mapping (matches nlp_api.py order)
# ──────────────────────────────────────────────
# Position in this list is the module index used by recommend_next(), both to
# index the DQN Q-value vector and to measure sequential distance.
# NOTE: the list has 19 entries while DQNNet is built with output_dim=18, so
# the last entry (index 18, "RLHF") has no Q-value; recommend_next() gives
# such modules a neutral DQN score of 0.5.
ORDERED_MODULES = [
    "Pre training objectives",
    "Pre trained models",
    "Fine tuning LLM",
    "Instruction tuning",
    "Incontext Learning",
    "Prompting methods",
    "Multiprompt Learning",
    "Prompt aware training methods",
    "Retrieval Methods",
    "Retrieval Augmented Generation",
    "Model Distillation",
    "Model Quantization",
    "Model Pruning",
    "Mixture of Experts Model",
    "Agentic AI",
    "Multimodal LLMs",
    "Vision Language Models",
    "Policy learning using DQN",
    "RLHF",
]


def _resource_reward(resource: dict) -> float:
    """Return the resource's ``reward``; missing or None ranks lowest (-inf)."""
    reward = resource.get('reward')
    return float('-inf') if reward is None else reward


def _resource_id_sort_key(resource: dict) -> tuple:
    """Sort key: integer-like ids first in numeric order, then other ids as text."""
    raw_id = resource['id']
    try:
        return (0, int(raw_id), '')
    except (TypeError, ValueError, OverflowError):
        return (1, 0, str(raw_id))


def recommend_next(visited_ids: list, module_scores: list, nlp_resources: list) -> dict:
    """
    Recommend the next unvisited resource by blending DQN and sequential scores.

    Every module that still has unvisited resources gets a combined score
    ``WEIGHT_SEQUENTIAL * seq + WEIGHT_DQN * dqn`` (0.05 and 0.95):
      - ``seq`` is ``1 / (1 + 0.5 * distance)``, where ``distance`` is how far
        the module's index in ORDERED_MODULES is from the index right after the
        highest-indexed visited module. Modules not in ORDERED_MODULES get 0.
      - ``dqn`` is the module's Q-value, min-max normalized over the candidate
        modules. It is a neutral 0.5 when the model is not loaded, inference
        fails, the module has no Q-value, or the candidate Q-values differ by
        at most 0.01.

    The first module with the highest combined score wins, and its unvisited
    resource with the highest ``reward`` is returned. If no module is selected
    (including when the winning module name is empty), the unvisited resource
    with the lowest id is returned instead.

    Args:
        visited_ids: ids of resources already visited; compared as stripped
            strings. None is treated as empty.
        module_scores: per-module state values fed to the DQN; padded with 0.5
            or truncated to 18 entries.
        nlp_resources: resource dicts; this function reads the keys 'id',
            'module', 'reward' and 'title'. None is treated as empty.

    Returns:
        dict with keys:
          - "resource": the chosen resource dict, or None if nothing is unvisited
          - "module": the chosen resource's module name (None if no resource)
          - "reason": "dqn" when model inference ran, "fallback" when inference
            raised or the lowest-id fallback was used, otherwise the module's
            load status
          - "q_values": raw Q-values from the model, or [] if not computed
    """
    STATE_DIM = 18
    NEUTRAL_SCORE = 0.5
    WEIGHT_SEQUENTIAL = 0.05
    WEIGHT_DQN = 0.95

    visited_ids = [] if visited_ids is None else visited_ids
    nlp_resources = [] if nlp_resources is None else nlp_resources

    visited_set = {str(v).strip() for v in visited_ids}
    unvisited = [r for r in nlp_resources if str(r['id']).strip() not in visited_set]

    print("\n[NAV DEBUG] --- recommend_next called ---")
    print(f"[NAV DEBUG] Total resources: {len(nlp_resources)}, Visited: {len(visited_ids)}, Unvisited: {len(unvisited)}")

    if not unvisited:
        print("[NAV DEBUG] No unvisited resources remaining!")
        return {"resource": None, "module": None, "reason": _dqn_mode, "q_values": []}

    # ── Build state vector (pad with neutral 0.5, truncate to model input) ──
    state = list(module_scores) if module_scores else []
    if len(state) < STATE_DIM:
        state.extend([NEUTRAL_SCORE] * (STATE_DIM - len(state)))
    state_arr = np.array(state[:STATE_DIM], dtype=np.float32)

    # ── Group unvisited resources by module (insertion order is kept) ──
    module_to_resources = {}
    for r in unvisited:
        module_to_resources.setdefault(r.get('module', ''), []).append(r)

    print(f"[NAV DEBUG] Unvisited modules ({len(module_to_resources)}): {list(module_to_resources.keys())}")

    # Name -> index map built once instead of repeated list.index() scans;
    # setdefault keeps the first occurrence, matching list.index().
    module_index = {}
    for i, name in enumerate(ORDERED_MODULES):
        module_index.setdefault(name, i)

    # ── Sequential scores: prefer the module right after the furthest visited ──
    visited_module_indices = set()
    for r in nlp_resources:
        if str(r['id']).strip() in visited_set:
            idx = module_index.get(r.get('module', ''))
            if idx is not None:
                visited_module_indices.add(idx)

    max_visited_idx = max(visited_module_indices) if visited_module_indices else -1
    print(f"[NAV DEBUG] Highest visited module index: {max_visited_idx} ({ORDERED_MODULES[max_visited_idx] if max_visited_idx >= 0 else 'none'})")

    next_expected_idx = max_visited_idx + 1
    sequential_scores = {
        name: 1.0 / (1.0 + abs(module_index[name] - next_expected_idx) * 0.5)
        for name in module_to_resources
        if name in module_index
    }

    # ── DQN scores (min-max normalized to 0-1 over candidate modules) ──
    q_values = []
    dqn_scores = {}
    reason = _dqn_mode
    if _dqn_net is not None:
        try:
            import torch
            with torch.no_grad():
                batch = torch.tensor(state_arr).unsqueeze(0)
                qs = _dqn_net(batch).squeeze(0).tolist()
            q_values = qs

            # ORDERED_MODULES can be longer than the model output; modules
            # without a Q-value are left out and fall back to the neutral score.
            num_q = len(qs)
            candidate_qs = {
                name: qs[module_index[name]]
                for name in module_to_resources
                if name in module_index and module_index[name] < num_q
            }

            normalized = {}
            if candidate_qs:
                q_min = min(candidate_qs.values())
                q_range = max(candidate_qs.values()) - q_min
                if q_range > 0.01:  # Meaningful differentiation
                    normalized = {
                        name: (q - q_min) / q_range
                        for name, q in candidate_qs.items()
                    }
                else:
                    # Q-values are too clustered, DQN can't differentiate
                    print(f"[NAV DEBUG] Q-values too clustered (range={q_range:.4f}), ignoring DQN scores")

            # Assign only after everything succeeded so a failure never leaves
            # a partially-filled score table behind.
            dqn_scores = normalized
            reason = "dqn"
        except Exception as e:
            print(f"[NAV DEBUG] DQN inference error: {e}")
            reason = "fallback"

    # ── Combined scoring (DQN-forward approach) ──
    best_module = None
    best_score = float('-inf')

    print(f"[NAV DEBUG] Module scores (seq={WEIGHT_SEQUENTIAL}, dqn={WEIGHT_DQN}):")
    for module_name in module_to_resources:
        seq = sequential_scores.get(module_name, 0.0)
        dqn = dqn_scores.get(module_name, NEUTRAL_SCORE)
        combined = WEIGHT_SEQUENTIAL * seq + WEIGHT_DQN * dqn

        idx_str = ""
        if module_name in module_index:
            idx_str = f" (idx={module_index[module_name]})"

        print(f"[NAV DEBUG]   '{module_name}'{idx_str}: seq={seq:.3f}, dqn={dqn:.3f}, combined={combined:.3f}")

        # Strict '>' keeps the first module on ties.
        if combined > best_score:
            best_score = combined
            best_module = module_name

    # An empty module name is treated like "no module chosen" and falls
    # through to the lowest-id fallback below.
    if best_module and best_module in module_to_resources:
        # max() returns the first resource with the highest reward.
        chosen = max(module_to_resources[best_module], key=_resource_reward)
        print(f"[NAV DEBUG] SUCCESS: Chose '{best_module}' -> '{chosen.get('title')}' (id={chosen['id']}, score={best_score:.3f})")
        return {
            "resource": chosen,
            "module": best_module,
            "reason": reason,
            "q_values": q_values
        }

    # ── Fallback: next sequential unvisited resource ──
    best = min(unvisited, key=_resource_id_sort_key)
    print(f"[NAV DEBUG] Fallback: '{best.get('title')}' (id={best['id']})")
    return {
        "resource": best,
        "module": best.get('module', ''),
        "reason": "fallback",
        "q_values": q_values
    }