"""Fix 7: TDK-based FOREIGN word detection."""

from __future__ import annotations

import json
import os
from pathlib import Path

# Per-user cache directory for the downloaded TDK word list. It is created at
# import time (missing parents included) so later writes can rely on it.
_CACHE_DIR = Path.home().joinpath(".cache", "turk_tokenizer")
_CACHE_DIR.mkdir(exist_ok=True, parents=True)
# Exposed as a plain string path rather than a Path object.
TDK_CACHE_FILE = str(_CACHE_DIR.joinpath("tdk_words.txt"))

# Turkish letters outside the ASCII range, lower- and upper-case.
TR_CHARS = {*"çğışöüÇĞİŞÖÜ"}

# In-memory copy of the word list; None until load_tdk_words() fills it.
_TDK_WORDS: set | None = None


def load_tdk_words() -> set:
    """Return the TDK word set, loading it on first use.

    The result is memoised in the module-level ``_TDK_WORDS``; later calls
    return that same set. When the cache file is missing, a download is
    attempted first. If the download yields no words, an empty set is stored
    and returned. Otherwise the cache file is read one word per line, each
    line trimmed and lower-cased, with blank lines skipped.
    """
    global _TDK_WORDS
    if _TDK_WORDS is None:
        cache_missing = not os.path.exists(TDK_CACHE_FILE)
        if cache_missing:
            print("[TurkTokenizer] TDK word list not found — downloading automatically...")

        if cache_missing and not download_tdk_words():
            # Download produced nothing: remember the empty set so we do not
            # retry on every call.
            _TDK_WORDS = set()
        else:
            with open(TDK_CACHE_FILE, encoding="utf-8") as cache:
                trimmed = (raw.strip() for raw in cache)
                _TDK_WORDS = {entry.lower() for entry in trimmed if entry}

    return _TDK_WORDS


def download_tdk_words() -> list[str]:
    """Download ~76K Turkish words from the TDK API and cache them.

    Fetches the autocomplete JSON from sozluk.gov.tr and collects the
    ``"madde"`` field of every entry (trimmed and lower-cased). The unique,
    sorted words are written to ``TDK_CACHE_FILE``, one per line.

    The list is first written to a temporary sibling file and then moved into
    place with ``os.replace``, so an interrupted write cannot leave a
    truncated cache that later sessions would load as if it were complete.
    An empty result is never cached, because an empty cache file would
    otherwise suppress every future download attempt.

    Returns:
        The sorted list of words, or ``[]`` when the download, the parsing or
        the cache write failed, or when no words were received. Exceptions
        are caught and reported on stdout.
    """
    try:
        import urllib.request  # noqa: PLC0415

        url = "https://sozluk.gov.tr/autocomplete.json"
        with urllib.request.urlopen(url, timeout=30) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        entries = (item.get("madde", "").strip().lower() for item in data if item.get("madde"))
        # Whitespace-only entries collapse to "" after strip(); drop them.
        words = sorted({entry for entry in entries if entry})
        if not words:
            raise ValueError("response contained no words")

        # Write-then-rename keeps the cache file either absent or complete.
        tmp_file = f"{TDK_CACHE_FILE}.{os.getpid()}.tmp"
        try:
            with open(tmp_file, "w", encoding="utf-8") as f:
                f.write("\n".join(words))
            os.replace(tmp_file, TDK_CACHE_FILE)
        except Exception:
            try:
                os.remove(tmp_file)
            except OSError:
                pass  # temp file was never created or is already gone
            raise

        print(f"[TurkTokenizer] TDK: {len(words):,} words cached at {TDK_CACHE_FILE}")
        return words

    except Exception as exc:  # noqa: BLE001
        print(f"[TurkTokenizer] TDK download failed: {exc}")
        print("  FOREIGN detection will be disabled for this session.")
        return []


def is_foreign_word(word: str) -> bool:
    """Return True if *word* is considered foreign (not Turkish).

    Surrounding whitespace is ignored. A word is treated as native (returns
    False) when any of the following holds:

    * it has fewer than two characters after trimming;
    * it contains a character from ``TR_CHARS``;
    * its lower-cased form is present in the set from ``load_tdk_words()``.

    Anything else is reported as foreign.
    """
    stripped = word.strip()
    if len(stripped) < 2:
        return False
    # Check Turkish letters on the original casing: str.lower() turns "İ"
    # (U+0130) into "i" + U+0307, neither of which is in TR_CHARS, so a
    # check on the lower-cased text would miss words marked only by "İ".
    if any(c in TR_CHARS for c in stripped):
        return False
    return stripped.lower() not in load_tdk_words()


def reclassify_foreign_words(tokens: list[dict]) -> list[dict]:
    """Promote word-initial foreign BPE tokens to ROOT.

    A token is rewritten only if its ``"type"`` is ``"BPE"``, its ``"token"``
    text starts with whitespace (which marks it as word-initial), and the
    text after that whitespace is judged foreign by ``is_foreign_word``.
    Rewritten tokens are copies with ``"type"`` set to ``"ROOT"`` plus the
    flags ``"_foreign": True`` and ``"_tdk": False``; all other tokens are
    passed through as the same dict objects.

    If the TDK word set is empty, *tokens* is returned unchanged (the same
    list object); otherwise a new list is returned.
    """
    if not load_tdk_words():
        return tokens

    def _relabel(entry: dict) -> dict:
        # Only BPE pieces are candidates for reclassification.
        if entry["type"] != "BPE":
            return entry
        text = entry["token"]
        body = text.lstrip()
        # Leading whitespace is what marks a piece as word-initial.
        word_initial = text != body
        if word_initial and is_foreign_word(body):
            return {**entry, "type": "ROOT", "_foreign": True, "_tdk": False}
        return entry

    return [_relabel(entry) for entry in tokens]