"""Fix 7: TDK-based FOREIGN word detection.""" from __future__ import annotations import json import os from pathlib import Path _CACHE_DIR = Path.home() / ".cache" / "turk_tokenizer" _CACHE_DIR.mkdir(parents=True, exist_ok=True) TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt") TR_CHARS = set("çğışöüÇĞİŞÖÜ") _TDK_WORDS: set | None = None def load_tdk_words() -> set: global _TDK_WORDS if _TDK_WORDS is not None: return _TDK_WORDS if not os.path.exists(TDK_CACHE_FILE): print("[TurkTokenizer] TDK word list not found — downloading automatically...") words = download_tdk_words() if not words: _TDK_WORDS = set() return _TDK_WORDS with open(TDK_CACHE_FILE, encoding="utf-8") as f: _TDK_WORDS = {line.strip().lower() for line in f if line.strip()} return _TDK_WORDS def download_tdk_words() -> list[str]: """Download ~76K Turkish words from the TDK API and cache them.""" try: import urllib.request # noqa: PLC0415 url = "https://sozluk.gov.tr/autocomplete.json" with urllib.request.urlopen(url, timeout=30) as resp: data = json.loads(resp.read().decode("utf-8")) words = sorted({item.get("madde", "").strip().lower() for item in data if item.get("madde")}) with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f: f.write("\n".join(words)) print(f"[TurkTokenizer] TDK: {len(words):,} words cached at {TDK_CACHE_FILE}") return words except Exception as exc: # noqa: BLE001 print(f"[TurkTokenizer] TDK download failed: {exc}") print(" FOREIGN detection will be disabled for this session.") return [] def is_foreign_word(word: str) -> bool: w = word.strip().lower() if not w or len(w) < 2: return False if any(c in TR_CHARS for c in w): return False return w not in load_tdk_words() def reclassify_foreign_words(tokens: list[dict]) -> list[dict]: """Reclassify word-initial BPE tokens as ROOT if they are foreign words.""" tdk = load_tdk_words() if not tdk: return tokens result: list[dict] = [] for tok in tokens: if tok["type"] != "BPE": result.append(tok) continue raw = tok["token"] stripped = raw.lstrip() if raw == stripped: # no leading space → not word-initial result.append(tok) continue if is_foreign_word(stripped): result.append({**tok, "type": "ROOT", "_foreign": True, "_tdk": False}) else: result.append(tok) return result