""" evaluate_tokens.py — Token-level seqeval evaluation matching the Colab training metric. Unlike evaluate.py (which reconstructs text and runs the full NLP pipeline), this script feeds pre-tokenized CoNLL words directly to the model, ensuring the evaluation is identical to what Colab measured during training. Run from NLP-intelligence/: python eval/evaluate_tokens.py python eval/evaluate_tokens.py --limit 500 """ import os, sys, argparse sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) EVAL_LABELS = {"PER", "LOC", "ORG"} # MISC excluded — not in fine-tuned model def parse_conll(path, limit=None): sentences, labels = [], [] cur_w, cur_l = [], [] with open(path, encoding="utf-8") as f: for line in f: line = line.rstrip() if line == "" or line.startswith("#"): if cur_w: sentences.append(cur_w) labels.append(cur_l) cur_w, cur_l = [], [] if limit and len(sentences) >= limit: break else: parts = line.split() if len(parts) >= 4: cur_w.append(parts[0]) raw = parts[-1] # Remap MISC → O so evaluation is PER/LOC/ORG only cur_l.append("O" if "MISC" in raw else raw) if cur_w: sentences.append(cur_w) labels.append(cur_l) return sentences, labels def predict_tokens(words_list, tokenizer, model, device, batch_size=32): """ Run token classification on pre-tokenized word lists. Returns list of per-sentence label sequences aligned to original words. """ import torch from torch.nn.functional import softmax all_preds = [] for i in range(0, len(words_list), batch_size): if i % 200 == 0: print(f" {i}/{len(words_list)} sentences...", end="\r") batch_words = words_list[i: i + batch_size] enc = tokenizer( batch_words, is_split_into_words=True, truncation=True, max_length=512, padding=True, return_tensors="pt", ) # keep BatchEncoding for word_ids() before moving tensors to device word_ids_per_sent = [enc.word_ids(batch_index=b) for b in range(len(batch_words))] model_input = {k: v.to(device) for k, v in enc.items()} with torch.no_grad(): logits = model(**model_input).logits # (batch, seq, num_labels) preds_ids = logits.argmax(-1).cpu().tolist() for b_idx, words in enumerate(batch_words): word_ids = word_ids_per_sent[b_idx] word_preds = {} for pos, wid in enumerate(word_ids): if wid is None or wid in word_preds: continue # skip [CLS]/[SEP]/padding and non-first subwords word_preds[wid] = model.config.id2label[preds_ids[b_idx][pos]] sent_preds = [word_preds.get(j, "O") for j in range(len(words))] all_preds.append(sent_preds) print() return all_preds def main(limit=None): import torch from transformers import AutoTokenizer, AutoModelForTokenClassification from seqeval.metrics import (classification_report, f1_score, precision_score, recall_score) base = os.path.dirname(os.path.dirname(__file__)) test_path = os.path.join(base, "Data", "data", "test.txt") model_path = os.path.join(base, "adapters", "ner_mongolian") if not os.path.exists(model_path): print(f"ERROR: Fine-tuned model not found at {model_path}") print("Run fine-tuning first and place model at adapters/ner_mongolian/") sys.exit(1) print(f"Loading model from {model_path}...") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = AutoTokenizer.from_pretrained(model_path) model = AutoModelForTokenClassification.from_pretrained(model_path).to(device) model.eval() print(f"Model loaded on {device}") print(f"Parsing {test_path}...") sentences, true_labels = parse_conll(test_path, limit=limit) print(f"Sentences: {len(sentences)}") print("Running token-level prediction...") pred_labels = predict_tokens(sentences, tokenizer, model, device) print("\n" + "=" * 50) print("NER EVALUATION RESULTS (Token-Level, seqeval)") print("=" * 50) print(classification_report(true_labels, pred_labels)) print(f"Overall F1: {f1_score(true_labels, pred_labels):.4f}") print(f"Overall Precision: {precision_score(true_labels, pred_labels):.4f}") print(f"Overall Recall: {recall_score(true_labels, pred_labels):.4f}") print("=" * 50) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--limit", type=int, default=None, help="Evaluate on first N sentences only") args = parser.parse_args() main(args.limit)