| import os, re, string, json, tempfile, uuid |
| import html |
| import inspect |
| import importlib.resources as importlib_resources |
| from collections import defaultdict |
|
|
| import gradio as gr |
| import torch |
| import numpy as np |
| import pandas as pd |
| from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
|
| |
| |
| |
# Optional dependency: FO-Tokenizer (Faroese tokenizer). The app degrades to a
# regex-based sentence splitter when it is not installed.
_HAS_FOTOKENIZER = False
try:
    import fotokenizer
    from fotokenizer import tokenize as fo_tokenize
    from fotokenizer import TOK as FO_TOK
    import fotokenizer.abbrev as fo_abbrev
except Exception:
    _HAS_FOTOKENIZER = False
else:
    _HAS_FOTOKENIZER = True
|
|
|
|
def _patch_fotokenizer_for_py313() -> None:
    """FO-Tokenizer currently uses importlib.resources.open_text(package=..., resource=...).
    In Python 3.13, open_text no longer accepts the `package=` keyword.
    This shim patches fotokenizer so it works on Python 3.13 (Hugging Face Spaces default)."""
    if not _HAS_FOTOKENIZER:
        return
    try:
        params = inspect.signature(importlib_resources.open_text).parameters
        if "package" in params:
            # Stdlib still accepts the keyword form; no shim required.
            return

        def _open_text_compat(*args, **kwargs):
            # Translate the legacy keyword call into positional arguments.
            try:
                pkg = kwargs.pop("package")
            except KeyError:
                return importlib_resources.open_text(*args, **kwargs)
            res = kwargs.pop("resource")
            encoding = kwargs.pop("encoding", "utf-8")
            errors = kwargs.pop("errors", "strict")
            return importlib_resources.open_text(pkg, res, encoding=encoding, errors=errors)

        fo_abbrev.open_text = _open_text_compat
    except Exception:
        # Best-effort patch: a failure here just leaves fotokenizer unpatched.
        pass


# Apply the compatibility shim once, at import time.
_patch_fotokenizer_for_py313()
|
|
| |
| |
| |
# Hugging Face model repo of the BRAGD token-classification model.
MODEL_ID = "Setur/BRAGD"
# CSV mapping each original tag to its binary feature vector.
TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
# JSON with human-readable ("fo"/"en") labels for tags and feature codes.
LABELS_FILEPATH = "tag_labels.json"

# Desired sequence length; later clamped to model/tokenizer maxima.
TARGET_MAX_TOKENS = 256

# Fail fast at import time if the label file was not shipped with the repo.
if not os.path.exists(LABELS_FILEPATH):
    raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")

# Inclusive (start, end) index ranges inside the feature vector, sliced as
# pred[a:b + 1] during decoding. Positions 0-14 are the word-class one-hot
# (see predict_vectors); these intervals presumably correspond to the feature
# groups in GROUP_ORDER — confirm against the tags CSV columns.
INTERVALS = (
    (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
    (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
)

# Display order of feature groups in analyses, expanded tags and the overview.
GROUP_ORDER = [
    "subcategory", "gender", "number", "case", "article", "proper",
    "degree", "declension", "mood", "voice", "tense", "person", "definiteness"
]
# Per-group codes that are never shown to the user.
HIDE_CODES = {"subcategory": {"B"}}

# Column headers of the result tables, per UI language.
UI = {
    "fo": {"w": "Orð", "t": "Mark", "s": "Útgreining", "m": "Útgreinað marking"},
    "en": {"w": "Word", "t": "Tag", "s": "Analysis", "m": "Expanded tags"},
}

# Link target shown next to the model name in the UI.
MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
|
|
| |
| |
| |
# Custom stylesheet injected into the Gradio app: brand-coloured buttons (with
# dark-mode variants), right-aligned header button rows, a transparent input
# column, and horizontally scrollable result tables.
CSS = """
/* Keep Gradio default styling; only override our buttons. */
#btn_tag, #lang_fo_on, #lang_en_on{
background:#89AFA9 !important;
border-color:#6F9992 !important;
color:#0b1b19 !important;
}
#btn_tag:hover, #lang_fo_on:hover, #lang_en_on:hover{
background:#6F9992 !important;
border-color:#6F9992 !important;
color:#0b1b19 !important;
}
#lang_fo_off, #lang_en_off, #btn_dl_main, #btn_dl_exp{
background:#C6DAD6 !important;
border-color:#6F9992 !important;
color:#0b1b19 !important;
}
#lang_fo_off:hover, #lang_en_off:hover, #btn_dl_main:hover, #btn_dl_exp:hover{
background:#89AFA9 !important;
border-color:#6F9992 !important;
color:#0b1b19 !important;
}
@media (prefers-color-scheme: dark){
#lang_fo_off, #lang_en_off, #btn_dl_main, #btn_dl_exp{
background:#2a3b38 !important;
border-color:#6F9992 !important;
color:#e7eceb !important;
}
#lang_fo_off:hover, #lang_en_off:hover, #btn_dl_main:hover, #btn_dl_exp:hover{
background:#89AFA9 !important;
border-color:#6F9992 !important;
color:#0b1b19 !important;
}
}
#results_hdr{
display:flex !important;
align-items:center !important;
gap:12px !important;
}
#results_hdr > .gr-markdown{ flex:1 1 auto !important; }
#lang_buttons{
display:flex !important;
gap:10px !important;
justify-content:flex-end !important;
align-items:center !important;
flex-wrap:nowrap !important;
}
#lang_buttons .gr-button, #lang_buttons button{
width:auto !important;
min-width:120px !important;
flex:0 0 auto !important;
}
#expanded_hdr{
display:flex !important;
align-items:center !important;
gap:12px !important;
}
#expanded_hdr > .gr-markdown{ flex:1 1 auto !important; }
#expanded_buttons{
display:flex !important;
gap:10px !important;
justify-content:flex-end !important;
align-items:center !important;
flex-wrap:nowrap !important;
}
#expanded_buttons .gr-button, #expanded_buttons button{
width:auto !important;
min-width:120px !important;
flex:0 0 auto !important;
}
#input_col,
#input_col > div,
#input_col .gr-block,
#input_col .gr-box,
#input_col .gr-panel,
#input_col .gr-group,
#input_col .gr-form{
background: transparent !important;
border: 0 !important;
box-shadow: none !important;
}
#btn_tag{
align-self:flex-start !important;
flex:0 0 auto !important;
height:fit-content !important;
}
#btn_tag button{
height:auto !important;
}
#out_df .df-scroll, #out_mean_df .df-scroll{
overflow-x:auto !important;
width:100% !important;
}
#out_df table.df-table, #out_mean_df table.df-table{
border-collapse:collapse !important;
width:max-content !important;
min-width:100% !important;
}
#out_df th, #out_df td,
#out_mean_df th, #out_mean_df td{
white-space:nowrap !important;
padding:10px 12px !important;
border:1px solid rgba(0,0,0,0.12) !important;
text-align:left !important;
vertical-align:top !important;
}
#out_df thead th, #out_mean_df thead th{
font-weight:600 !important;
background: rgba(0,0,0,0.03) !important;
}
@media (prefers-color-scheme: dark){
#out_df th, #out_df td,
#out_mean_df th, #out_mean_df td{
border:1px solid rgba(255,255,255,0.14) !important;
}
#out_df thead th, #out_mean_df thead th{
background: rgba(255,255,255,0.06) !important;
}
}
"""
|
|
| |
| |
| |
def simp_tok(sentence: str):
    """Naively split *sentence* into word tokens and single punctuation marks."""
    pattern = r"\w+|[" + re.escape(string.punctuation) + "]"
    return re.findall(pattern, sentence)
|
|
|
|
| |
| |
| |
def split_sentences(text: str):
    """Split input into sentences.

    We use FO-Tokenizer sentence markers (BEGIN_SENT / END_SENT) when possible.

    Important detail: some FO-Tokenizer builds emit *whitespace* as "descriptor-only"
    tokens (empty `.txt`). If we simply join `.txt` pieces we can lose spaces and end
    up with merged words (e.g. `Núriggarkanska`). This function therefore:
    - preserves `.txt` pieces as-is
    - converts descriptor-only whitespace-like tokens into a single space
    - adds a best-effort inserted space between tokens in cases where whitespace
      is missing but clearly intended (word→word, comma/semicolon/colon→word)
    """

    s = (text or "")
    if not s.strip():
        return []

    def _norm(piece: str) -> str:
        # Newlines inside a sentence are flattened to single spaces.
        return re.sub(r"[\r\n]+", " ", piece)

    def _append_piece(buf: list[str], piece: str) -> None:
        # Append `piece` to the sentence buffer, inserting a space when the
        # boundary to the previous piece clearly needs one.
        if not piece:
            return
        piece = _norm(piece)
        if not buf:
            buf.append(piece)
            return

        last = buf[-1]
        last_char = last[-1] if last else ""
        if last_char.isspace():
            # Already separated; nothing to insert.
            buf.append(piece)
            return

        # Heuristic: word→word or ","/";"/":"→word transitions get a space.
        if piece[0].isalnum() and (last_char.isalnum() or last_char in {",", ";", ":"}):
            buf.append(" ")

        buf.append(piece)

    if _HAS_FOTOKENIZER:
        try:
            toks = fo_tokenize(s)
            sents: list[str] = []   # completed sentences
            cur: list[str] = []     # pieces of the sentence being built

            for tok in toks:
                # Tokens carrying literal text are kept verbatim.
                if getattr(tok, "txt", None):
                    _append_piece(cur, tok.txt)
                    continue

                # Descriptor-only token: classify it by its kind description.
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")

                if descr == "BEGIN_SENT":
                    # Flush any sentence accumulated so far before starting anew.
                    if cur:
                        sent = "".join(cur).strip()
                        if sent:
                            sents.append(sent)
                        cur = []
                    continue

                if descr == "END_SENT":
                    sent = "".join(cur).strip()
                    if sent:
                        sents.append(sent)
                    cur = []
                    continue

                # Whitespace-like descriptors become a single space; dashes are
                # re-inserted literally; everything else is dropped.
                up = descr.upper()
                if "WHITESPACE" in up or "SPACE" in up or "TAB" in up:
                    _append_piece(cur, " ")
                elif "NEWLINE" in up or ("LINE" in up and "BREAK" in up):
                    _append_piece(cur, " ")
                elif up == "DASH":
                    _append_piece(cur, "-")
                else:
                    pass

            # Flush a trailing sentence with no explicit END_SENT marker.
            if cur:
                sent = "".join(cur).strip()
                if sent:
                    sents.append(sent)

            # If tokenization produced nothing, fall back to the whole input.
            return sents or [s.strip()]
        except Exception:
            # Any tokenizer failure falls through to the regex splitter below.
            pass

    # Fallback: naive split after sentence-final punctuation + whitespace.
    parts = re.split(r"(?<=[.!?])\s+", s.strip())
    return [p.strip() for p in parts if p.strip()]
|
|
|
|
def run_model_multisentence(text: str):
    """Run the model on each sentence of *text* and return the concatenated rows."""
    return [
        row
        for sent in split_sentences(text)
        for row in run_model(sent)
    ]
|
|
|
|
| |
| |
| |
def load_tag_mappings(path: str):
    """Load the tag <-> feature-vector mapping CSV.

    The CSV's first column is "Original Tag"; every remaining column is a
    binary feature. Both lookup directions are built in a single pass over
    the rows (the original iterated the DataFrame twice), and positional
    slicing is done explicitly with ``.iloc`` instead of the ambiguous
    ``row[1:]``.

    Args:
        path: Path to the tags CSV file.

    Returns:
        (tag_to_features, features_to_tag, feature_count, feature_cols) where
        tag_to_features maps tag -> int ndarray, features_to_tag maps the
        tuple of ints back to the tag, and feature_cols lists feature names.
    """
    df = pd.read_csv(path)
    feature_cols = list(df.columns[1:])
    tag_to_features = {}
    features_to_tag = {}
    for _, row in df.iterrows():
        feats = row.iloc[1:].values.astype(int)
        tag = row["Original Tag"]
        tag_to_features[tag] = feats
        features_to_tag[tuple(feats)] = tag
    return tag_to_features, features_to_tag, len(feature_cols), feature_cols
|
|
|
|
def group_from_col(col: str):
    """Map a feature-CSV column name to a (group, code) pair.

    Returns (None, None) when the column name matches no known pattern.
    """
    exact = {
        "Article": ("article", "A"),
        "Proper Noun": ("proper", "P"),
    }
    if col in exact:
        return exact[col]
    if col.startswith("No-Article "):
        return ("article", col.split()[-1])
    if col.startswith("Not-Proper-Noun "):
        return ("proper", col.split()[-1])

    # Remaining columns follow a "<Prefix> <code>" scheme; the prefix decides
    # the group and the last whitespace-separated token is the code.
    prefix_to_group = {
        "Word Class ": "word_class",
        "Subcategory ": "subcategory", "No-Subcategory ": "subcategory",
        "Gender ": "gender", "No-Gender ": "gender",
        "Number ": "number", "No-Number ": "number",
        "Case ": "case", "No-Case ": "case",
        "Degree ": "degree", "No-Degree ": "degree",
        "Declension ": "declension", "No-Declension ": "declension",
        "Mood ": "mood",
        "Voice ": "voice", "No-Voice ": "voice",
        "Tense ": "tense", "No-Tense ": "tense",
        "Person ": "person", "No-Person ": "person",
        "Definite ": "definiteness", "Indefinite ": "definiteness",
    }
    for prefix, group in prefix_to_group.items():
        if col.startswith(prefix):
            return (group, col.split()[-1])
    return (None, None)
|
|
|
|
def process_tag_features(tag_to_features: dict, intervals):
    """For each word class (positions 0-14), find which feature intervals are
    actually used by at least one known tag of that class.

    Returns a dict: word-class index -> list of (start, end) intervals.
    """
    unique_vecs = {tuple(v) for v in tag_to_features.values()}
    arrs = [np.array(t) for t in unique_vecs]
    result = {}
    for wt in range(15):
        members = [a for a in arrs if a[wt] == 1]
        if not members:
            result[wt] = []
            continue
        # Column-wise totals over all tags of this class; an interval is kept
        # when any of its positions is ever set (inclusive end -> b + 1).
        totals = np.array(members).sum(axis=0)
        result[wt] = [iv for iv in intervals if totals[iv[0]:iv[1] + 1].sum() != 0]
    return result
|
|
|
|
def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_len):
    """Decode per-token logits into one-hot feature vectors.

    Only positions that are attended (attention_mask == 1) and start a word
    (begin_tokens == 1) are decoded. For each such position the word class is
    the argmax over logits[0:15]; then, for every feature interval registered
    for that class, the interval-local argmax position is set to 1.

    Note: the original applied a softmax before each argmax. Softmax is
    strictly monotonic, so argmax(softmax(x)) == argmax(x); the softmax (and
    the per-call torch.nn.Softmax construction) is dropped with identical
    results.

    Args:
        logits: (seq_len, vec_len) tensor of raw model outputs.
        attention_mask: (seq_len,) tensor of 0/1 attention flags.
        begin_tokens: sequence of 0/1 flags marking word-initial subtokens.
        dict_intervals: word class -> list of inclusive (a, b) intervals.
        vec_len: length of the output one-hot vectors.

    Returns:
        List of (vec_len,) float tensors, one per decoded position.
    """
    vectors = []
    for idx in range(len(logits)):
        if attention_mask[idx].item() != 1 or begin_tokens[idx] != 1:
            continue
        pred = logits[idx]
        vec = torch.zeros(vec_len, device=logits.device)
        # Word class one-hot (positions 0-14).
        wt = torch.argmax(pred[0:15]).item()
        vec[wt] = 1
        # One winner per feature interval relevant to this word class.
        for (a, b) in dict_intervals.get(wt, []):
            k = torch.argmax(pred[a:b + 1]).item()
            vec[a + k] = 1
        vectors.append(vec)
    return vectors
|
|
|
|
| |
| |
| |
# Human-readable label trees ("fo" and "en") for tags and feature codes,
# loaded once at import time; consumed by label_for().
with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
    LABELS = json.load(f)
|
|
|
|
def label_for(lang: str, group: str, wc: str, code: str) -> str:
    """Look up the display label for a feature code.

    Word-class-specific labels take precedence over the global ones; returns
    "" when no label is defined. Any *lang* other than "fo" means English.
    """
    lang = lang if lang == "fo" else "en"
    lang_tree = LABELS.get(lang, {})
    per_wc = lang_tree.get("by_word_class", {})
    if wc:
        specific = per_wc.get(wc, {}).get(group, {})
        if code in specific:
            return specific[code]
    return lang_tree.get("global", {}).get(group, {}).get(code, "")
|
|
|
|
def clean_label(s: str) -> str:
    """Normalize a label: collapse all whitespace runs to single spaces and
    trim stray separators (" -;,:") from both ends. None becomes ""."""
    collapsed = " ".join((s or "").split())
    return collapsed.strip(" -;,:").strip()
|
|
|
|
| |
| |
| |
# ── One-time initialisation: mappings, tokenizer, model ──────────────────────
tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)

# Load the tokenizer and token-classification model; use GPU when available.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Clamp the working sequence length to what the model/tokenizer support.
MAX_TOKENS = int(TARGET_MAX_TOKENS)
_model_max = getattr(getattr(model, "config", None), "max_position_embeddings", None)
_tok_max = getattr(tokenizer, "model_max_length", None)

for _m in (_model_max, _tok_max):
    # The < 100000 guard skips sentinel values some tokenizers report for
    # "unlimited" model_max_length.
    if isinstance(_m, int) and 0 < _m < 100000:
        MAX_TOKENS = min(MAX_TOKENS, _m)

# Sanity check: the CSV feature width must match the model's label count.
if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
    raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")

# Per word class: which feature intervals are used by any known tag.
DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)

# group name -> [(feature index, code, column name), ...] for visible codes.
GROUPS = defaultdict(list)
for i, col in enumerate(FEATURE_COLS):
    g, code = group_from_col(col)
    if g and code not in HIDE_CODES.get(g, set()):
        GROUPS[g].append((i, code, col))
|
|
|
|
def vector_to_tag(vec: torch.Tensor) -> str:
    """Map a one-hot feature vector back to its original tag string."""
    key = tuple(vec.int().tolist())
    if key in features_to_tag:
        return features_to_tag[key]
    return "Unknown Tag"
|
|
|
|
def wc_code(vec: torch.Tensor) -> str:
    """Return the active word-class code in *vec*, or "" if none is set."""
    active = (
        code
        for idx, code, _ in GROUPS["word_class"]
        if int(vec[idx].item()) == 1
    )
    return next(active, "")
|
|
|
|
def group_code(vec: torch.Tensor, group: str) -> str:
    """Return the active, non-hidden code for *group* in *vec*, or ""."""
    hidden = HIDE_CODES.get(group, set())
    for idx, code, _ in GROUPS.get(group, []):
        if code not in hidden and int(vec[idx].item()) == 1:
            return code
    return ""
|
|
|
|
# (word class, group, code) triples suppressed in the compact analysis text.
HIDE_IN_ANALYSIS = {("D", "subcategory", "G"), ("D", "subcategory", "N")}
# Voice labels used by the supine-specific rendering in analysis_text();
# "v" is the fallback code when no voice feature is set.
VOICE_ANALYSIS = {
    "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
    "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
}
|
|
|
|
def analysis_text(vec: torch.Tensor, lang: str) -> str:
    """Build the compact, comma-separated analysis string for one word.

    Fix: the original computed ``group_code(vec, "mood")`` twice (once for the
    verb-feature skip flag, once for the supine check); it is now computed a
    single time with identical behavior.

    Args:
        vec: One-hot feature vector for the word.
        lang: "fo" or anything else (treated as "en").

    Returns:
        Human-readable analysis, e.g. "navnorð, kvørkyn, eintal".
    """
    lang = "fo" if lang == "fo" else "en"
    tag = vector_to_tag(vec)
    wc = wc_code(vec)
    mood = group_code(vec, "mood")

    # Verbs in mood I/M carry placeholder codes (n/t/p) for number, tense and
    # person; those placeholders are suppressed below instead of rendered.
    skip_empty_verb_feats = (wc == "V" and mood in {"I", "M"})

    # Hard-coded special case: the DGd tag renders as a bare "preposition".
    if tag == "DGd":
        return "fyriseting" if lang == "fo" else "preposition"

    # Supine ("U" mood) gets a dedicated "<supine>, <voice>" rendering.
    if mood == "U":
        sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang == "fo" else "supine")
        vcode = group_code(vec, "voice") or "v"
        vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
        return f"{clean_label(sup)}, {clean_label(vlabel)}"

    parts = []
    # Pronouns/conjunctions lead with their subcategory label; other word
    # classes lead with the word-class label itself.
    if wc in {"P", "C"}:
        subc = group_code(vec, "subcategory")
        subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
        if subl:
            parts.append(subl)
    else:
        wcl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
        if wcl:
            parts.append(wcl)

    for g in GROUP_ORDER:
        c = group_code(vec, g)
        if not c:
            continue
        if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
            continue
        if wc in {"P", "C"} and g == "subcategory":
            continue  # already emitted as the leading part above
        if (wc, g, c) in HIDE_IN_ANALYSIS:
            continue
        lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
        if lbl and lbl not in parts:
            parts.append(lbl)

    return ", ".join(parts)
|
|
|
|
def expanded_text(vec: torch.Tensor, lang: str) -> str:
    """Render the full "code – label" breakdown for one feature vector,
    starting with the word class and following GROUP_ORDER."""
    lang = "fo" if lang == "fo" else "en"
    wc = wc_code(vec)
    pieces = []

    wc_label = label_for(lang, "word_class", wc, wc)
    pieces.append(f"{wc} – {wc_label}" if wc_label else wc)

    for group in GROUP_ORDER:
        code = group_code(vec, group)
        if not code:
            continue
        label = label_for(lang, group, wc, code) or label_for(lang, group, "", code)
        pieces.append(f"{code} – {label}" if label else code)

    return "; ".join(p for p in pieces if p)
|
|
|
|
def compute_codes_by_wc():
    """Collect, per word class, the set of feature codes actually used by any
    known tag (hidden codes excluded). Shape: {wc: {group: {code, ...}}}."""
    codes = defaultdict(lambda: defaultdict(set))
    wc_entries = GROUPS["word_class"]
    for arr in tag_to_features.values():
        arr = np.array(arr)
        wc = next((code for idx, code, _ in wc_entries if arr[idx] == 1), None)
        if not wc:
            continue
        for g in GROUP_ORDER:
            hidden = HIDE_CODES.get(g, set())
            codes[wc][g].update(
                code
                for idx, code, _ in GROUPS.get(g, [])
                if code not in hidden and arr[idx] == 1
            )
    return codes


# Precomputed once at startup; drives the tag-overview markdown.
CODES_BY_WC = compute_codes_by_wc()
|
|
|
|
def build_overview(lang: str) -> str:
    """Build the markdown tag overview: one section per word class, one
    bullet list per feature group that class actually uses.

    Fix: the original re-created the two-language group-name dict literal on
    every (word class × group) loop iteration; it is now built once before
    the loops, with identical output.

    Args:
        lang: "fo" or anything else (treated as "en").

    Returns:
        Markdown text for the overview accordion.
    """
    lang = "fo" if lang == "fo" else "en"

    # Loop-invariant: display names for feature groups in the chosen language.
    group_names = {
        "fo": {
            "subcategory": "Undirflokkur", "gender": "Kyn", "number": "Tal", "case": "Fall",
            "article": "Bundni/óbundni", "proper": "Sernavn / felagsnavn", "degree": "Stig",
            "declension": "Bending", "mood": "Háttur", "voice": "Søgn", "tense": "Tíð",
            "person": "Persónur", "definiteness": "Bundni/óbundni"
        },
        "en": {
            "subcategory": "Subcategory", "gender": "Gender", "number": "Number", "case": "Case",
            "article": "Definiteness", "proper": "Proper/common noun", "degree": "Degree",
            "declension": "Declension", "mood": "Mood", "voice": "Voice", "tense": "Tense",
            "person": "Person", "definiteness": "Definiteness"
        },
    }[lang]

    title = "### Markayvirlit" if lang == "fo" else "### Tag Overview"
    lines = [title, ""]
    for wc in sorted(CODES_BY_WC.keys()):
        wcl = label_for(lang, "word_class", wc, wc) or ""
        lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
        for g in GROUP_ORDER:
            cs = sorted(CODES_BY_WC[wc].get(g, set()))
            if not cs:
                continue
            lines.append(f"**{group_names.get(g, g)}**")
            for c in cs:
                lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
                lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
            lines.append("")
        lines.append("")
    return "\n".join(lines).strip()
|
|
|
|
def run_model(sentence: str):
    """Tag one sentence.

    Returns a list of {"word": str, "vec": list[int]} rows, one per input
    word (up to truncation at MAX_TOKENS).
    """
    s = (sentence or "").strip()
    if not s:
        return []
    tokens = simp_tok(s)
    if not tokens:
        return []

    # Encode the pre-split words; pad/truncate to the clamped MAX_TOKENS.
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=MAX_TOKENS,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    word_ids = enc.word_ids(batch_index=0)

    # begin[i] == 1 exactly for the first subtoken of each word; special
    # tokens (word id None) are always 0.
    begin, last = [], None
    for wid in word_ids:
        if wid is None:
            begin.append(0)
        elif wid != last:
            begin.append(1)
        else:
            begin.append(0)
        last = wid

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[0]

    # One decoded one-hot vector per word-initial subtoken.
    vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)

    # Align predicted vectors back to the original words: vectors were emitted
    # in word-initial order, so consume them in lockstep with begin positions.
    rows, vec_i, seen = [], 0, set()
    for i, wid in enumerate(word_ids):
        if wid is None or begin[i] != 1 or wid in seen:
            continue
        seen.add(wid)
        word = tokens[wid] if wid < len(tokens) else "<UNK>"
        # Defensive: pad with a zero vector if fewer vectors than words.
        vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
        rows.append({"word": word, "vec": vec.int().tolist()})
        vec_i += 1
    return rows
|
|
|
|
def _make_html_table(headers, rows):
    """Render *headers* and *rows* as a horizontally scrollable HTML table.

    Every cell value is stringified and HTML-escaped.
    """
    def _cells(values, tag):
        return "".join(f"<{tag}>{html.escape(str(v))}</{tag}>" for v in values)

    head = _cells(headers, "th")
    body = "".join(f"<tr>{_cells(row, 'td')}</tr>" for row in rows)
    return (
        '<div class="df-scroll">'
        f'<table class="df-table"><thead><tr>{head}</tr></thead><tbody>{body}</tbody></table>'
        '</div>'
    )
|
|
|
|
def render(rows_state, lang: str):
    """Build (main table HTML, expanded table HTML, overview markdown)."""
    lang = "fo" if lang == "fo" else "en"
    headers = UI[lang]
    cols_main = [headers["w"], headers["t"], headers["s"]]
    cols_mean = [headers["w"], headers["t"], headers["m"]]

    main_rows, mean_rows = [], []
    for entry in (rows_state or []):
        vec = torch.tensor(entry["vec"])
        tag = vector_to_tag(vec)
        main_rows.append([entry["word"], tag, analysis_text(vec, lang)])
        mean_rows.append([entry["word"], tag, expanded_text(vec, lang)])

    return (
        _make_html_table(cols_main, main_rows),
        _make_html_table(cols_mean, mean_rows),
        build_overview(lang),
    )
|
|
|
|
def _write_tsv(df: pd.DataFrame, filename: str) -> str:
    """Write *df* as a UTF-8 TSV in a fresh per-download temp directory.

    A unique directory per call avoids filename collisions between concurrent
    downloads. Returns the full file path.
    """
    out_dir = os.path.join(tempfile.gettempdir(), "marka_downloads", str(uuid.uuid4()))
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, filename)
    df.to_csv(out_path, sep="\t", index=False, encoding="utf-8")
    return out_path
|
|
|
|
def build_download_main(rows_state) -> str:
    """Write the main results (word, tag, FO analysis, EN analysis) to a TSV
    and return the file path."""
    entries = [(r["word"], torch.tensor(r["vec"])) for r in (rows_state or [])]
    df = pd.DataFrame({
        UI["fo"]["w"]: [w for w, _ in entries],
        UI["fo"]["t"]: [vector_to_tag(v) for _, v in entries],
        UI["fo"]["s"]: [analysis_text(v, "fo") for _, v in entries],
        UI["en"]["s"]: [analysis_text(v, "en") for _, v in entries],
    })
    return _write_tsv(df, "Markað.tsv")
|
|
|
|
def build_download_expanded(rows_state, lang: str) -> str:
    """Write the expanded-tags table for *lang* to a TSV and return the path."""
    lang = "fo" if lang == "fo" else "en"
    entries = [(r["word"], torch.tensor(r["vec"])) for r in (rows_state or [])]
    df = pd.DataFrame({
        UI[lang]["w"]: [w for w, _ in entries],
        UI[lang]["t"]: [vector_to_tag(v) for _, v in entries],
        UI[lang]["m"]: [expanded_text(v, lang) for _, v in entries],
    })
    return _write_tsv(df, "Markað_útgreinað.tsv")
|
|
|
|
# ── Gradio UI: layout, event handlers, wiring ────────────────────────────────
with gr.Blocks(css=CSS, title="Marka") as demo:
    # Input area: textbox on the left, title + tag button on the right.
    with gr.Row(equal_height=False):
        with gr.Column(scale=2, elem_id="input_col"):
            inp = gr.Textbox(
                lines=6,
                placeholder="Skriva her ... / Type here ...",
                show_label=False,
                elem_id="input_box",
            )
        with gr.Column(scale=1, min_width=320):
            gr.Markdown(
                "## Marka\n"
                "Skriv ein setning í kassan og fá hann markaðan.\n\n"
                f"Myndil / Model: [{MODEL_ID}]({MODEL_LINK})"
            )
            btn = gr.Button("Marka / Tag", variant="primary", elem_id="btn_tag")

    # Tagged rows from the last run and the active UI language ("fo"/"en").
    state = gr.State([])
    lang_state = gr.State("fo")

    # Results header: title plus language toggle / download buttons. Each
    # language has an "on" (active) and "off" (inactive) button variant whose
    # visibility is swapped instead of restyling a single button.
    results_hdr = gr.Row(elem_id="results_hdr", visible=True)
    with results_hdr:
        results_title = gr.Markdown("### Úrslit / Results")
        with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
            btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
            btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
            btn_lang_en_on = gr.Button("English", variant="primary", elem_id="lang_en_on", visible=False)
            btn_lang_en_off = gr.Button("English", variant="secondary", elem_id="lang_en_off", visible=False)
            btn_dl_main = gr.DownloadButton("Tak niður / Download", variant="secondary", elem_id="btn_dl_main", visible=False)
    out_df = gr.HTML(value="", elem_id="out_df", visible=False)

    # Expanded-tags accordion with its own download button.
    expanded_acc = gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False)
    with expanded_acc:
        with gr.Row(elem_id="expanded_hdr"):
            gr.Markdown(" ")
            with gr.Row(elem_id="expanded_buttons"):
                btn_dl_exp = gr.DownloadButton("Tak niður / Download", variant="secondary", elem_id="btn_dl_exp", visible=False)
        out_mean_df = gr.HTML(value="", elem_id="out_mean_df")

    # Static tag overview, rendered once at startup in Faroese.
    overview_acc = gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True)
    with overview_acc:
        overview_md = gr.Markdown(build_overview("fo"))

    def show_loading(lang_current):
        """Immediate feedback while tagging runs: show an empty table shell,
        hide downloads and the expanded section, disable the button.

        Tuple order matches the click outputs: out_df, btn_dl_main,
        btn_dl_exp, expanded_acc, out_mean_df, btn.
        """
        lang_current = "fo" if lang_current == "fo" else "en"
        cols_main = [UI[lang_current]["w"], UI[lang_current]["t"], UI[lang_current]["s"]]
        shell = _make_html_table(cols_main, [])
        return (
            gr.update(value=shell, visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(value=""),
            gr.update(value="Markar... / Tagging...", interactive=False),
        )

    def on_tag(text, lang_current):
        """Run the tagger on *text* and refresh every results widget.

        Tuple order matches the .then() outputs list below.
        """
        rows = run_model_multisentence(text)
        df_main, df_mean, overview = render(rows, lang_current)

        show_fo = (lang_current == "fo")
        show_en = (lang_current == "en")

        # Pre-build the TSV downloads so the DownloadButtons respond instantly.
        have_rows = bool(rows)
        dl_main_path = build_download_main(rows) if have_rows else None
        dl_exp_path = build_download_expanded(rows, lang_current) if have_rows else None

        return (
            rows,
            gr.update(value=df_main, visible=True),
            gr.update(value=df_mean),
            gr.update(value=overview),
            gr.update(visible=True),
            gr.update(visible=show_fo),
            gr.update(visible=not show_fo),
            gr.update(visible=show_en),
            gr.update(visible=not show_en),
            gr.update(value=dl_main_path, visible=have_rows),
            gr.update(value=dl_exp_path, visible=have_rows),
            lang_current,
            gr.update(value="Marka / Tag", interactive=True),
        )

    def on_set_lang(rows, lang_value):
        """Re-render tables, overview and downloads in *lang_value* and flip
        the language-button visibility accordingly."""
        df_main, df_mean, overview = render(rows, lang_value)

        show_fo = (lang_value == "fo")
        show_en = (lang_value == "en")

        have_rows = bool(rows)
        dl_main_path = build_download_main(rows) if have_rows else None
        dl_exp_path = build_download_expanded(rows, lang_value) if have_rows else None

        return (
            lang_value,
            gr.update(value=df_main),
            gr.update(value=df_mean),
            gr.update(value=overview),
            gr.update(visible=show_fo),
            gr.update(visible=not show_fo),
            gr.update(visible=show_en),
            gr.update(visible=not show_en),
            gr.update(value=dl_main_path, visible=have_rows),
            gr.update(value=dl_exp_path, visible=have_rows),
        )

    def on_set_fo(rows):
        # Thin wrapper binding the language; button callbacks only get *rows*.
        return on_set_lang(rows, "fo")

    def on_set_en(rows):
        # Thin wrapper binding the language; button callbacks only get *rows*.
        return on_set_lang(rows, "en")

    # Tag button: first flip the UI into its loading state (fast, unqueued),
    # then run the model and publish the results.
    _evt = btn.click(
        show_loading,
        inputs=[lang_state],
        outputs=[out_df, btn_dl_main, btn_dl_exp, expanded_acc, out_mean_df, btn],
        queue=False,
    )

    _evt.then(
        on_tag,
        inputs=[inp, lang_state],
        outputs=[
            state, out_df, out_mean_df, overview_md, expanded_acc,
            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off,
            btn_dl_main, btn_dl_exp, lang_state, btn
        ],
        queue=False,
    )

    # Both the "on" and "off" variants of each language button trigger the
    # same handler; clicking the already-active language just re-renders.
    btn_lang_fo_on.click(
        on_set_fo,
        inputs=[state],
        outputs=[
            lang_state, out_df, out_mean_df, overview_md,
            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off,
            btn_dl_main, btn_dl_exp
        ],
        queue=False,
    )
    btn_lang_fo_off.click(
        on_set_fo,
        inputs=[state],
        outputs=[
            lang_state, out_df, out_mean_df, overview_md,
            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off,
            btn_dl_main, btn_dl_exp
        ],
        queue=False,
    )
    btn_lang_en_on.click(
        on_set_en,
        inputs=[state],
        outputs=[
            lang_state, out_df, out_mean_df, overview_md,
            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off,
            btn_dl_main, btn_dl_exp
        ],
        queue=False,
    )
    btn_lang_en_off.click(
        on_set_en,
        inputs=[state],
        outputs=[
            lang_state, out_df, out_mean_df, overview_md,
            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off,
            btn_dl_main, btn_dl_exp
        ],
        queue=False,
    )


if __name__ == "__main__":
    demo.launch()