Spaces:

palli23
/

ASR_API

Running on Zero

App Files Community

palli23 committed on Feb 6

Commit

a39d532

verified ·

1 Parent(s): 060c793

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -18

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py — Your original working version + repetition_penalty=1.2 + ngram=3
 import os
 os.environ["OMP_NUM_THREADS"] = "1"
@@ -9,6 +9,21 @@ import spaces
 from transformers import pipeline
 import torch
 import gc
 # ——————————————————————————————
 # ZeroGPU worker – model loaded inside
@@ -19,13 +34,8 @@ def transcribe_3min(audio_path):
         return "Hlaðið upp hljóðskrá"
     pipe = pipeline(
-        "automatic-speech-recognition",
-        #model="palli23/whisper-tiny-icelandic-distilled-v3",
-        #model = "palli23/whisper-tiny-distilled-spjallromur-polish-v3",
-        model = "palli23/whisper-tiny-distilled-spjallromur-polish-v5",
-        #model="palli23/whisper-tiny-distilled-samromur-spjallromur-polish",
-        #model="palli23/whisper-tiny-samromur-spjallromur",
-        #model="palli23/whisper-small-sam_spjall",
         torch_dtype=torch.float16,
         device=0,  # GPU inside @spaces.GPU
     )
@@ -34,33 +44,37 @@ def transcribe_3min(audio_path):
         audio_path,
         chunk_length_s=30,
         batch_size=8,
-        return_timestamps=False,  # ← no timestamps, as you want
         generate_kwargs={
-            "num_beams": 5,  #var beam size 1
-            "repetition_penalty": 1.2,     # ← exactly what you asked for
-            "no_repeat_ngram_size": 3,     # ← exactly what you asked for
-            "temperature": 0.3, #when problems, temp to 0.0 first!
         }
     )
-    # Clean memory so ZeroGPU lives forever
     del pipe
     gc.collect()
     torch.cuda.empty_cache()
-    return result["text"]
 # ——————————————————————————————
 # UI – clean and simple
 # ——————————————————————————————
 with gr.Blocks() as demo:
-    gr.Markdown("# Íslenskt ASR – 3 mínútur")
-    gr.Markdown("**palli23/whisper-small-sam_spjall** · mjög lágur WER · allt að 5 mín hljóð")
     gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
     audio_in = gr.Audio(type="filepath", label="Hlaðið upp .mp3 / .wav")
     btn = gr.Button("Transcribe", variant="primary", size="lg")
-    output = gr.Textbox(lines=25, label="Útskrift")
     btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)

+# app.py — Cleaned output version (no <UNK>, [HIK:...], etc.)
 import os
 os.environ["OMP_NUM_THREADS"] = "1"
 from transformers import pipeline
 import torch
 import gc
+import re
+# Post-process a transcript: drop <...> and [...] markers, then tidy whitespace and punctuation spacing.
+def clean_asr_text(text: str) -> str:
+    if not text:
+        return text
+    # Falsy input (empty string / None) was returned unchanged above; markers are removed before whitespace is collapsed
+    text = re.sub(r'<[^>]+>', '', text)                     # drop any <...> span, e.g. <UNK> or <|0.00|>
+    text = re.sub(r'\[.*?\]', '', text)                     # drop any [...] span on one line (non-greedy), e.g. [HIK:xxx], [laughter]
+    text = re.sub(r'\s+', ' ', text)                        # collapse each whitespace run (incl. tabs/newlines) to one space
+    text = re.sub(r'^\s+|\s+$', '', text)                   # trim leading/trailing whitespace
+    text = text.replace(' ,', ',').replace(' .', '.')       # ' ,' -> ',' and ' .' -> '.'
+    text = re.sub(r' +([.,!?])', r'\1', text)               # remove spaces before . , ! ? (also covers the previous step)
+    return text.strip()  # final trim; the text was already trimmed earlier, so this is a no-op safeguard
 # ——————————————————————————————
 # ZeroGPU worker – model loaded inside
         return "Hlaðið upp hljóðskrá"
     pipe = pipeline(
+        "automatic-speech-recognition",
+        model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",  # your current best model
         torch_dtype=torch.float16,
         device=0,  # GPU inside @spaces.GPU
     )
         audio_path,
         chunk_length_s=30,
         batch_size=8,
+        return_timestamps="word",           # often gives cleaner raw text than False
         generate_kwargs={
+            "num_beams": 5,
+            "repetition_penalty": 1.2,
+            "no_repeat_ngram_size": 3,
+            "temperature": 0.2,             # lower → less creative garbage
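+            "suppress_tokens": [-1],        # NOTE(review): -1 is not the <unk> token id; transformers expects real vocabulary ids here (openai-whisper's "-1" means its default non-speech set) — confirm this has the intended effect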
         }
     )
+    raw_text = result["text"]
+    cleaned = clean_asr_text(raw_text)
+    # Clean memory
     del pipe
     gc.collect()
     torch.cuda.empty_cache()
+    return cleaned
 # ——————————————————————————————
 # UI – clean and simple
 # ——————————————————————————————
 with gr.Blocks() as demo:
+    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
+    gr.Markdown("**palli23/whisper-tiny-distilled-spjallromur-polish-v5** · reynir að fjarlægja <UNK>, [HIK...], osfrv.")
     gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
     audio_in = gr.Audio(type="filepath", label="Hlaðið upp .mp3 / .wav")
     btn = gr.Button("Transcribe", variant="primary", size="lg")
+    output = gr.Textbox(lines=25, label="Útskrift (hreinsuð)")
     btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)