| from transformers import pipeline |
| from transformers.pipelines.audio_utils import ffmpeg_microphone_live |
| import torch |
| import gradio as gr |
|
|
# Model identifiers: a tiny English-only Whisper model for speech-to-text,
# and a DeBERTa-v3 NLI model used for zero-shot intent classification.
asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Run on GPU 0 when CUDA is available, otherwise on CPU (-1, the
# transformers pipeline convention).
# FIX: `device` was referenced below without ever being defined, which
# raised a NameError as soon as the script ran.
device = 0 if torch.cuda.is_available() else -1

pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
# Feed the microphone at whatever rate the model's feature extractor expects.
sampling_rate = pipe.feature_extractor.sampling_rate

# Live microphone stream: the model sees up to `chunk_length_s` seconds of
# audio context, refreshed every `stream_chunk_s` second.
chunk_length_s = 10
stream_chunk_s = 1
mic = ffmpeg_microphone_live(
    sampling_rate=sampling_rate,
    chunk_length_s=chunk_length_s,
    stream_chunk_s=stream_chunk_s,
)
|
|
def listen_print_loop(responses):
    """Consume streaming ASR results and return the final transcription.

    Each item of `responses` is assumed to be a dict with a "text" key
    (the transcription so far) and a "partial" key that is falsy once the
    chunk is complete — TODO confirm against the ASR pipeline's streaming
    output format.  Partial results are echoed in place on a single
    console line; the text of the final (non-partial) result is returned.

    FIX: the original returned on the very first non-empty result, which
    is still partial, so the `if not response["partial"]` branch was dead
    for any non-empty text and a final transcript was never awaited.
    """
    text = ""
    for response in responses:
        if response["text"]:
            text = response["text"]
            # Overwrite the same console line with the refreshed transcript.
            print(text, end="\r")
        if not response["partial"]:
            # Final result for this chunk: move to a fresh line and stop.
            print("")
            return text
    # Stream ended without a final result; return whatever we last saw.
    return text
|
|
|
|
# Intents the zero-shot classifier can map a spoken command onto.
candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]
# Zero-shot NLI classifier used to pick the most likely intent above.
classifier = pipeline("zero-shot-classification", model=nlp_model)
|
|
|
|
# Interactive voice-command loop: transcribe one spoken chunk, then classify
# the intent.  FIX: the bare `while True` had no exit path, so the process
# could only be killed and the Gradio app defined below was unreachable in
# the same run; Ctrl-C now leaves the loop cleanly.
try:
    while True:
        context = listen_print_loop(pipe(mic))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        # The zero-shot pipeline returns labels/scores sorted best-first.
        top_label = output['labels'][0]
        top_score = output['scores'][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")
except KeyboardInterrupt:
    print("\nStopping voice-command loop.")
| |
|
|
def transcribe(audio_path):
    """Gradio callback: run ASR on a recorded audio file and return its text.

    FIX: `transcribe` was passed to gr.Interface but never defined anywhere,
    so building the interface raised a NameError before launch.
    """
    if audio_path is None:
        # User submitted without recording anything.
        return ""
    return pipe(audio_path)["text"]


iface = gr.Interface(
    fn=transcribe,
    # FIX: `gr.inputs.Audio(source=...)` is the removed pre-4.x API; the
    # current component is gr.Audio with a `sources` list.
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)


iface.launch()