artificialguybr committed on
Commit
b50edfb
·
verified ·
1 Parent(s): fa9104e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -1
app.py CHANGED
@@ -84,6 +84,37 @@ def decode_codes_to_audio(merged_codes):
84
  return audio[0, 0]
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  def estimate_duration(text):
88
  words = len(text.split())
89
  seconds = max(5, int(words * 0.4))
@@ -209,9 +240,10 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
209
  "The model will clone that voice for synthesis. Language is inferred automatically."
210
  )
211
  ref_audio = gr.Audio(label="Reference Audio", type="filepath")
 
212
  ref_text = gr.Textbox(
213
  label="Reference Audio Transcription",
214
- placeholder="Exact transcription of the reference audio...",
215
  )
216
 
217
  with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -280,6 +312,12 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
280
  cache_examples=False,
281
  )
282
 
 
 
 
 
 
 
283
  generate_btn.click(
284
  fn=tts_inference,
285
  inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],
 
84
  return audio[0, 0]
85
 
86
 
87
+ whisper_model = None
88
+
89
+
90
+ def get_whisper_model():
91
+ global whisper_model
92
+ if whisper_model is None:
93
+ from faster_whisper import WhisperModel
94
+ whisper_model = WhisperModel("large-v3", device="cuda", compute_type="int8")
95
+ return whisper_model
96
+
97
+
98
+ @spaces.GPU(duration=60)
99
+ def transcribe_audio(audio_path):
100
+ if audio_path is None:
101
+ raise gr.Error("Please upload a reference audio file first.")
102
+ try:
103
+ gr.Info("Transcribing audio with Whisper large-v3...")
104
+ model = get_whisper_model()
105
+ segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
106
+ text = " ".join(seg.text.strip() for seg in segments).strip()
107
+ if not text:
108
+ raise gr.Error("Whisper could not detect any speech in the audio.")
109
+ gr.Info(f"Detected language: {info.language} ({info.language_probability:.0%} confidence)")
110
+ return text
111
+ except gr.Error:
112
+ raise
113
+ except Exception as e:
114
+ traceback.print_exc()
115
+ raise gr.Error(f"Transcription error: {str(e)}")
116
+
117
+
118
  def estimate_duration(text):
119
  words = len(text.split())
120
  seconds = max(5, int(words * 0.4))
 
240
  "The model will clone that voice for synthesis. Language is inferred automatically."
241
  )
242
  ref_audio = gr.Audio(label="Reference Audio", type="filepath")
243
+ transcribe_btn = gr.Button("🎤 Auto-transcribe with Whisper", variant="secondary", size="sm")
244
  ref_text = gr.Textbox(
245
  label="Reference Audio Transcription",
246
+ placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
247
  )
248
 
249
  with gr.Accordion("⚙️ Advanced Settings", open=False):
 
312
  cache_examples=False,
313
  )
314
 
315
+ transcribe_btn.click(
316
+ fn=transcribe_audio,
317
+ inputs=[ref_audio],
318
+ outputs=[ref_text],
319
+ )
320
+
321
  generate_btn.click(
322
  fn=tts_inference,
323
  inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],