Spaces:
Running on Zero
Update app.py
app.py
CHANGED
@@ -84,6 +84,37 @@ def decode_codes_to_audio(merged_codes):
     return audio[0, 0]
 
 
+whisper_model = None
+
+
+def get_whisper_model():
+    global whisper_model
+    if whisper_model is None:
+        from faster_whisper import WhisperModel
+        whisper_model = WhisperModel("large-v3", device="cuda", compute_type="int8")
+    return whisper_model
+
+
+@spaces.GPU(duration=60)
+def transcribe_audio(audio_path):
+    if audio_path is None:
+        raise gr.Error("Please upload a reference audio file first.")
+    try:
+        gr.Info("Transcribing audio with Whisper large-v3...")
+        model = get_whisper_model()
+        segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
+        text = " ".join(seg.text.strip() for seg in segments).strip()
+        if not text:
+            raise gr.Error("Whisper could not detect any speech in the audio.")
+        gr.Info(f"Detected language: {info.language} ({info.language_probability:.0%} confidence)")
+        return text
+    except gr.Error:
+        raise
+    except Exception as e:
+        traceback.print_exc()
+        raise gr.Error(f"Transcription error: {str(e)}")
+
+
 def estimate_duration(text):
     words = len(text.split())
     seconds = max(5, int(words * 0.4))
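The new get_whisper_model / transcribe_audio helpers lean on faster-whisper's lazy transcription API: transcribe() returns a generator of segments plus an info object carrying the detected language, so the text only materializes once the segments are joined. A minimal standalone sketch of that call pattern, with "sample.wav" as a purely illustrative placeholder path:

# Sketch of the faster-whisper pattern used in transcribe_audio above.
# "sample.wav" is a hypothetical placeholder, not a file in this repo.
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="int8")

# transcribe() yields Segment objects lazily and returns a TranscriptionInfo
# object with the detected language and its probability.
segments, info = model.transcribe("sample.wav", beam_size=5, vad_filter=True)
text = " ".join(seg.text.strip() for seg in segments).strip()
print(f"{info.language} ({info.language_probability:.0%}): {text}")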
@@ -209,9 +240,10 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
                 "The model will clone that voice for synthesis. Language is inferred automatically."
             )
             ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+            transcribe_btn = gr.Button("🎤 Auto-transcribe with Whisper", variant="secondary", size="sm")
             ref_text = gr.Textbox(
                 label="Reference Audio Transcription",
-                placeholder="Exact transcription of the reference audio...",
+                placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
             )
 
         with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -280,6 +312,12 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
         cache_examples=False,
     )
 
+    transcribe_btn.click(
+        fn=transcribe_audio,
+        inputs=[ref_audio],
+        outputs=[ref_text],
+    )
+
     generate_btn.click(
         fn=tts_inference,
         inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],
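The wiring follows the standard Gradio Blocks pattern: the button's click handler reads the audio component's current filepath and writes the returned transcription into the reference-text textbox, which tts_inference then consumes unchanged. A minimal self-contained sketch of that pattern (component names and the echo function are illustrative, not taken from app.py):

# Minimal Blocks demo showing the same Button.click wiring used above.
# fill_textbox is a stand-in for transcribe_audio.
import gradio as gr

def fill_textbox(audio_path):
    # Placeholder for the real Whisper call; just echoes the received path.
    return f"received: {audio_path}"

with gr.Blocks() as demo:
    audio = gr.Audio(label="Reference Audio", type="filepath")
    btn = gr.Button("Auto-transcribe")
    out = gr.Textbox(label="Transcription")
    # Clicking the button passes the audio filepath to fill_textbox and
    # places the returned string into the textbox.
    btn.click(fn=fill_textbox, inputs=[audio], outputs=[out])

if __name__ == "__main__":
    demo.launch()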