| """ |
| Audio Transcription Tool - Whisper speech-to-text |
| Author: @mangubee |
| Date: 2026-01-13 |
| |
| Provides audio transcription using OpenAI Whisper: |
| - Supports MP3, WAV, M4A, and other audio formats |
| - ZeroGPU acceleration via @spaces.GPU decorator |
| - Model caching for efficient repeated use |
| - Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files) |
| |
| Requirements: |
| - openai-whisper: pip install openai-whisper |
| - ZeroGPU: @spaces.GPU decorator required for HF Spaces |
| """ |
|
|
import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path


# Whisper configuration: "small" trades some accuracy for faster load and
# lower memory; language is pinned so Whisper skips its detection pass.
WHISPER_MODEL = "small"
WHISPER_LANGUAGE = "en"
# File extensions accepted by transcribe_audio (compared case-insensitively
# against Path.suffix).
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]


logger = logging.getLogger(__name__)


# Lazily loaded Whisper model, cached across transcribe_audio() calls.
# Reset via cleanup().
_MODEL = None


# On Hugging Face Spaces the `spaces` package supplies the @GPU decorator,
# which attaches a ZeroGPU slice for the duration of the decorated call.
# Outside Spaces, fall back to a no-op decorator so the module still imports
# and runs on CPU.
try:
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    def GPU(func):
        # No-op stand-in; supports only bare @GPU usage (as used below).
        return func
    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")
|
|
|
|
| |
| |
| |
|
|
@GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe an audio file using Whisper (ZeroGPU accelerated).

    All failures (empty path, missing file, unsupported format, model or
    transcription errors) are reported through the returned dict rather than
    raised, so callers can handle every outcome uniformly.

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)

    Returns:
        Dict with structure: {
            "text": str,          # Transcribed text ("" on failure)
            "file_path": str,     # Original file path
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }

    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    # Guard: an empty/None path cannot be resolved.
    if not file_path:
        logger.error("Empty file path provided")
        return {
            "text": "",
            "file_path": "",
            "success": False,
            "error": "Empty file path provided"
        }

    file_path = Path(file_path)

    if not file_path.exists():
        logger.error(f"File not found: {file_path}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"File not found: {file_path}"
        }

    # Extension check is case-insensitive (".MP3" is accepted).
    if file_path.suffix.lower() not in AUDIO_FORMATS:
        logger.error(f"Unsupported audio format: {file_path.suffix}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Unsupported audio format: {file_path.suffix}. Supported: {AUDIO_FORMATS}"
        }

    logger.info(f"Transcribing audio: {file_path}")

    try:
        # Imported lazily so the module loads even if openai-whisper is
        # missing; the ImportError surfaces as a failed-result dict below.
        import whisper

        if _MODEL is None:
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            # Select the device at call time by probing CUDA via torch (a
            # hard dependency of whisper). The previous check trusted
            # ZERO_GPU_AVAILABLE, but the `spaces` package being importable
            # does not guarantee a GPU — load_model(device="cuda") would
            # crash on a GPU-less host. Inside a @spaces.GPU call, CUDA is
            # attached and this probe returns True.
            import torch
            device = "cuda" if torch.cuda.is_available() else "cpu"
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")

        # fp16=False keeps inference in full precision; also required on CPU,
        # where fp16 is unsupported.
        result = _MODEL.transcribe(
            str(file_path),
            language=WHISPER_LANGUAGE,
            fp16=False
        )

        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")

        return {
            "text": text,
            "file_path": str(file_path),
            "success": True,
            "error": None
        }

    except FileNotFoundError:
        # The file existed at the check above but vanished before/while
        # Whisper (via ffmpeg) opened it.
        logger.error(f"Audio file not found: {file_path}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Audio file not found: {file_path}"
        }
    except Exception as e:
        # Catch-all boundary: model download failures, decode errors, missing
        # openai-whisper, etc. are converted into a failed-result dict.
        logger.error(f"Transcription failed: {e}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Transcription failed: {str(e)}"
        }
|
|
|
|
| |
| |
| |
|
|
def cleanup():
    """Drop the cached Whisper model so the next call reloads it.

    Primarily useful in tests that need a fresh model state; the next
    transcribe_audio() call will re-run whisper.load_model().
    """
    global _MODEL
    # Release our reference; the model object is freed once no caller holds it.
    _MODEL = None
    logger.info("Whisper model cache cleared")
|
|