| """ |
| Audio Transcription Tool - Whisper speech-to-text |
| Author: @mangubee |
| Date: 2026-01-13 |
| |
| Provides audio transcription using OpenAI Whisper: |
| - Supports MP3, WAV, M4A, and other audio formats |
| - ZeroGPU acceleration via @spaces.GPU decorator |
| - Model caching for efficient repeated use |
| - Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files) |
| |
| Requirements: |
| - openai-whisper: pip install openai-whisper |
| - ZeroGPU: @spaces.GPU decorator required for HF Spaces |
| """ |
|
|
import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path


# Whisper configuration: "small" trades some accuracy for faster load and
# lower memory; language is pinned so Whisper skips its detection pass.
WHISPER_MODEL = "small"
WHISPER_LANGUAGE = "en"
# File extensions accepted by transcribe_audio (compared case-insensitively
# against Path.suffix).
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]


logger = logging.getLogger(__name__)


# Lazily loaded Whisper model, cached across transcribe_audio() calls.
# Reset via cleanup().
_MODEL = None


# On Hugging Face Spaces the `spaces` package supplies the @GPU decorator,
# which attaches a ZeroGPU slice for the duration of the decorated call.
# Outside Spaces, fall back to a no-op decorator so the module still imports
# and runs on CPU.
try:
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    def GPU(func):
        # No-op stand-in; supports only bare @GPU usage (as used below).
        return func
    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")
|
|
|
|
| |
| |
| |
|
|
@GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe an audio file using Whisper (ZeroGPU accelerated).

    All failures (empty path, missing file, unsupported format, model or
    transcription errors) are reported through the returned dict rather than
    raised, so callers can handle every outcome uniformly.

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)

    Returns:
        Dict with structure: {
            "text": str,          # Transcribed text ("" on failure)
            "file_path": str,     # Original file path
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }

    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    # Guard: an empty/None path cannot be resolved.
    if not file_path:
        logger.error("Empty file path provided")
        return {
            "text": "",
            "file_path": "",
            "success": False,
            "error": "Empty file path provided"
        }

    file_path = Path(file_path)

    if not file_path.exists():
        logger.error(f"File not found: {file_path}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"File not found: {file_path}"
        }

    # Extension check is case-insensitive (".MP3" is accepted).
    if file_path.suffix.lower() not in AUDIO_FORMATS:
        logger.error(f"Unsupported audio format: {file_path.suffix}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Unsupported audio format: {file_path.suffix}. Supported: {AUDIO_FORMATS}"
        }

    logger.info(f"Transcribing audio: {file_path}")

    try:
        # Imported lazily so the module loads even if openai-whisper is
        # missing; the ImportError surfaces as a failed-result dict below.
        import whisper

        if _MODEL is None:
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            # Select the device at call time by probing CUDA via torch (a
            # hard dependency of whisper). The previous check trusted
            # ZERO_GPU_AVAILABLE, but the `spaces` package being importable
            # does not guarantee a GPU — load_model(device="cuda") would
            # crash on a GPU-less host. Inside a @spaces.GPU call, CUDA is
            # attached and this probe returns True.
            import torch
            device = "cuda" if torch.cuda.is_available() else "cpu"
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")

        # fp16=False keeps inference in full precision; also required on CPU,
        # where fp16 is unsupported.
        result = _MODEL.transcribe(
            str(file_path),
            language=WHISPER_LANGUAGE,
            fp16=False
        )

        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")

        return {
            "text": text,
            "file_path": str(file_path),
            "success": True,
            "error": None
        }

    except FileNotFoundError:
        # The file existed at the check above but vanished before/while
        # Whisper (via ffmpeg) opened it.
        logger.error(f"Audio file not found: {file_path}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Audio file not found: {file_path}"
        }
    except Exception as e:
        # Catch-all boundary: model download failures, decode errors, missing
        # openai-whisper, etc. are converted into a failed-result dict.
        logger.error(f"Transcription failed: {e}")
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Transcription failed: {str(e)}"
        }
|
|
|
|
| |
| |
| |
|
|
def cleanup():
    """Drop the cached Whisper model so the next call reloads it.

    Primarily useful in tests that need a fresh model state; the next
    transcribe_audio() call will re-run whisper.load_model().
    """
    global _MODEL
    # Release our reference; the model object is freed once no caller holds it.
    _MODEL = None
    logger.info("Whisper model cache cleared")
|
|