| """ |
| Advanced Speech Recognition Module for Multilingual Audio Intelligence System |
| |
| This module implements state-of-the-art automatic speech recognition using openai-whisper |
| with integrated language identification capabilities. Designed for maximum performance |
| on CPU-constrained environments while maintaining SOTA accuracy. |
| |
| Key Features: |
| - OpenAI Whisper with optimized backend for speed improvement |
| - Integrated Language Identification (no separate LID module needed) |
| - VAD-based batching for real-time performance on CPU |
| - Word-level timestamps for interactive UI synchronization |
| - Robust error handling and multilingual support |
| - CPU and GPU optimization paths |
| |
| Model: openai/whisper-small (optimized for speed/accuracy balance) |
Dependencies: openai-whisper, torch, numpy, librosa (audio loading/resampling)
| """ |
|
|
| import os |
| import logging |
| import warnings |
| import numpy as np |
| import torch |
| from typing import List, Dict, Optional, Tuple, Union |
| import tempfile |
| from dataclasses import dataclass |
| import time |
|
|
# Optional-dependency guard: the module stays importable without whisper so
# callers can inspect WHISPER_AVAILABLE before constructing a recognizer;
# SpeechRecognizer._initialize_model raises ImportError if it is False.
try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False
    logging.warning("openai-whisper not available. Install with: pip install openai-whisper")
|
|
| |
# Module-level logger. basicConfig is a no-op if the host application has
# already configured the root logger, so this is safe to run on import.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Suppress UserWarning/FutureWarning noise.
# NOTE(review): filterwarnings is process-wide, not module-scoped — this also
# hides such warnings raised by every other module in the application.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
|
|
|
|
@dataclass
class TranscriptionSegment:
    """
    Data class representing a transcribed speech segment with rich metadata.

    Times are in seconds from the start of the audio. ``words`` and
    ``word_timestamps`` carry the same word-level dicts — the producers in
    this module assign one list to both fields; both are kept for
    backward compatibility with downstream consumers.
    """
    start: float  # segment start time (seconds)
    end: float  # segment end time (seconds)
    text: str  # transcribed text (whitespace-stripped by producers here)
    language: str  # detected language code, e.g. "en"; "unknown" if unavailable
    language_probability: float  # confidence of the language identification
    no_speech_probability: float  # Whisper's probability the segment is non-speech
    words: Optional[List[Dict]] = None  # word dicts: word, start, end, probability
    speaker_id: Optional[str] = None  # diarization speaker label, if assigned
    confidence: Optional[float] = None  # set to 1 - no_speech_probability by producers here
    word_timestamps: Optional[List[Dict]] = None  # same list as ``words`` (compatibility)
|
|
|
|
class SpeechRecognizer:
    """
    Advanced Speech Recognition Engine using OpenAI Whisper.

    Provides transcription with integrated language identification, for both
    CPU and GPU environments.

    Notes:
        - ``compute_type`` is accepted for interface compatibility but is
          stored without effect: openai-whisper exposes no precision knob.
        - All audio is converted to 16 kHz mono float32 before inference,
          the input format Whisper expects.
    """

    def __init__(self, model_size: str = "small", device: str = "auto",
                 compute_type: str = "int8", language: Optional[str] = None):
        """
        Initialize the Speech Recognizer.

        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
            device: Device to use (auto, cpu, cuda)
            compute_type: Computation precision hint (kept for API
                compatibility; unused by openai-whisper)
            language: Target language code (None for auto-detection)

        Raises:
            ImportError: If openai-whisper is not installed.
        """
        self.model_size = model_size
        self.device = self._determine_device(device)
        self.compute_type = compute_type  # stored but unused (see class docstring)
        self.language = language
        self.model = None
        self._initialize_model()

    def _determine_device(self, device: str) -> str:
        """Resolve "auto" to the best available device; pass others through."""
        if device == "auto":
            if torch.cuda.is_available():
                return "cuda"
            if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                return "mps"
            return "cpu"
        return device

    def _initialize_model(self):
        """Load the Whisper model onto the resolved device.

        Raises:
            ImportError: If openai-whisper is missing.
            Exception: Propagates any model-loading failure after logging it.
        """
        if not WHISPER_AVAILABLE:
            raise ImportError("openai-whisper is required. Install with: pip install openai-whisper")

        try:
            logger.info(f"Loading {self.model_size} Whisper model...")
            self.model = whisper.load_model(self.model_size, device=self.device)
            logger.info(f"Speech recognition models loaded on {self.device}")
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise

    # ------------------------------------------------------------------
    # Internal helpers shared by the public transcribe_* entry points
    # ------------------------------------------------------------------

    @staticmethod
    def _resample_to_16k(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        """Return audio as 16 kHz float32, resampling with librosa if needed."""
        if sample_rate != 16000:
            import librosa  # local import: librosa is only required when resampling
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
        # Whisper converts the array via torch.from_numpy and expects float32;
        # float64 input (common from some loaders) would fail inside the model.
        return np.ascontiguousarray(audio_data, dtype=np.float32)

    @staticmethod
    def _extract_words(segment: Dict) -> List[Dict]:
        """Normalize the word-level entries of a raw Whisper segment dict."""
        return [
            {
                "word": word["word"],
                "start": word["start"],
                "end": word["end"],
                "probability": word.get("probability", 1.0),
            }
            for word in segment.get("words", [])
        ]

    @staticmethod
    def _match_speaker(start: float, end: float,
                       speaker_segments: List[Tuple[float, float, str]]) -> str:
        """
        Pick the speaker whose diarization turn overlaps [start, end] the most.

        The previous strict-containment rule labeled any transcription segment
        that straddled a turn boundary as "Unknown"; maximum temporal overlap
        is more robust and agrees with containment whenever it applies.
        Returns "Unknown" when no turn overlaps the segment.
        """
        best_id = "Unknown"
        best_overlap = 0.0
        for seg_start, seg_end, spk_id in speaker_segments:
            overlap = min(end, seg_end) - max(start, seg_start)
            if overlap > best_overlap:
                best_overlap = overlap
                best_id = spk_id
        return best_id

    def _build_segment(self, segment: Dict, result: Dict, words: List[Dict],
                       speaker_id: Optional[str]) -> "TranscriptionSegment":
        """Convert one raw Whisper segment dict into a TranscriptionSegment."""
        no_speech = segment.get("no_speech_prob", 0.0)
        return TranscriptionSegment(
            start=segment["start"],
            end=segment["end"],
            text=segment["text"].strip(),
            language=result.get("language", "unknown"),
            # transcribe() results carry no "language_probability" key in
            # openai-whisper, so this falls back to 1.0; use detect_language()
            # when a real probability is needed.
            language_probability=result.get("language_probability", 1.0),
            no_speech_probability=no_speech,
            words=words,
            speaker_id=speaker_id,
            confidence=1.0 - no_speech,
            word_timestamps=words,  # duplicated for downstream compatibility
        )

    def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000,
                        language: Optional[str] = None,
                        initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
        """
        Transcribe audio data with language identification.

        Args:
            audio_data: Mono audio samples as a numpy array
            sample_rate: Sample rate of the audio (resampled to 16 kHz if different)
            language: Language code (None falls back to the instance-level
                ``language``, then to auto-detection)
            initial_prompt: Optional prompt to bias the decoder

        Returns:
            List of TranscriptionSegment objects with word-level timestamps

        Raises:
            RuntimeError: If the model is not initialized.
        """
        if self.model is None:
            raise RuntimeError("Model not initialized")

        try:
            audio_data = self._resample_to_16k(audio_data, sample_rate)
            result = self.model.transcribe(
                audio_data,
                language=language or self.language,
                initial_prompt=initial_prompt,
                word_timestamps=True,
                verbose=False
            )
            return [
                self._build_segment(seg, result, self._extract_words(seg), None)
                for seg in result["segments"]
            ]
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise

    def transcribe_file(self, file_path: str, language: Optional[str] = None,
                       initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
        """
        Transcribe an audio file.

        Args:
            file_path: Path to the audio file (any format librosa can decode)
            language: Language code (None for auto-detection)
            initial_prompt: Optional prompt to bias the decoder

        Returns:
            List of TranscriptionSegment objects
        """
        try:
            import librosa  # local import keeps librosa optional at module load
            # librosa resamples on load, so sample_rate is already 16000 here.
            audio_data, sample_rate = librosa.load(file_path, sr=16000)
            return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt)
        except Exception as e:
            logger.error(f"File transcription failed: {e}")
            raise

    def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int,
                           speaker_segments: List[Tuple[float, float, str]],
                           word_timestamps: bool = True) -> List["TranscriptionSegment"]:
        """
        Transcribe audio and attach speaker labels from a diarization pass.

        Args:
            audio_data: Mono audio samples as a numpy array
            sample_rate: Sample rate of the audio
            speaker_segments: List of (start_time, end_time, speaker_id) tuples
            word_timestamps: Whether to include word-level timestamps

        Returns:
            List of TranscriptionSegment objects; each segment is labeled with
            the speaker whose turn overlaps it the most, or "Unknown" if none.

        Raises:
            RuntimeError: If the model is not initialized.
        """
        if self.model is None:
            raise RuntimeError("Model not initialized")

        try:
            audio_data = self._resample_to_16k(audio_data, sample_rate)
            result = self.model.transcribe(
                audio_data,
                language=self.language,
                word_timestamps=word_timestamps,
                verbose=False
            )
            segments = []
            for seg in result["segments"]:
                speaker_id = self._match_speaker(seg["start"], seg["end"], speaker_segments)
                words = self._extract_words(seg) if word_timestamps else []
                segments.append(self._build_segment(seg, result, words, speaker_id))
            return segments
        except Exception as e:
            logger.error(f"Segment transcription failed: {e}")
            raise

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""
        return [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
        ]

    def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]:
        """
        Detect the language of audio data.

        Uses Whisper's dedicated language-identification head on the first 30
        seconds of audio instead of running a full transcription pass. (The
        previous implementation read ``result["language_probability"]``, a key
        openai-whisper never emits, so the returned confidence was always 0.0.)

        Args:
            audio_data: Mono audio samples as a numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Tuple of (language_code, confidence); ("unknown", 0.0) on failure
        """
        try:
            audio_data = self._resample_to_16k(audio_data, sample_rate)
            # pad_or_trim fixes the input to Whisper's 30-second window.
            # NOTE(review): the n_mels argument requires a recent openai-whisper
            # release; it only matters for large-v3 (128 mel bins) — confirm
            # the pinned package version supports it.
            mel = whisper.log_mel_spectrogram(
                whisper.pad_or_trim(audio_data),
                n_mels=self.model.dims.n_mels,
            ).to(self.model.device)
            _, probs = self.model.detect_language(mel)
            language = max(probs, key=probs.get)
            return language, probs[language]
        except Exception as e:
            logger.error(f"Language detection failed: {e}")
            return "unknown", 0.0
|
|
|
|
def create_speech_recognizer(model_size: str = "small", device: str = "auto",
                            compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
    """
    Build and return a ready-to-use SpeechRecognizer.

    Args:
        model_size: Whisper model size (tiny, base, small, medium, large)
        device: Device selector ("auto", "cpu", "cuda")
        compute_type: Computation precision hint (int8, float16, float32)
        language: Target language code, or None for auto-detection

    Returns:
        A SpeechRecognizer configured with the given options
    """
    return SpeechRecognizer(
        model_size=model_size,
        device=device,
        compute_type=compute_type,
        language=language,
    )