# syncmaster2 / audio_processor.py
# aseelflihan's picture
# fix
# e879d8d
# audio_processor.py - Enhanced with AI Translation Support
import os
from dotenv import load_dotenv
import tempfile
from typing import List, Dict, Optional, Tuple
import json
import traceback
# --- DEFINITIVE NUMBA FIX ---
# This MUST be done BEFORE importing librosa
os.environ["NUMBA_CACHE_DIR"] = "/tmp"
# Now, import librosa safely
import librosa
# --- END OF FIX ---
import google.generativeai as genai
from translator import AITranslator
from google.api_core import exceptions as google_exceptions
class AudioProcessor:
    """Transcribe audio through the translator's model and derive word timings.

    Word timestamps are *estimates*: the words of the transcription are
    spread evenly across the clip duration, they are not aligned to the
    audio signal.

    Attributes:
        translator: The ``AITranslator`` instance, or ``None`` when it could
            not be created.
        init_error: Text describing why translator initialisation failed
            (or the translator's own ``init_error``), ``None`` otherwise.
    """

    # Seconds kept free at the start and at the end of a clip when words are
    # spread over it. Only applied to clips longer than twice this value.
    _EDGE_PADDING_SECONDS = 0.5

    def __init__(self):
        self.translator = None
        self.init_error = None
        self._initialize_translator()

    def _initialize_translator(self):
        """Initialize AI translator for multi-language support.

        Never raises: any failure is printed, recorded in ``self.init_error``
        and leaves ``self.translator`` set to ``None``.
        """
        try:
            self.translator = AITranslator()
            if self.translator.init_error:
                # Keep the reason available to callers, not only on stdout.
                self.init_error = str(self.translator.init_error)
                print(f"--- WARNING: Translator has initialization error: {self.translator.init_error} ---")
        except Exception as e:
            self.init_error = str(e)
            print(f"--- WARNING: Translator initialization failed: {str(e)} ---")
            self.translator = None

    @staticmethod
    def _delete_uploaded_file(uploaded_file) -> None:
        """Best-effort removal of a file previously sent with ``genai.upload_file``."""
        try:
            genai.delete_file(uploaded_file.name)
        except Exception as e:
            # Cleanup must never turn a finished transcription into a failure.
            print(f"--- WARNING: Could not delete uploaded audio file: {str(e)} ---")

    def transcribe_audio(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Transcribe an audio file with the translator's generative model.

        Args:
            audio_file_path: Path to the audio file on disk.

        Returns:
            ``(text, None)`` on success, ``(None, error_message)`` otherwise.
            Errors are reported through the message; nothing is raised for
            upload/generation failures.
        """
        if not self.translator or not self.translator.model:
            return None, "--- ERROR: Translator model is not available for transcription. ---"

        uploaded_file = None
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for transcription not found at: {audio_file_path} ---"

            uploaded_file = genai.upload_file(path=audio_file_path)
            prompt = "Transcribe this audio file accurately. Provide only the text content."
            response = self.translator.model.generate_content([prompt, uploaded_file])

            if response and hasattr(response, 'text') and response.text:
                return response.text.strip(), None
            return None, "--- WARNING: Gemini returned an empty response for transcription. ---"
        except google_exceptions.ResourceExhausted:
            return None, (
                "--- QUOTA ERROR: You have exceeded the daily free usage limit for the AI service. "
                "Please wait for your quota to reset (usually within 24 hours) or upgrade your "
                "Google AI plan. ---"
            )
        except Exception:
            return None, f"--- FATAL ERROR during transcription: {traceback.format_exc()} ---"
        finally:
            # The upload is only needed for this one request; remove it so it
            # does not linger in remote storage.
            if uploaded_file is not None:
                self._delete_uploaded_file(uploaded_file)

    def get_audio_duration(self, audio_file_path: str) -> Tuple[Optional[float], Optional[str]]:
        """Measure the duration of an audio file with librosa.

        Args:
            audio_file_path: Path to the audio file on disk.

        Returns:
            ``(duration_seconds, None)`` on success, ``(None, error_message)``
            when the file is missing, librosa fails, or the reported duration
            is ``None`` or shorter than 0.1 s.
        """
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for duration not found at: {audio_file_path} ---"

            duration = librosa.get_duration(path=audio_file_path)
            if duration is None or duration < 0.1:
                return None, f"--- ERROR: librosa returned an invalid duration: {duration}s ---"
            return duration, None
        except Exception:
            return None, f"--- FATAL ERROR getting audio duration with librosa: {traceback.format_exc()} ---"

    def get_word_timestamps(self, audio_file_path: str) -> Tuple[List[Dict], List[str]]:
        """Transcribe the audio and assign each word an estimated time span.

        Words are distributed evenly over the clip. Clips longer than 1 s keep
        0.5 s free at both ends; shorter clips use their whole duration so no
        timestamp can fall beyond the end of the audio.

        Args:
            audio_file_path: Path to the audio file on disk.

        Returns:
            ``(timestamps, log_messages)`` where each timestamp is a dict with
            ``'word'``, ``'start'`` and ``'end'`` (seconds, rounded to 3
            decimals). ``timestamps`` is empty when any step fails; the reason
            is the last log message.
        """
        logs = ["--- INFO: Starting get_word_timestamps... ---"]

        transcription, error = self.transcribe_audio(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Transcription successful. Text: '{transcription[:50]}...'")

        audio_duration, error = self.get_audio_duration(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Audio duration successful. Duration: {audio_duration:.2f}s")

        words = transcription.split()
        if not words:
            logs.append("--- WARNING: Transcription resulted in zero words. ---")
            return [], logs
        logs.append(f"--- INFO: Distributing {len(words)} words across the duration. ---")

        # Padding is only affordable when the clip is longer than both margins.
        padding = self._EDGE_PADDING_SECONDS
        if audio_duration <= 2 * padding:
            padding = 0.0
        usable_duration = audio_duration - 2 * padding
        seconds_per_word = usable_duration / len(words)

        word_timestamps = [
            {
                'word': word,
                'start': round(padding + (i * seconds_per_word), 3),
                'end': round(padding + ((i + 1) * seconds_per_word), 3),
            }
            for i, word in enumerate(words)
        ]

        logs.append(f"--- SUCCESS: Generated {len(word_timestamps)} word timestamps. ---")
        return word_timestamps, logs

    def get_word_timestamps_with_translation(self, audio_file_path: str, target_language: str = 'ar') -> Tuple[Dict, List[str]]:
        """Transcribe the audio and translate the transcription.

        Args:
            audio_file_path: Path to audio file
            target_language: Target language for translation ('ar' for Arabic)

        Returns:
            Tuple of (result_dict, log_messages). ``result_dict`` is empty
            when no transcription could be produced, otherwise it contains: {
                'original_text': str,
                'translated_text': str,
                'word_timestamps': List[Dict],
                'translated_timestamps': List[Dict],
                'language_detected': str,
                'target_language': str,
                'translation_success': bool
            }
            Whenever translation is unavailable or fails, the translated
            fields fall back to the original text and timestamps.
        """
        logs = ["--- INFO: Starting enhanced transcription with translation... ---"]

        # Get original transcription and timestamps
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        if not word_timestamps:
            logs.append("--- ERROR: No transcription available for translation ---")
            return {}, logs

        original_text = " ".join(entry['word'] for entry in word_timestamps)
        logs.append(f"--- INFO: Original transcription: '{original_text[:50]}...' ---")

        result = {
            'original_text': original_text,
            'translated_text': '',
            'word_timestamps': word_timestamps,
            'translated_timestamps': [],
            'language_detected': 'unknown',
            'target_language': target_language,
            'translation_success': False,
        }

        if not self.translator:
            logs.append("--- WARNING: Translator not available, returning original text only ---")
            # Same fallback as the failure paths below: original text AND timings.
            result['translated_text'] = original_text
            result['translated_timestamps'] = word_timestamps
            return result, logs

        try:
            # Detect original language
            detected_lang, detect_error = self.translator.detect_language(original_text)
            if detected_lang and detected_lang != 'unknown':
                result['language_detected'] = detected_lang
                logs.append(f"--- INFO: Detected language: {detected_lang} ---")
            else:
                logs.append(f"--- WARNING: Language detection failed: {detect_error} ---")

            # Translate the text
            translated_text, translation_error = self.translator.translate_text(
                original_text,
                target_language=target_language,
            )

            if translated_text:
                result['translated_text'] = translated_text
                result['translation_success'] = True
                logs.append(f"--- SUCCESS: Translation completed: '{translated_text[:50]}...' ---")

                # Create translated timestamps by mapping words
                translated_timestamps = self._create_translated_timestamps(
                    word_timestamps,
                    original_text,
                    translated_text,
                )
                result['translated_timestamps'] = translated_timestamps
                logs.append(f"--- INFO: Created {len(translated_timestamps)} translated timestamps ---")
            else:
                logs.append(f"--- ERROR: Translation failed: {translation_error} ---")
                result['translated_text'] = original_text  # Fallback to original
                result['translated_timestamps'] = word_timestamps  # Use original timestamps
        except Exception:
            logs.append(f"--- FATAL ERROR during translation process: {traceback.format_exc()} ---")
            result['translated_text'] = original_text  # Fallback
            result['translated_timestamps'] = word_timestamps

        return result, logs

    def _create_translated_timestamps(self, original_timestamps: List[Dict], original_text: str, translated_text: str) -> List[Dict]:
        """Create timestamps for translated text by proportional mapping.

        The translated words are spread evenly between the start of the first
        original word and the end of the last one.

        Args:
            original_timestamps: Original word timestamps
            original_text: Original transcribed text (not used by the mapping;
                kept for interface compatibility)
            translated_text: Translated text

        Returns:
            List of translated word timestamps; empty when either input is
            empty or when the mapping fails.
        """
        try:
            translated_words = translated_text.split()
            if not translated_words or not original_timestamps:
                return []

            start_time = original_timestamps[0]['start']
            total_duration = original_timestamps[-1]['end'] - start_time
            word_count = len(translated_words)

            return [
                {
                    'word': word,
                    'start': round(start_time + (i * total_duration / word_count), 3),
                    'end': round(start_time + ((i + 1) * total_duration / word_count), 3),
                }
                for i, word in enumerate(translated_words)
            ]
        except Exception as e:
            print(f"--- ERROR creating translated timestamps: {str(e)} ---")
            return []

    def batch_translate_transcription(self, audio_file_path: str, target_languages: List[str]) -> Tuple[Dict, List[str]]:
        """Transcribe audio and translate to multiple languages.

        Args:
            audio_file_path: Path to audio file
            target_languages: List of target language codes

        Returns:
            Tuple of (results_dict, log_messages). ``results_dict`` is empty
            when no transcription could be produced. Otherwise it holds
            ``'original'`` (text, timestamps, language) and ``'translations'``,
            a dict keyed by language code. Every attempted language gets an
            entry; failed ones carry ``'success': False``, an ``'error'`` and
            the original text/timestamps as fallback.
        """
        logs = ["--- INFO: Starting batch translation process... ---"]

        # Get original transcription
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        if not word_timestamps:
            return {}, logs

        original_text = " ".join(entry['word'] for entry in word_timestamps)

        results = {
            'original': {
                'text': original_text,
                'timestamps': word_timestamps,
                'language': 'detected',
            },
            'translations': {},
        }

        if not self.translator:
            logs.append("--- WARNING: Translator not available for batch translation ---")
            return results, logs

        # Translate to each target language
        for lang_code in target_languages:
            try:
                translated_text, error = self.translator.translate_text(original_text, lang_code)
                if translated_text:
                    translated_timestamps = self._create_translated_timestamps(
                        word_timestamps, original_text, translated_text
                    )
                    results['translations'][lang_code] = {
                        'text': translated_text,
                        'timestamps': translated_timestamps,
                        'success': True,
                    }
                    logs.append(f"--- SUCCESS: Translation to {lang_code} completed ---")
                else:
                    results['translations'][lang_code] = {
                        'text': original_text,
                        'timestamps': word_timestamps,
                        'success': False,
                        'error': error,
                    }
                    logs.append(f"--- ERROR: Translation to {lang_code} failed: {error} ---")
            except Exception as e:
                # Record the failure so this language is not silently missing
                # from the results.
                results['translations'][lang_code] = {
                    'text': original_text,
                    'timestamps': word_timestamps,
                    'success': False,
                    'error': str(e),
                }
                logs.append(f"--- FATAL ERROR translating to {lang_code}: {str(e)} ---")

        return results, logs