"""Generate synthetic (AI) speech samples with Edge TTS and Google TTS.

For every language in ``LANGUAGES`` the script synthesises the sentences of
``TEXT_CORPUS`` into MP3 files under ``RAW_AI_DIR/<lang_code>/`` and writes a
metadata table describing the generated files to ``RAW_AI_DIR/ai_samples.csv``.
"""
import asyncio
import os
import sys

import pandas as pd
import soundfile as sf
import librosa
from gtts import gTTS
import edge_tts
from tqdm import tqdm

# Add src to path (must run before the ``src.config`` import below).
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from src.config import LANGUAGES, RAW_AI_DIR, SAMPLE_RATE

# Text prompts for generation (Mix of lengths and types)
# We will generate permutations or use a larger corpus if needed.
# For now, a small set repeated with different voices/speeds is a good start.
TEXT_CORPUS = {
    'en': [
        "The quick brown fox jumps over the lazy dog.",
        "Hello, this is an AI voice generation test.",
        "Artificial intelligence is transforming the world.",
        "Can you tell me the time please?",
        "I am not a human, but I sound like one.",
        "Verification code is one two three four.",
        "Open the pod bay doors, HAL.",
        "The weather today is sunny with a chance of rain.",
        "Please confirm your identity.",
        "This is a secure channel."
    ],
    'ta': [
        "வணக்கம், எப்படி இருக்கிறீர்கள்?",  # Hello, how are you?
        "இது ஒரு செயற்கை நுண்ணறிவு குரல் சோதனை.",  # This is an AI voice test.
        "தமிழ் உலகின் மூத்த மொழிகளில் ஒன்று.",  # Tamil is one of the oldest languages.
        "இன்று வானிலை மிக நன்றாக உள்ளது.",  # The weather is very good today.
        "தயவுசெய்து உங்கள் அடையாளத்தை உறுதிப்படுத்தவும்."  # Please verify your identity.
    ],
    'hi': [
        "नमस्ते, आप कैसे हैं?",  # Hello, how are you?
        "यह एक एआई आवाज़ परीक्षण है।",  # This is an AI voice test.
        "भारत एक विशाल देश है।",  # India is a huge country.
        "कृपया अपना पासवर्ड दर्ज करें।",  # Please enter your password.
        "मौसम आज बहुत सुहावना है।"  # The weather is very pleasant today.
    ],
    'ml': [
        "നമസ്കാരം, സുഖമാണോ?",  # Hello, are you fine?
        "ഇതൊരു നിർമ്മിത ബുദ്ധി പരീക്ഷണമാണ്.",  # This is an AI test.
        "കേരളം ദൈവത്തിന്റെ സ്വന്തം നാടാണ്.",  # Kerala is God's own country.
        "ദയവായി വാതിൽ തുറക്കൂ.",  # Please open the door.
        "ഇന്നത്തെ കാലാവസ്ഥ എങ്ങനെയുണ്ട്?"  # How is today's weather?
    ],
    'te': [
        "నమస్కారం, మీరు ఎలా ఉన్నారు?",  # Hello, how are you?
        "ఇది ఒక కృత్రిమ మేధస్సు పరీక్ష.",  # This is an AI test.
        "తెలుగు చాలా తీయని భాష.",  # Telugu is a very sweet language.
        "దయచేసి మీ పేరు చెప్పండి.",  # Please tell your name.
        "ఈ రోజు వర్షం పడే అవకాశం ఉంది."  # There is a chance of rain today.
    ]
}

# Edge TTS Voices Map (Approximate)
EDGE_VOICES = {
    'en': ['en-US-ChristopherNeural', 'en-US-JennyNeural', 'en-GB-SoniaNeural'],
    'ta': ['ta-IN-ValluvarNeural', 'ta-IN-PallaviNeural'],
    'hi': ['hi-IN-MadhurNeural', 'hi-IN-SwaraNeural'],
    'ml': ['ml-IN-MidhunNeural', 'ml-IN-SobhanaNeural'],
    'te': ['te-IN-MohanNeural', 'te-IN-ShrutiNeural']
}

# Column order of the metadata CSV. Passing it explicitly keeps the header
# present even when no sample could be generated at all.
_METADATA_COLUMNS = ['filename', 'language', 'path', 'source', 'voice_engine']


async def generate_edge_tts(text, voice, output_path):
    """Generate audio using Edge TTS.

    Args:
        text: Sentence to synthesise.
        voice: Edge TTS voice name (e.g. one of the ``EDGE_VOICES`` entries).
        output_path: Destination file path handed to ``Communicate.save``.
    """
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_path)


def generate_gtts(text, lang_code, output_path):
    """Generate audio using Google TTS.

    Args:
        text: Sentence to synthesise.
        lang_code: Language code passed to gTTS as ``lang``.
        output_path: Destination file path handed to ``gTTS.save``.
    """
    tts = gTTS(text=text, lang=lang_code, slow=False)
    tts.save(output_path)


def _make_record(fname, lang_code, fpath, source, voice_engine):
    """Build one metadata row for a generated sample."""
    return {
        'filename': fname,
        'language': lang_code,
        'path': fpath,
        'source': source,
        'voice_engine': voice_engine
    }


async def main():
    """Generate the AI samples for every language and save their metadata.

    Per language, up to ``target_per_lang // 2`` files are produced with Edge
    TTS (voices x texts), then a single pass over the texts is made with gTTS
    until ``target_per_lang`` is reached. A failed synthesis is reported on
    stdout and skipped; it does not abort the run.
    """
    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(RAW_AI_DIR, exist_ok=True)

    data_records = []

    target_per_lang = 50
    edge_quota = target_per_lang // 2  # Do half with Edge, half with gTTS

    for lang_code, lang_name in LANGUAGES.items():
        print(f"Generating AI samples for {lang_name} ({lang_code})...")
        lang_dir = os.path.join(RAW_AI_DIR, lang_code)
        os.makedirs(lang_dir, exist_ok=True)

        texts = TEXT_CORPUS.get(lang_code, TEXT_CORPUS['en'])  # Fallback to English if missing

        # ``count`` is shared by both engines so file indices never collide.
        # It only advances on success, so a failed index is reused.
        count = 0

        # 1. Edge TTS Generation
        for voice in EDGE_VOICES.get(lang_code, []):
            if count >= edge_quota:
                # Quota reached: no point in walking the remaining voices.
                break
            for text in texts:
                if count >= edge_quota:
                    break

                fname = f"ai_edge_{lang_code}_{count:04d}.mp3"
                fpath = os.path.join(lang_dir, fname)

                try:
                    await generate_edge_tts(text, voice, fpath)
                except Exception as e:
                    # Best effort: report and continue with the next text.
                    print(f"Error generating Edge TTS for {lang_code}: {e}")
                else:
                    # Verify and convert to consistent format if needed
                    # (deferred to preprocessing). For now just save record.
                    data_records.append(
                        _make_record(fname, lang_code, fpath, 'edge_tts', voice)
                    )
                    count += 1

        # 2. gTTS Generation
        # NOTE(review): this makes one pass over ``texts`` only, so with the
        # current corpus sizes ``target_per_lang`` acts as an upper bound and
        # is not actually reached.
        # gTTS mappings usually match ISO codes, but check docs if failures occur.
        # ta, hi, ml, te are supported.
        gtts_lang = lang_code

        for text in texts:
            if count >= target_per_lang:
                break

            fname = f"ai_gtts_{lang_code}_{count:04d}.mp3"
            fpath = os.path.join(lang_dir, fname)

            try:
                # NOTE: synchronous network call; acceptable here because
                # nothing else is scheduled on the event loop meanwhile.
                generate_gtts(text, gtts_lang, fpath)
            except Exception as e:
                print(f"Error generating gTTS for {lang_code}: {e}")
            else:
                data_records.append(
                    _make_record(fname, lang_code, fpath, 'gtts', 'gtts_standard')
                )
                count += 1

    # Save Metadata
    df = pd.DataFrame(data_records, columns=_METADATA_COLUMNS)
    csv_path = os.path.join(RAW_AI_DIR, 'ai_samples.csv')
    df.to_csv(csv_path, index=False)
    print(f"AI Data Generation Complete! Saved to {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())