import os
import pandas as pd
import soundfile as sf
import librosa
from datasets import load_dataset
from tqdm import tqdm
import sys

# Add this script's parent directory to sys.path so the `src` package (and its config) can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import LANGUAGES, RAW_HUMAN_DIR, SAMPLE_RATE

def ensure_dir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first: another process may create the directory between the two calls.
    os.makedirs(path, exist_ok=True)

def download_english_data(num_samples=50):
    """Download English speech clips from LibriSpeech and save them as FLAC.

    Streams the ``validation`` split of the ``clean`` configuration of
    ``librispeech_asr``, resamples each clip to ``SAMPLE_RATE`` when its
    sampling rate differs, and writes it to
    ``RAW_HUMAN_DIR/en/human_en_NNNN.flac``.

    Args:
        num_samples: Maximum number of clips to download.

    Returns:
        A list of metadata dicts with keys ``filename``, ``language``,
        ``path`` and ``source``, one per file written.
    """
    from itertools import islice

    print("Downloading English samples from LibriSpeech...")
    lang_dir = os.path.join(RAW_HUMAN_DIR, 'en')
    ensure_dir(lang_dir)

    # Streaming mode lets us take the first few clips of the clean validation
    # split without downloading the whole split.
    dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)

    # islice ends the stream after exactly num_samples items, so no extra
    # sample is fetched just to discover that the limit was reached.
    limited = islice(dataset, max(num_samples, 0))

    data_records = []
    for count, sample in enumerate(tqdm(limited, total=num_samples)):
        audio_array = sample['audio']['array']
        sr = sample['audio']['sampling_rate']

        # Resample only when the source rate differs from the target rate.
        if sr != SAMPLE_RATE:
            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=SAMPLE_RATE)

        file_name = f"human_en_{count:04d}.flac"
        file_path = os.path.join(lang_dir, file_name)

        sf.write(file_path, audio_array, SAMPLE_RATE)

        data_records.append({
            'filename': file_name,
            'language': 'en',
            'path': file_path,
            'source': 'librispeech'
        })

    return data_records

def download_indic_data(lang_code, lang_name, num_samples=50):
    """Download speech clips for one language from FLEURS and save them as FLAC.

    Streams the ``validation`` split of the ``google/fleurs`` configuration
    named ``"<lang_code>_in"``, resamples each clip to ``SAMPLE_RATE`` when
    its sampling rate differs, and writes it to
    ``RAW_HUMAN_DIR/<lang_code>/human_<lang_code>_NNNN.flac``.

    Args:
        lang_code: Language code; used for the output directory, the file
            names and the FLEURS configuration name.
        lang_name: Human-readable language name, used only in the log message.
        num_samples: Maximum number of clips to download.

    Returns:
        A list of metadata dicts with keys ``filename``, ``language``,
        ``path`` and ``source``, one per file written. Returns an empty list
        if ``load_dataset`` raises; errors raised later, while iterating the
        stream or writing files, are not caught here.
    """
    from itertools import islice

    print(f"Downloading {lang_name} ({lang_code}) samples...")
    lang_dir = os.path.join(RAW_HUMAN_DIR, lang_code)
    ensure_dir(lang_dir)

    # This script reads from google/fleurs, an openly accessible dataset.
    # NOTE(review): the original notes say IndicVoices was the requested
    # source but may need manual download or authentication - confirm whether
    # FLEURS is an acceptable substitute.
    dataset_name = "google/fleurs"
    subset = f"{lang_code}_in"

    print(f"Attempting to download from {dataset_name} ({subset})...")

    # Best effort: a language whose dataset cannot be loaded is skipped so the
    # remaining languages can still be processed by the caller.
    try:
        dataset = load_dataset(dataset_name, subset, split="validation", streaming=True, trust_remote_code=True)
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
        return []

    # islice ends the stream after exactly num_samples items, so no extra
    # sample is fetched just to discover that the limit was reached.
    limited = islice(dataset, max(num_samples, 0))

    data_records = []
    for count, sample in enumerate(tqdm(limited, total=num_samples)):
        audio_array = sample['audio']['array']
        sr = sample['audio']['sampling_rate']

        # Resample only when the source rate differs from the target rate.
        if sr != SAMPLE_RATE:
            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=SAMPLE_RATE)

        file_name = f"human_{lang_code}_{count:04d}.flac"
        file_path = os.path.join(lang_dir, file_name)

        sf.write(file_path, audio_array, SAMPLE_RATE)

        data_records.append({
            'filename': file_name,
            'language': lang_code,
            'path': file_path,
            'source': dataset_name
        })

    return data_records

def main():
    """Download samples for every configured language and write a CSV index.

    Downloads the English samples first, then one batch per non-English entry
    in ``LANGUAGES``, and saves the combined metadata to
    ``RAW_HUMAN_DIR/human_samples.csv``.
    """
    # Explicit column order so the CSV still gets a header row when no
    # records were collected (e.g. every download failed).
    columns = ['filename', 'language', 'path', 'source']

    all_records = []

    # 1. English
    all_records.extend(download_english_data())

    # 2. Every other configured language; English was handled above.
    for code, name in LANGUAGES.items():
        if code == 'en':
            continue
        all_records.extend(download_indic_data(code, name))

    # Save CSV
    df = pd.DataFrame(all_records, columns=columns)
    csv_path = os.path.join(RAW_HUMAN_DIR, 'human_samples.csv')
    df.to_csv(csv_path, index=False)
    print(f"Completed! Metadata saved to {csv_path}")
    print(f"Total samples: {len(df)}")

# Run the download pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()