import os
import sys

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm

# Make the project root importable so `src.config` resolves when run as a script.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import RAW_HUMAN_DIR, RAW_AI_DIR, PROCESSED_DIR, SAMPLE_RATE, DURATION_LIMIT, DATA_DIR


def preprocess_audio(file_path, output_path):
    """Standardize one audio file and write the result to *output_path*.

    Pipeline:
      - load as mono, resampled to SAMPLE_RATE
      - trim leading/trailing silence (top_db=20)
      - skip clips shorter than 0.5 s after trimming
      - peak-normalize amplitude
      - write at SAMPLE_RATE via soundfile

    Args:
        file_path: path to the source audio file.
        output_path: destination path for the processed file.

    Returns:
        (success: bool, message: str) — message is "Success", "Too short",
        or the stringified exception on failure.
    """
    try:
        # librosa handles both resampling and mono downmix on load.
        y, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)

        # Trim silence (top_db=20 is a standard threshold).
        y_trimmed, _ = librosa.effects.trim(y, top_db=20)

        # Skip clips shorter than 0.5 s after trimming — too little signal.
        if len(y_trimmed) < 0.5 * SAMPLE_RATE:
            return False, "Too short"

        # Peak normalization.
        y_norm = librosa.util.normalize(y_trimmed)

        sf.write(output_path, y_norm, SAMPLE_RATE)
        return True, "Success"
    except Exception as e:
        # Best-effort batch processing: report the failure, don't abort the run.
        return False, str(e)


def process_dataset(input_csv, source_type):
    """Process every file listed in *input_csv*.

    Args:
        input_csv: CSV with 'path', 'filename', and 'language' columns.
        source_type: 'human' or 'ai' — used as the label and output subdir.

    Returns:
        List of record dicts for the successfully processed files.
    """
    if not os.path.exists(input_csv):
        print(f"Dataset CSV not found: {input_csv}")
        return []

    df = pd.read_csv(input_csv)
    processed_records = []
    output_dir = os.path.join(PROCESSED_DIR, source_type)
    os.makedirs(output_dir, exist_ok=True)

    print(f"Processing {source_type} samples...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        file_path = row['path']
        filename = row['filename']
        lang = row['language']

        # One subdirectory per language under processed/<source_type>/.
        lang_dir = os.path.join(output_dir, lang)
        os.makedirs(lang_dir, exist_ok=True)

        # BUG FIX: the original wrote every sample to the same literal name
        # "proc_(unknown)", so each file overwrote the previous one.
        # Derive the output name from the source filename instead.
        output_filename = f"proc_{filename}"
        if not output_filename.endswith('.wav'):
            # Enforce .wav for processed data — safest for downstream tooling.
            output_filename = os.path.splitext(output_filename)[0] + ".wav"
        output_path = os.path.join(lang_dir, output_filename)

        success, msg = preprocess_audio(file_path, output_path)
        if success:
            processed_records.append({
                'filename': output_filename,
                'original_filename': filename,
                'path': output_path,
                'label': source_type,  # 'human' or 'ai'
                'language': lang,
                'split': 'train',  # default; train/val/test split happens later
            })
    return processed_records


def main():
    """Process the human and AI datasets and write the master CSV."""
    os.makedirs(PROCESSED_DIR, exist_ok=True)

    all_processed = []

    # Process human data.
    human_csv = os.path.join(RAW_HUMAN_DIR, 'human_samples.csv')
    all_processed.extend(process_dataset(human_csv, 'human'))

    # Process AI data.
    ai_csv = os.path.join(RAW_AI_DIR, 'ai_samples.csv')
    all_processed.extend(process_dataset(ai_csv, 'ai'))

    # Save the master dataset.
    master_df = pd.DataFrame(all_processed)
    master_csv = os.path.join(DATA_DIR, 'master_dataset.csv')
    master_df.to_csv(master_csv, index=False)

    # Print stats. Guard the column access: a DataFrame built from an empty
    # list has no columns, and the original raised KeyError in that case.
    print("\nProcessing Complete!")
    print(f"Total Processed Samples: {len(master_df)}")
    if not master_df.empty:
        print(master_df['label'].value_counts())
        print(master_df['language'].value_counts())


if __name__ == "__main__":
    main()