# pratikpawar0204
# AI Voice Detection API - Competition Submission
# dead0b1
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import sys
# Add the parent of this file's directory to sys.path so the `src` package can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import DATA_DIR
# Directory where trained models are written: a `models` folder that sits
# next to this file's parent directory.
MODELS_DIR = os.path.join(os.path.dirname(__file__), '..', 'models')
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test (another process may create the directory in between).
os.makedirs(MODELS_DIR, exist_ok=True)
def load_data():
    """Load the DSP feature table and, when available, merge in embeddings.

    Reads ``features/dsp_features.csv`` under ``DATA_DIR``. If
    ``features/embeddings.csv`` also exists it is inner-joined onto the DSP
    table by the ``filename`` column.

    Returns:
        A ``(dataframe, has_embeddings)`` tuple. ``has_embeddings`` is True
        when the embeddings file was found and merged, False when only the
        DSP features are returned.

    Raises:
        FileNotFoundError: if the DSP feature file does not exist.
    """
    features_dir = os.path.join(DATA_DIR, 'features')
    dsp_path = os.path.join(features_dir, 'dsp_features.csv')
    emb_path = os.path.join(features_dir, 'embeddings.csv')

    if not os.path.exists(dsp_path):
        raise FileNotFoundError("DSP Feature file not found. Run feature extraction first.")

    dsp_df = pd.read_csv(dsp_path)

    if not os.path.exists(emb_path):
        print("Embeddings not found. Using DSP features only.")
        return dsp_df, False

    print("Loading Embeddings...")
    emb_df = pd.read_csv(emb_path)

    # Drop every non-key column the embeddings table shares with the DSP
    # table (always including 'label'). Otherwise pandas would rename the
    # clashing columns with `_x`/`_y` suffixes, and those renamed columns
    # would no longer be recognised as metadata by downstream column filters.
    dsp_columns = set(dsp_df.columns)
    redundant = [
        col for col in emb_df.columns
        if col != 'filename' and (col == 'label' or col in dsp_columns)
    ]
    if redundant:
        emb_df = emb_df.drop(columns=redundant)

    merged_df = pd.merge(dsp_df, emb_df, on='filename', how='inner')

    # An inner join silently discards files that lack an embedding row;
    # surface that so a shrunken training set does not go unnoticed.
    dropped = len(dsp_df) - len(merged_df)
    if dropped > 0:
        print(f"Warning: {dropped} DSP rows had no matching embedding and were dropped.")

    return merged_df, True
def train_dsp_model(X_train, y_train):
    """Fit a sigmoid-calibrated random forest on the DSP feature matrix.

    Args:
        X_train: training feature matrix passed straight to ``fit``.
        y_train: training labels passed straight to ``fit``.

    Returns:
        The fitted ``CalibratedClassifierCV`` wrapping the random forest.
    """
    print("Training DSP Classifier (Random Forest)...")
    forest_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
    model = CalibratedClassifierCV(
        RandomForestClassifier(**forest_params),
        method='sigmoid',
        cv=5,
    )
    # scikit-learn estimators return themselves from fit().
    return model.fit(X_train, y_train)
def train_embedding_model(X_train, y_train):
    """Fit a sigmoid-calibrated XGBoost classifier on the embedding matrix.

    Args:
        X_train: training embedding matrix passed straight to ``fit``.
        y_train: training labels passed straight to ``fit``.

    Returns:
        The fitted ``CalibratedClassifierCV`` wrapping the XGBoost model.
    """
    print("Training Embedding Classifier (XGBoost)...")
    booster_params = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    model = CalibratedClassifierCV(
        xgb.XGBClassifier(**booster_params),
        method='sigmoid',
        cv=5,
    )
    # scikit-learn estimators return themselves from fit().
    return model.fit(X_train, y_train)
def main():
    """Train, evaluate and save the DSP model and, if possible, the embedding model.

    Steps:
      1. Load the feature table via ``load_data``; on any load error, print
         a message and return without training.
      2. Build the binary target (1 where ``label == 'ai'``, else 0) and make
         a single stratified 80/20 train/test split.
      3. Train the DSP classifier, save it and its column list to
         ``MODELS_DIR``, and print its test accuracy.
      4. If embeddings were loaded, train the embedding classifier, print its
         accuracy and the accuracy of the averaged-probability ensemble, and
         save it. Otherwise delete any previously saved embedding model.

    Returns:
        None. Results are printed and models are written to ``MODELS_DIR``.
    """
    try:
        data, has_embeddings = load_data()
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and skip.
        print(f"Skipping training for now: {e}")
        return

    print(f"Loaded {len(data)} samples.")

    # Prepare Features: everything that is neither metadata nor an
    # embedding column is treated as a DSP feature.
    meta_cols = ('filename', 'label', 'path')
    dsp_cols = [
        c for c in data.columns
        if c not in meta_cols and not c.startswith('emb_')
    ]
    X_dsp = data[dsp_cols].values
    y = (data['label'] == 'ai').astype(int).values

    # Guard against an empty table or a single-class target (for example
    # when no label is spelled exactly 'ai'). Bail out before any model
    # file is overwritten.
    n_ai = int(y.sum())
    if n_ai == 0 or n_ai == len(y):
        print(
            "Skipping training for now: need both 'ai' and non-'ai' samples "
            f"(found {n_ai} 'ai' out of {len(y)})."
        )
        return

    # Train/Test Split (indices are reused for the embedding matrix below
    # so both models are evaluated on the same held-out rows).
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(np.zeros(len(y)), y))

    X_dsp_train, X_dsp_test = X_dsp[train_idx], X_dsp[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # 1. Train DSP Model
    dsp_model = train_dsp_model(X_dsp_train, y_train)

    # Save DSP Model together with the column order it was trained on.
    joblib.dump(dsp_model, os.path.join(MODELS_DIR, 'dsp_model.pkl'))
    joblib.dump(dsp_cols, os.path.join(MODELS_DIR, 'dsp_cols.pkl'))

    # Eval
    y_pred_dsp = dsp_model.predict(X_dsp_test)
    print(f"DSP Accuracy: {accuracy_score(y_test, y_pred_dsp):.4f}")

    emb_model_path = os.path.join(MODELS_DIR, 'emb_model.pkl')

    if has_embeddings:
        emb_cols = [c for c in data.columns if c.startswith('emb_')]
        X_emb = data[emb_cols].values
        X_emb_train, X_emb_test = X_emb[train_idx], X_emb[test_idx]

        # 2. Train Embedding Model
        emb_model = train_embedding_model(X_emb_train, y_train)

        # Eval
        y_pred_emb = emb_model.predict(X_emb_test)
        print(f"Embedding Accuracy: {accuracy_score(y_test, y_pred_emb):.4f}")

        # Ensemble: average the two positive-class probabilities and
        # threshold at 0.5.
        y_prob_dsp = dsp_model.predict_proba(X_dsp_test)[:, 1]
        y_prob_emb = emb_model.predict_proba(X_emb_test)[:, 1]
        y_prob_ensemble = (y_prob_dsp + y_prob_emb) / 2
        y_pred_ensemble = (y_prob_ensemble > 0.5).astype(int)
        print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")

        joblib.dump(emb_model, emb_model_path)
    else:
        print("Skipping Embedding Model training.")
        # Remove old model if exists, so a stale embedding model is not
        # left next to the freshly trained DSP model. Attempting the removal
        # directly avoids the exists()/remove() race.
        try:
            os.remove(emb_model_path)
        except FileNotFoundError:
            pass

    print(f"\nModels saved to {MODELS_DIR}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()