"""
Model Evaluation Script for Documentation
Generates:
1. Per-language accuracy table
2. Confusion matrix (saved as image)
3. Calibration reliability curve (saved as image)
4. Latency benchmarks
"""
import os
import sys
import io
# Fix Windows console encoding; guard against streams without a .buffer
# (e.g. when stdout/stderr have been replaced or redirected)
if hasattr(sys.stdout, 'buffer'):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
if hasattr(sys.stderr, 'buffer'):
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
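# On Python 3.7+ the same effect is available without re-wrapping the stream
# (a minimal alternative, noted here rather than used):
#   sys.stdout.reconfigure(encoding='utf-8', errors='replace')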
import time
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score, roc_auc_score)
from sklearn.calibration import calibration_curve
from sklearn.model_selection import StratifiedShuffleSplit
# Add the project root to sys.path so `from src.config import ...` resolves
# when this file is run directly
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import DATA_DIR
MODELS_DIR = os.path.join(os.path.dirname(__file__), '..', 'models')
DOCS_DIR = os.path.join(os.path.dirname(__file__), '..', 'docs')
os.makedirs(DOCS_DIR, exist_ok=True)
# Language code to name mapping
LANG_NAMES = {
    'en': 'English',
    'ta': 'Tamil',
    'hi': 'Hindi',
    'ml': 'Malayalam',
    'te': 'Telugu',
}
def load_data():
    """Load DSP feature rows and attach each file's language label."""
    dsp_path = os.path.join(DATA_DIR, 'features', 'dsp_features.csv')
    master_path = os.path.join(DATA_DIR, 'master_dataset.csv')
    dsp_df = pd.read_csv(dsp_path)
    master_df = pd.read_csv(master_path)
    # Left join keeps every feature row; files absent from the master CSV
    # come back with a NaN language
    merged = pd.merge(dsp_df, master_df[['filename', 'language']], on='filename', how='left')
    return merged
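# Optional sanity check (sketch): surface rows that failed to pick up a
# language label in the merge above.
#   unmatched = load_data()['language'].isna().sum()
#   if unmatched:
#       print(f"Warning: {unmatched} feature rows have no language label")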
def evaluate_model():
    """Run the full evaluation and write all documentation artifacts."""
    print("=" * 60)
    print("MODEL EVALUATION FOR DOCUMENTATION")
    print("=" * 60)

    # Load data
    data = load_data()
    print(f"\nTotal samples: {len(data)}")

    # Load the trained model and the feature column order it expects
    model = joblib.load(os.path.join(MODELS_DIR, 'dsp_model.pkl'))
    dsp_cols = joblib.load(os.path.join(MODELS_DIR, 'dsp_cols.pkl'))

    # Prepare data: features, binary labels (1 = AI), and per-row languages
    X = data[dsp_cols].values
    y = (data['label'] == 'ai').astype(int).values
    languages = data['language'].values

    # Train/test split with the same ratio and seed as training, so this
    # reproduces the held-out set
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(np.zeros(len(y)), y))
    X_test = X[test_idx]
    y_test = y[test_idx]
    langs_test = languages[test_idx]
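    # Caveat: re-deriving the split only matches training's held-out set if the
    # CSV row order and random_state are identical to training. A sturdier
    # pattern (sketch; assumes training saved its indices) is to persist them:
    #   np.save(os.path.join(MODELS_DIR, 'test_idx.npy'), test_idx)   # at train time
    #   test_idx = np.load(os.path.join(MODELS_DIR, 'test_idx.npy'))  # here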
    # Predictions: hard labels and P(class == AI)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # 1. OVERALL METRICS
    print("\n" + "=" * 40)
    print("OVERALL PERFORMANCE")
    print("=" * 40)
    overall_acc = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_acc:.4f} ({overall_acc*100:.2f}%)")
    # 2. PER-LANGUAGE ACCURACY
    print("\n" + "=" * 40)
    print("PER-LANGUAGE ACCURACY")
    print("=" * 40)
    lang_results = []
    for lang_code in ['en', 'ta', 'hi', 'ml', 'te']:
        mask = langs_test == lang_code
        if mask.sum() > 0:
            acc = accuracy_score(y_test[mask], y_pred[mask])
            n_samples = mask.sum()
            n_correct = (y_test[mask] == y_pred[mask]).sum()
            lang_results.append({
                'Language': LANG_NAMES[lang_code],
                'Code': lang_code,
                'Samples': n_samples,
                'Correct': n_correct,
                'Accuracy': acc
            })
            print(f"{LANG_NAMES[lang_code]:12s} ({lang_code}): {acc*100:.1f}% ({n_correct}/{n_samples})")

    # Save per-language results
    lang_df = pd.DataFrame(lang_results)
    lang_df.to_csv(os.path.join(DOCS_DIR, 'per_language_accuracy.csv'), index=False)
    # 3. CONFUSION MATRIX
    print("\n" + "=" * 40)
    print("CONFUSION MATRIX")
    print("=" * 40)
    cm = confusion_matrix(y_test, y_pred)
    print("              Predicted")
    print("              HUMAN   AI")
    print(f"Actual HUMAN  {cm[0,0]:5d} {cm[0,1]:4d}")
    print(f"Actual AI     {cm[1,0]:5d} {cm[1,1]:4d}")
    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    ax.figure.colorbar(im, ax=ax)
    classes = ['HUMAN', 'AI_GENERATED']
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           xlabel='Predicted Label',
           ylabel='True Label',
           title='Confusion Matrix - AI Voice Detection')
    # Add count annotations, white on dark cells for contrast
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=20)
    plt.tight_layout()
    plt.savefig(os.path.join(DOCS_DIR, 'confusion_matrix.png'), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"\nSaved: {os.path.join(DOCS_DIR, 'confusion_matrix.png')}")
    # 4. CALIBRATION CURVE
    print("\n" + "=" * 40)
    print("CALIBRATION RELIABILITY")
    print("=" * 40)
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')
    ax.plot(prob_pred, prob_true, 'b-o', label='Model Calibration')
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title('Calibration Reliability Curve')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(DOCS_DIR, 'calibration_curve.png'), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved: {os.path.join(DOCS_DIR, 'calibration_curve.png')}")

    # Calibration error: unweighted mean gap across bins (a simple ECE
    # estimate that treats every bin equally, regardless of its sample count)
    ece = np.mean(np.abs(prob_true - prob_pred))
    print(f"Expected Calibration Error (ECE): {ece:.4f}")
    # 5. LATENCY BENCHMARKS
    print("\n" + "=" * 40)
    print("LATENCY BENCHMARKS")
    print("=" * 40)
    # Time single-sample predictions (model inference only)
    latencies = []
    for _ in range(100):
        sample = X_test[np.random.randint(len(X_test))].reshape(1, -1)
        start = time.perf_counter()
        _ = model.predict_proba(sample)
        latencies.append((time.perf_counter() - start) * 1000)  # ms
    latencies = np.array(latencies)
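    # Note: the loop above takes no warm-up runs, so cold caches can inflate
    # the first few timings. A common tweak (sketch): benchmark a few extra
    # calls and drop them, e.g. `latencies = latencies[10:]` after 110 calls.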
print(f"Prediction latency (model only):")
print(f" Mean: {latencies.mean():.2f} ms")
print(f" P50: {np.percentile(latencies, 50):.2f} ms")
print(f" P95: {np.percentile(latencies, 95):.2f} ms")
print(f" P99: {np.percentile(latencies, 99):.2f} ms")
# Save latency results
latency_stats = {
'metric': ['Mean', 'P50', 'P95', 'P99'],
'latency_ms': [latencies.mean(), np.percentile(latencies, 50),
np.percentile(latencies, 95), np.percentile(latencies, 99)]
}
pd.DataFrame(latency_stats).to_csv(os.path.join(DOCS_DIR, 'latency_benchmarks.csv'), index=False)
    # 6. GENERATE MARKDOWN SUMMARY
    print("\n" + "=" * 40)
    print("GENERATING MARKDOWN SUMMARY")
    print("=" * 40)
    # Precision/recall/F1 for the positive ('ai') class
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    markdown = f"""## 📈 Model Performance Metrics

### Overall Performance

| Metric | Value |
|--------|-------|
| **Overall Accuracy** | {overall_acc*100:.1f}% |
| **Precision** | {precision*100:.1f}% |
| **Recall** | {recall*100:.1f}% |
| **F1 Score** | {f1*100:.1f}% |
| **Test Samples** | {len(y_test)} |

---

### Per-Language Accuracy

| Language | Samples | Accuracy |
|----------|---------|----------|
"""
    for r in lang_results:
        markdown += f"| {r['Language']} | {r['Samples']} | {r['Accuracy']*100:.1f}% |\n"
    markdown += f"""
---

### Confusion Matrix

![Confusion Matrix](docs/confusion_matrix.png)

| | Predicted HUMAN | Predicted AI |
|--|-----------------|--------------|
| **Actual HUMAN** | {cm[0,0]} | {cm[0,1]} |
| **Actual AI** | {cm[1,0]} | {cm[1,1]} |

---

### Calibration Reliability

![Calibration Curve](docs/calibration_curve.png)

- **Expected Calibration Error (ECE)**: {ece:.4f}
- The closer the curve sits to the diagonal, the better calibrated the model.

---

### Latency Benchmarks

| Metric | Latency (ms) |
|--------|--------------|
| Mean | {latencies.mean():.2f} |
| P50 (Median) | {np.percentile(latencies, 50):.2f} |
| P95 | {np.percentile(latencies, 95):.2f} |
| P99 | {np.percentile(latencies, 99):.2f} |

> **Note**: These are model-only prediction times. Full API latency adds audio decoding, feature extraction, and network overhead (typically ~500-1500 ms total).

---
"""
    # Save markdown (explicit UTF-8 so the emoji header survives on Windows)
    with open(os.path.join(DOCS_DIR, 'performance_metrics.md'), 'w', encoding='utf-8') as f:
        f.write(markdown)
    print(f"Saved: {os.path.join(DOCS_DIR, 'performance_metrics.md')}")
print("\n" + "=" * 60)
print("EVALUATION COMPLETE!")
print("=" * 60)
print(f"\nOutput files in: {DOCS_DIR}")
print(" - confusion_matrix.png")
print(" - calibration_curve.png")
print(" - per_language_accuracy.csv")
print(" - latency_benchmarks.csv")
print(" - performance_metrics.md")
return {
'overall_accuracy': overall_acc,
'per_language': lang_results,
'confusion_matrix': cm,
'ece': ece,
'latency_mean': latencies.mean()
}
if __name__ == "__main__":
    evaluate_model()
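# Usage (from the project root, assuming models/dsp_model.pkl and
# models/dsp_cols.pkl already exist from training):
#   python src/evaluate.py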