| """ | |
| Model Evaluation Script for Documentation | |
| Generates: | |
| 1. Per-language accuracy table | |
| 2. Confusion matrix (saved as image) | |
| 3. Calibration reliability curve (saved as image) | |
| 4. Latency benchmarks | |
| """ | |
| import os | |
| import sys | |
| import io | |
| # Fix Windows console encoding | |
| sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') | |
| sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') | |
import time
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_score, recall_score, f1_score)
from sklearn.calibration import calibration_curve
from sklearn.model_selection import StratifiedShuffleSplit
# Add src to path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import DATA_DIR

MODELS_DIR = os.path.join(os.path.dirname(__file__), '..', 'models')
DOCS_DIR = os.path.join(os.path.dirname(__file__), '..', 'docs')
os.makedirs(DOCS_DIR, exist_ok=True)
# Language code to name mapping
LANG_NAMES = {
    'en': 'English',
    'ta': 'Tamil',
    'hi': 'Hindi',
    'ml': 'Malayalam',
    'te': 'Telugu'
}
def load_data():
    """Load feature data with language info."""
    dsp_path = os.path.join(DATA_DIR, 'features', 'dsp_features.csv')
    master_path = os.path.join(DATA_DIR, 'master_dataset.csv')
    dsp_df = pd.read_csv(dsp_path)
    master_df = pd.read_csv(master_path)
    # Merge to get language info
    merged = pd.merge(dsp_df, master_df[['filename', 'language']], on='filename', how='left')
    return merged
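# load_data() assumes (from the reads above; not validated here):
#   <DATA_DIR>/features/dsp_features.csv - per-clip DSP features incl. 'filename' and 'label'
#   <DATA_DIR>/master_dataset.csv        - metadata incl. 'filename' and 'language'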
def evaluate_model():
    """Main evaluation function."""
    print("=" * 60)
    print("MODEL EVALUATION FOR DOCUMENTATION")
    print("=" * 60)

    # Load data
    data = load_data()
    print(f"\nTotal samples: {len(data)}")

    # Load model and the feature-column order it was trained with
    model = joblib.load(os.path.join(MODELS_DIR, 'dsp_model.pkl'))
    dsp_cols = joblib.load(os.path.join(MODELS_DIR, 'dsp_cols.pkl'))

    # Prepare data: positive class (1) = AI-generated
    X = data[dsp_cols].values
    y = (data['label'] == 'ai').astype(int).values
    languages = data['language'].values

    # Train/test split (same parameters as training)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(np.zeros(len(y)), y))
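    # NOTE: the same test_size/random_state only reproduce the training-time
    # hold-out if the CSV rows and labels are in the same order as at training
    # time (assumed here).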
    X_test = X[test_idx]
    y_test = y[test_idx]
    langs_test = languages[test_idx]

    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
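    # predict_proba returns one column per class; column 1 is P(class 1) = P(AI).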
    # 1. OVERALL METRICS
    print("\n" + "=" * 40)
    print("OVERALL PERFORMANCE")
    print("=" * 40)
    overall_acc = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_acc:.4f} ({overall_acc*100:.2f}%)")

    # 2. PER-LANGUAGE ACCURACY
    print("\n" + "=" * 40)
    print("PER-LANGUAGE ACCURACY")
    print("=" * 40)
    lang_results = []
    for lang_code in ['en', 'ta', 'hi', 'ml', 'te']:
        mask = langs_test == lang_code
        if mask.sum() > 0:
            acc = accuracy_score(y_test[mask], y_pred[mask])
            n_samples = mask.sum()
            n_correct = (y_test[mask] == y_pred[mask]).sum()
            lang_results.append({
                'Language': LANG_NAMES[lang_code],
                'Code': lang_code,
                'Samples': n_samples,
                'Correct': n_correct,
                'Accuracy': acc
            })
            print(f"{LANG_NAMES[lang_code]:12s} ({lang_code}): {acc*100:.1f}% ({n_correct}/{n_samples})")

    # Save per-language results
    lang_df = pd.DataFrame(lang_results)
    lang_df.to_csv(os.path.join(DOCS_DIR, 'per_language_accuracy.csv'), index=False)
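    # Languages with no samples in the test split are skipped above, so the
    # CSV (and the markdown table below) may contain fewer than five rows.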
    # 3. CONFUSION MATRIX
    print("\n" + "=" * 40)
    print("CONFUSION MATRIX")
    print("=" * 40)
    cm = confusion_matrix(y_test, y_pred)
    print("              Predicted")
    print("              HUMAN   AI")
    print(f"Actual HUMAN   {cm[0,0]:3d}   {cm[0,1]:3d}")
    print(f"Actual AI      {cm[1,0]:3d}   {cm[1,1]:3d}")

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    ax.figure.colorbar(im, ax=ax)
    classes = ['HUMAN', 'AI_GENERATED']
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           xlabel='Predicted Label',
           ylabel='True Label',
           title='Confusion Matrix - AI Voice Detection')

    # Add text annotations
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=20)
    plt.tight_layout()
    plt.savefig(os.path.join(DOCS_DIR, 'confusion_matrix.png'), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"\nSaved: {os.path.join(DOCS_DIR, 'confusion_matrix.png')}")
    # 4. CALIBRATION CURVE
    print("\n" + "=" * 40)
    print("CALIBRATION RELIABILITY")
    print("=" * 40)
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated')
    ax.plot(prob_pred, prob_true, 'b-o', label='Model Calibration')
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Observed Fraction of Positives')
    ax.set_title('Calibration Reliability Curve')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(DOCS_DIR, 'calibration_curve.png'), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved: {os.path.join(DOCS_DIR, 'calibration_curve.png')}")
    # Calibration error: unweighted mean gap between predicted and observed
    # probability across the returned (non-empty) bins
    ece = np.mean(np.abs(prob_true - prob_pred))
    print(f"Expected Calibration Error (ECE): {ece:.4f}")
    # 5. LATENCY BENCHMARKS
    print("\n" + "=" * 40)
    print("LATENCY BENCHMARKS")
    print("=" * 40)

    # Benchmark prediction time
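    # Warm-up: run one untimed prediction first so one-time costs (lazy
    # allocations, thread spin-up) don't inflate the first measured latency.
    _ = model.predict_proba(X_test[:1])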
    latencies = []
    for _ in range(100):
        sample = X_test[np.random.randint(len(X_test))].reshape(1, -1)
        start = time.perf_counter()
        _ = model.predict_proba(sample)
        latencies.append((time.perf_counter() - start) * 1000)  # ms
    latencies = np.array(latencies)

    print("Prediction latency (model only):")
    print(f"  Mean: {latencies.mean():.2f} ms")
    print(f"  P50:  {np.percentile(latencies, 50):.2f} ms")
    print(f"  P95:  {np.percentile(latencies, 95):.2f} ms")
    print(f"  P99:  {np.percentile(latencies, 99):.2f} ms")

    # Save latency results
    latency_stats = {
        'metric': ['Mean', 'P50', 'P95', 'P99'],
        'latency_ms': [latencies.mean(), np.percentile(latencies, 50),
                       np.percentile(latencies, 95), np.percentile(latencies, 99)]
    }
    pd.DataFrame(latency_stats).to_csv(os.path.join(DOCS_DIR, 'latency_benchmarks.csv'), index=False)
    # 6. GENERATE MARKDOWN SUMMARY
    print("\n" + "=" * 40)
    print("GENERATING MARKDOWN SUMMARY")
    print("=" * 40)

    # Calculate precision/recall/F1 (positive class = AI)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    markdown = f"""## 📈 Model Performance Metrics

### Overall Performance

| Metric | Value |
|--------|-------|
| **Overall Accuracy** | {overall_acc*100:.1f}% |
| **Precision** | {precision*100:.1f}% |
| **Recall** | {recall*100:.1f}% |
| **F1 Score** | {f1*100:.1f}% |
| **Test Samples** | {len(y_test)} |

---

### Per-Language Accuracy

| Language | Samples | Accuracy |
|----------|---------|----------|
"""
    for r in lang_results:
        markdown += f"| {r['Language']} | {r['Samples']} | {r['Accuracy']*100:.1f}% |\n"

    markdown += f"""
---

### Confusion Matrix

![Confusion Matrix](confusion_matrix.png)

| | Predicted HUMAN | Predicted AI |
|--|-----------------|--------------|
| **Actual HUMAN** | {cm[0,0]} | {cm[0,1]} |
| **Actual AI** | {cm[1,0]} | {cm[1,1]} |

---

### Calibration Reliability

![Calibration Curve](calibration_curve.png)

- **Expected Calibration Error (ECE)**: {ece:.4f}
- The closer the curve is to the diagonal, the better calibrated the model.

---

### Latency Benchmarks

| Metric | Latency (ms) |
|--------|--------------|
| Mean | {latencies.mean():.2f} |
| P50 (Median) | {np.percentile(latencies, 50):.2f} |
| P95 | {np.percentile(latencies, 95):.2f} |
| P99 | {np.percentile(latencies, 99):.2f} |

> **Note**: These are model-only prediction times. Full API latency includes audio decoding, feature extraction, and network overhead (typically ~500-1500 ms total).

---
"""
    # Save markdown (UTF-8, so the emoji heading survives on Windows)
    with open(os.path.join(DOCS_DIR, 'performance_metrics.md'), 'w', encoding='utf-8') as f:
        f.write(markdown)
    print(f"Saved: {os.path.join(DOCS_DIR, 'performance_metrics.md')}")
| print("\n" + "=" * 60) | |
| print("EVALUATION COMPLETE!") | |
| print("=" * 60) | |
| print(f"\nOutput files in: {DOCS_DIR}") | |
| print(" - confusion_matrix.png") | |
| print(" - calibration_curve.png") | |
| print(" - per_language_accuracy.csv") | |
| print(" - latency_benchmarks.csv") | |
| print(" - performance_metrics.md") | |
| return { | |
| 'overall_accuracy': overall_acc, | |
| 'per_language': lang_results, | |
| 'confusion_matrix': cm, | |
| 'ece': ece, | |
| 'latency_mean': latencies.mean() | |
| } | |
| if __name__ == "__main__": | |
| evaluate_model() | |
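# Example invocation (a sketch; the script path below is hypothetical, but the
# relative paths above assume this file sits one directory below the project
# root, alongside src/, models/ and the docs/ output folder):
#
#   python scripts/evaluate_model.py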