Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| from sklearn.model_selection import StratifiedShuffleSplit | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| from sklearn.calibration import CalibratedClassifierCV | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler | |
| import xgboost as xgb | |
| import sys | |
| # Add src to path | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..')) | |
| from src.config import DATA_DIR | |
# Directory where trained model artifacts are written: `<this file's dir>/../models`.
MODELS_DIR = os.path.join(os.path.dirname(__file__), '..', 'models')
# `exist_ok=True` makes creation a no-op when the directory is already there,
# removing the window between a separate existence check and the create call.
os.makedirs(MODELS_DIR, exist_ok=True)
def load_data():
    """Load the DSP feature table, joined with embeddings when available.

    Reads ``features/dsp_features.csv`` under ``DATA_DIR``. If
    ``features/embeddings.csv`` is also present, its ``label`` column (when
    it has one) is dropped and the two tables are inner-joined on
    ``filename``.

    Returns:
        A ``(dataframe, has_embeddings)`` tuple. The flag is ``True`` only
        when the embeddings file was found and merged in.

    Raises:
        FileNotFoundError: If the DSP feature CSV does not exist.
    """
    features_dir = os.path.join(DATA_DIR, 'features')
    dsp_csv = os.path.join(features_dir, 'dsp_features.csv')
    emb_csv = os.path.join(features_dir, 'embeddings.csv')

    if not os.path.exists(dsp_csv):
        raise FileNotFoundError("DSP Feature file not found. Run feature extraction first.")

    dsp_features = pd.read_csv(dsp_csv)

    # Guard clause: without an embeddings file the DSP table is returned as-is.
    if not os.path.exists(emb_csv):
        print("Embeddings not found. Using DSP features only.")
        return dsp_features, False

    print("Loading Embeddings...")
    embeddings = pd.read_csv(emb_csv)
    # Drop the embeddings' own label column so the merge keeps a single
    # `label` column (the one from the DSP table).
    if 'label' in embeddings.columns:
        embeddings = embeddings.drop(columns=['label'])

    combined = pd.merge(dsp_features, embeddings, on='filename', how='inner')
    return combined, True
def train_dsp_model(X_train, y_train):
    """Fit a sigmoid-calibrated random forest on the DSP features.

    Args:
        X_train: Training feature matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``CalibratedClassifierCV`` wrapping the random forest.
    """
    print("Training DSP Classifier (Random Forest)...")

    forest_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
    forest = RandomForestClassifier(**forest_params)

    # The calibrator is configured with sigmoid scaling and cv=5.
    model = CalibratedClassifierCV(forest, method='sigmoid', cv=5)
    model.fit(X_train, y_train)
    return model
def train_embedding_model(X_train, y_train):
    """Fit a sigmoid-calibrated XGBoost classifier on the embedding features.

    Args:
        X_train: Training embedding matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``CalibratedClassifierCV`` wrapping the XGBoost model.
    """
    print("Training Embedding Classifier (XGBoost)...")

    booster_params = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    booster = xgb.XGBClassifier(**booster_params)

    # The calibrator is configured with sigmoid scaling and cv=5.
    model = CalibratedClassifierCV(booster, method='sigmoid', cv=5)
    model.fit(X_train, y_train)
    return model
def main():
    """Train, evaluate and persist the DSP and (optional) embedding models.

    Steps:
      1. Load the feature table via ``load_data``. If loading raises, or the
         table has no rows, training is skipped with a printed message.
      2. Build a binary target (1 where ``label == 'ai'``) and one stratified
         80/20 train/test split whose indices are shared by both models.
      3. Train the DSP model, save it together with its column list, and
         print its test accuracy.
      4. If embeddings were loaded: train the embedding model, print its
         accuracy and the accuracy of the averaged-probability ensemble,
         then save it. Otherwise delete any previously saved
         ``emb_model.pkl``.
    """
    try:
        data, has_embeddings = load_data()
    except Exception as e:
        # Deliberate best-effort at the script boundary: missing or
        # unreadable feature files skip training rather than crash.
        print(f"Skipping training for now: {e}")
        return

    print(f"Loaded {len(data)} samples.")

    # An empty table (e.g. the inner merge on `filename` matched nothing)
    # would otherwise fail inside the stratified splitter with an opaque
    # ValueError; skip with an explicit message instead.
    if len(data) == 0:
        print("Skipping training for now: no samples available.")
        return

    # Prepare features: everything that is neither an identifier/target
    # column nor an embedding dimension counts as a DSP feature.
    non_feature_cols = ('filename', 'label', 'path')
    dsp_cols = [
        c for c in data.columns
        if c not in non_feature_cols and not c.startswith('emb_')
    ]
    X_dsp = data[dsp_cols].values
    y = (data['label'] == 'ai').astype(int).values

    # Train/test split. The splitter only needs the sample count and labels,
    # so a zero placeholder stands in for X; the resulting indices are
    # reused for the embedding features below.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(np.zeros(len(y)), y))

    X_dsp_train, X_dsp_test = X_dsp[train_idx], X_dsp[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # 1. Train DSP model
    dsp_model = train_dsp_model(X_dsp_train, y_train)

    # Persist the model together with the exact feature-column order it was
    # trained on.
    joblib.dump(dsp_model, os.path.join(MODELS_DIR, 'dsp_model.pkl'))
    joblib.dump(dsp_cols, os.path.join(MODELS_DIR, 'dsp_cols.pkl'))

    # Eval
    y_pred_dsp = dsp_model.predict(X_dsp_test)
    print(f"DSP Accuracy: {accuracy_score(y_test, y_pred_dsp):.4f}")

    emb_model_path = os.path.join(MODELS_DIR, 'emb_model.pkl')

    if has_embeddings:
        emb_cols = [c for c in data.columns if c.startswith('emb_')]
        X_emb = data[emb_cols].values
        X_emb_train, X_emb_test = X_emb[train_idx], X_emb[test_idx]

        # 2. Train embedding model
        emb_model = train_embedding_model(X_emb_train, y_train)

        # Eval
        y_pred_emb = emb_model.predict(X_emb_test)
        print(f"Embedding Accuracy: {accuracy_score(y_test, y_pred_emb):.4f}")

        # Ensemble: average the two positive-class probabilities and
        # threshold at 0.5.
        y_prob_dsp = dsp_model.predict_proba(X_dsp_test)[:, 1]
        y_prob_emb = emb_model.predict_proba(X_emb_test)[:, 1]
        y_prob_ensemble = (y_prob_dsp + y_prob_emb) / 2
        y_pred_ensemble = (y_prob_ensemble > 0.5).astype(int)
        print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")

        joblib.dump(emb_model, emb_model_path)
    else:
        print("Skipping Embedding Model training.")
        # Remove any embedding model left over from an earlier run so it is
        # not paired with the freshly trained DSP model. Attempting the
        # removal directly avoids a check-then-delete race.
        try:
            os.remove(emb_model_path)
        except FileNotFoundError:
            pass

    print(f"\nModels saved to {MODELS_DIR}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()