File size: 4,529 Bytes
dead0b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import sys

# Add src to path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.config import DATA_DIR

MODELS_DIR = os.path.join(os.path.dirname(__file__), '..', 'models')
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)

def load_data():
    dsp_path = os.path.join(DATA_DIR, 'features', 'dsp_features.csv')
    emb_path = os.path.join(DATA_DIR, 'features', 'embeddings.csv')
    
    if not os.path.exists(dsp_path):
        raise FileNotFoundError("DSP Feature file not found. Run feature extraction first.")
        
    dsp_df = pd.read_csv(dsp_path)
    
    if os.path.exists(emb_path):
        print("Loading Embeddings...")
        emb_df = pd.read_csv(emb_path)
        if 'label' in emb_df.columns:
            emb_df = emb_df.drop(columns=['label'])
        merged_df = pd.merge(dsp_df, emb_df, on='filename', how='inner')
        return merged_df, True
    else:
        print("Embeddings not found. Using DSP features only.")
        return dsp_df, False

def train_dsp_model(X_train, y_train):
    print("Training DSP Classifier (Random Forest)...")
    clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    calibrated_clf.fit(X_train, y_train)
    return calibrated_clf

def train_embedding_model(X_train, y_train):
    print("Training Embedding Classifier (XGBoost)...")
    clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, eval_metric='logloss')
    calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    calibrated_clf.fit(X_train, y_train)
    return calibrated_clf

def main():
    try:
        data, has_embeddings = load_data()
    except Exception as e:
        print(f"Skipping training for now: {e}")
        return

    print(f"Loaded {len(data)} samples.")
    
    # Prepare Features
    dsp_cols = [c for c in data.columns if c not in ['filename', 'label', 'path'] and not c.startswith('emb_')]
    X_dsp = data[dsp_cols].values
    y = (data['label'] == 'ai').astype(int).values
    
    # Train/Test Split
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(np.zeros(len(y)), y))
    
    X_dsp_train, X_dsp_test = X_dsp[train_idx], X_dsp[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # 1. Train DSP Model
    dsp_model = train_dsp_model(X_dsp_train, y_train)
    
    # Save DSP Model
    joblib.dump(dsp_model, os.path.join(MODELS_DIR, 'dsp_model.pkl'))
    joblib.dump(dsp_cols, os.path.join(MODELS_DIR, 'dsp_cols.pkl'))
    
    # Eval
    y_pred_dsp = dsp_model.predict(X_dsp_test)
    print(f"DSP Accuracy: {accuracy_score(y_test, y_pred_dsp):.4f}")
    
    emb_model = None
    if has_embeddings:
        emb_cols = [c for c in data.columns if c.startswith('emb_')]
        X_emb = data[emb_cols].values
        X_emb_train, X_emb_test = X_emb[train_idx], X_emb[test_idx]
        
        # 2. Train Embedding Model
        emb_model = train_embedding_model(X_emb_train, y_train)
        
        # Eval
        y_pred_emb = emb_model.predict(X_emb_test)
        print(f"Embedding Accuracy: {accuracy_score(y_test, y_pred_emb):.4f}")
        
        # Ensemble
        y_prob_dsp = dsp_model.predict_proba(X_dsp_test)[:, 1]
        y_prob_emb = emb_model.predict_proba(X_emb_test)[:, 1]
        y_prob_ensemble = (y_prob_dsp + y_prob_emb) / 2
        y_pred_ensemble = (y_prob_ensemble > 0.5).astype(int)
        print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")
        
        joblib.dump(emb_model, os.path.join(MODELS_DIR, 'emb_model.pkl'))
    else:
        print("Skipping Embedding Model training.")
        # Remove old model if exists
        e_path = os.path.join(MODELS_DIR, 'emb_model.pkl')
        if os.path.exists(e_path):
            os.remove(e_path)
            
    print(f"\nModels saved to {MODELS_DIR}")

if __name__ == "__main__":
    main()