import os
import sys

import joblib
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Add the project root to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import path_utils


def train_models():
    # Load preprocessed arrays
    preprocessed_path = path_utils.get_processed_data_path('preprocessed_data.pkl')
    if not os.path.exists(preprocessed_path):
        print(f"Error: Preprocessed data not found at {preprocessed_path}")
        return

    data = joblib.load(preprocessed_path)
    X_train = data['X_train']
    y_train = data['y_train']
    print("Preprocessed data loaded.")

    # 1. Isolation Forest (unsupervised baseline).
    # Contamination is set to roughly the failure rate in the original data (~3.5%).
    # Note: it is fit without labels and predicts -1 (anomaly) / 1 (normal).
    print("Training Isolation Forest...")
    clf_iso = IsolationForest(contamination=0.035, random_state=42)
    clf_iso.fit(X_train)
    joblib.dump(clf_iso, path_utils.get_model_path('isolation_forest.pkl'))

    # 2. Logistic Regression (linear baseline)
    print("Training Logistic Regression...")
    clf_lr = LogisticRegression(random_state=42, max_iter=1000)
    clf_lr.fit(X_train, y_train)
    joblib.dump(clf_lr, path_utils.get_model_path('logistic_regression.pkl'))

    # 3. Support Vector Machine (SVM); probability=True enables predict_proba
    # so probability-based metrics can be computed during evaluation.
    print("Training SVM...")
    clf_svm = SVC(kernel='rbf', probability=True, random_state=42)
    clf_svm.fit(X_train, y_train)
    joblib.dump(clf_svm, path_utils.get_model_path('svm_model.pkl'))

    # 4. Random Forest (robust ensemble)
    print("Training Random Forest...")
    clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_rf.fit(X_train, y_train)
    joblib.dump(clf_rf, path_utils.get_model_path('random_forest.pkl'))

    # 5. Decision Tree (interpretable)
    print("Training Decision Tree...")
    clf_dt = DecisionTreeClassifier(random_state=42)
    clf_dt.fit(X_train, y_train)
    joblib.dump(clf_dt, path_utils.get_model_path('decision_tree.pkl'))

    # 6. XGBoost (best performer).
    # scale_pos_weight would compensate for class imbalance, but the training
    # set was balanced with SMOTE during preprocessing, so the default of 1.0
    # is correct. Instead, tune the key capacity parameters via grid search.
    print("Training XGBoost with GridSearch...")
    xgb = XGBClassifier(random_state=42, eval_metric='logloss')
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [4, 6],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
    }
    grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_xgb = grid_search.best_estimator_
    print(f"Best XGBoost Params: {grid_search.best_params_}")
    joblib.dump(best_xgb, path_utils.get_model_path('xgboost_model.pkl'))

    print("All models trained and saved in 'models/' directory.")


if __name__ == "__main__":
    train_models()
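
# ---------------------------------------------------------------------------
# Example (sketch): loading a saved model and scoring held-out data.
# This is a minimal illustration, not part of the training pipeline. It
# assumes the preprocessed pickle also stores 'X_test' and 'y_test' keys,
# which this script does not verify.
#
#   import joblib
#   from sklearn.metrics import classification_report
#   import path_utils
#
#   data = joblib.load(path_utils.get_processed_data_path('preprocessed_data.pkl'))
#   model = joblib.load(path_utils.get_model_path('xgboost_model.pkl'))
#   y_pred = model.predict(data['X_test'])
#   print(classification_report(data['y_test'], y_pred))
#
# Note: for the Isolation Forest, predict() returns -1/1 rather than 0/1,
# so map its output first, e.g. (model.predict(X) == -1).astype(int),
# before comparing against y_test.
# ---------------------------------------------------------------------------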