""" Custom XGBoost ClearML + Optuna Sweep ===================================== Because ClearML's built-in HyperParameterOptimizer struggles with the Optuna study lifecycle in local execution, this script uses raw Optuna to drive the sweep, and logs each trial manually as a separate ClearML Task. """ import os import optuna from clearml import Task import numpy as np from xgboost import XGBClassifier from sklearn.metrics import f1_score, roc_auc_score, accuracy_score # Import your own dataset loading logic from data_preparation.prepare_dataset import get_numpy_splits # ── General Settings ────────────────────────────────────────────────────────── PROJECT_NAME = "FocusGuards Large Group Project" BASE_TASK_NAME = "XGBoost Sweep Trial" DATA_SPLITS = (0.7, 0.15, 0.15) SEED = 42 # ── Search Space ────────────────────────────────────────────────────────────── def objective(trial): # 1. Sample hyperparameters params = { "n_estimators": trial.suggest_categorical("n_estimators", [100, 200, 400, 600]), "max_depth": trial.suggest_int("max_depth", 3, 8), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), "subsample": trial.suggest_float("subsample", 0.5, 1.0), "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0), "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0), "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0), "eval_metric": "logloss", "random_state": SEED, "verbosity": 0 } # 2. Init a distinct ClearML task for this exact trial task = Task.init( project_name=PROJECT_NAME, task_name=f"{BASE_TASK_NAME} #{trial.number}", tags=["sweep", "xgboost", "optuna_manual"], reuse_last_task_id=False # Crucial: forces a NEW task every time ) task.connect(params) print(f"\n[Trial #{trial.number}] Starting with params: {params}") # 3. Load Data (cached per process) splits, num_features, num_classes, scaler = get_numpy_splits( model_name="face_orientation", split_ratios=DATA_SPLITS, seed=SEED, scale=False ) X_train, y_train = splits["X_train"], splits["y_train"] X_val, y_val = splits["X_val"], splits["y_val"] # 4. Train model = XGBClassifier(**params) model.fit( X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False ) # 5. Extract Train/Val loss curves evals = model.evals_result() train_losses = evals["validation_0"]["logloss"] val_losses = evals["validation_1"]["logloss"] best_val_loss = min(val_losses) # 6. Evaluate on Validation Set (Sweep target) val_preds = model.predict(X_val) val_f1 = float(f1_score(y_val, val_preds, average='weighted')) val_acc = float(accuracy_score(y_val, val_preds)) # Log to ClearML for i, (tl, vl) in enumerate(zip(train_losses, val_losses)): task.logger.report_scalar("Loss", "Train", tl, iteration=i) task.logger.report_scalar("Loss", "Val", vl, iteration=i) task.logger.report_single_value("val_f1", val_f1) task.logger.report_single_value("val_accuracy", val_acc) task.logger.report_single_value("val_loss", best_val_loss) # Close task so the next trial can create its own task.close() # We minimize validation loss return best_val_loss def main(): # Make sure we don't nest tasks — this script itself is not the tracked task Task.force_requirements_env_freeze() study = optuna.create_study(direction="minimize", study_name="xgboost_sweep") print("[SWEEP] Starting local 40-trial Optuna + ClearML sweep...") study.optimize(objective, n_trials=40) print("\n[SWEEP] ── Top-5 trials by Validation Loss ──────────────────────────────") trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else float('inf')) for rank, t in enumerate(trials[:5], 1): print(f" #{rank} Val_Loss={t.value:.4f} Trial_Number={t.number}") print(f" {t.params}") if __name__ == "__main__": main()