"""
Custom XGBoost ClearML + Optuna Sweep
=====================================
Because ClearML's built-in HyperParameterOptimizer struggles with the Optuna
study lifecycle in local execution, this script uses raw Optuna to drive
the sweep, and logs each trial manually as a separate ClearML Task.
"""
import optuna
from clearml import Task
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from config import CLEARML_PROJECT_NAME
# Project-specific dataset loading logic
from data_preparation.prepare_dataset import get_default_split_config, get_numpy_splits
# ── General Settings ──────────────────────────────────────────────────────────
PROJECT_NAME = CLEARML_PROJECT_NAME
BASE_TASK_NAME = "XGBoost Sweep Trial"
DATA_SPLITS, SEED = get_default_split_config()
# ── Search Space ──────────────────────────────────────────────────────────────
def objective(trial):
    # 1. Sample hyperparameters
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 200, 400, 600]),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0),
        "eval_metric": "logloss",
        "random_state": SEED,
        "verbosity": 0,
    }
    # 2. Init a distinct ClearML task for this trial
    task = Task.init(
        project_name=PROJECT_NAME,
        task_name=f"{BASE_TASK_NAME} #{trial.number}",
        tags=["sweep", "xgboost", "optuna_manual"],
        reuse_last_task_id=False,  # crucial: forces a NEW task every time
    )
    task.connect(params)
    print(f"\n[Trial #{trial.number}] Starting with params: {params}")
    # 3. Load data (cached per process)
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name="face_orientation",
        split_ratios=DATA_SPLITS,
        seed=SEED,
        scale=False,
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    # 4. Train
    model = XGBClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=False,
    )
    # 5. Extract train/val loss curves
    evals = model.evals_result()
    train_losses = evals["validation_0"]["logloss"]
    val_losses = evals["validation_1"]["logloss"]
    best_val_loss = min(val_losses)
    # 6. Evaluate on the validation set (sweep target)
    val_preds = model.predict(X_val)
    val_f1 = float(f1_score(y_val, val_preds, average="weighted"))
    val_acc = float(accuracy_score(y_val, val_preds))
    # Log per-round loss curves and final metrics to ClearML
    logger = task.get_logger()
    for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
        logger.report_scalar("Loss", "Train", tl, iteration=i)
        logger.report_scalar("Loss", "Val", vl, iteration=i)
    logger.report_single_value("val_f1", val_f1)
    logger.report_single_value("val_accuracy", val_acc)
    logger.report_single_value("val_loss", best_val_loss)
    # Close the task so the next trial can create its own
    task.close()
    # We minimize validation loss
    return best_val_loss
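
# ── Optional: per-round pruning (a sketch, not wired into the sweep above) ───
# A minimal sketch of how weak trials could be stopped early. It assumes the
# `optuna-integration` package is installed and that the study is created with
# a pruner such as optuna.pruners.MedianPruner(). The observation key
# "validation_1-logloss" refers to the second eval_set entry (the val split).
def make_pruning_callback(trial):
    from optuna.integration import XGBoostPruningCallback
    # Pass the returned callback via XGBClassifier(callbacks=[...]); each
    # boosting round then reports val logloss to Optuna, which raises
    # optuna.TrialPruned for under-performing trials.
    return XGBoostPruningCallback(trial, "validation_1-logloss")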
def main():
    # This script itself is never registered as a ClearML task; each trial
    # creates and closes its own. Freeze the full pip environment into the
    # per-trial task requirements instead of relying on import analysis.
    Task.force_requirements_env_freeze()
    study = optuna.create_study(direction="minimize", study_name="xgboost_sweep")
    print("[SWEEP] Starting local 40-trial Optuna + ClearML sweep...")
    study.optimize(objective, n_trials=40)
print("\n[SWEEP] ── Top-5 trials by Validation Loss ──────────────────────────────")
trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else float('inf'))
for rank, t in enumerate(trials[:5], 1):
print(f" #{rank} Val_Loss={t.value:.4f} Trial_Number={t.number}")
print(f" {t.params}")
if __name__ == "__main__":
    main()
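# Usage (assumes a clearml.conf configured for your ClearML server):
#   python sweep_local.py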