Spaces:
Sleeping
Sleeping
| """ | |
| Custom XGBoost ClearML + Optuna Sweep | |
| ===================================== | |
| Because ClearML's built-in HyperParameterOptimizer struggles with the Optuna | |
| study lifecycle in local execution, this script uses raw Optuna to drive | |
| the sweep, and logs each trial manually as a separate ClearML Task. | |
| """ | |
| import os | |
| import optuna | |
| from clearml import Task | |
| import numpy as np | |
| from xgboost import XGBClassifier | |
| from sklearn.metrics import f1_score, roc_auc_score, accuracy_score | |
| from config import CLEARML_PROJECT_NAME | |
| # Import your own dataset loading logic | |
| from data_preparation.prepare_dataset import get_default_split_config, get_numpy_splits | |
# ── General Settings ──────────────────────────────────────────────────────────
PROJECT_NAME = CLEARML_PROJECT_NAME  # ClearML project every trial task is filed under
BASE_TASK_NAME = "XGBoost Sweep Trial"  # per-trial task name prefix (suffixed with "#<trial number>")
DATA_SPLITS, SEED = get_default_split_config()  # shared split ratios + RNG seed, reused by all trials
# ── Search Space ──────────────────────────────────────────────────────────────
def objective(trial):
    """Optuna objective: train one XGBoost model, return its best val logloss.

    Each invocation opens a dedicated ClearML task (one task per trial),
    samples hyperparameters, trains on the train split while monitoring the
    val split, logs the loss curves and summary metrics, and returns the
    minimum validation logloss — the quantity the study minimizes.

    Args:
        trial: the ``optuna.Trial`` driving this evaluation.

    Returns:
        float: best (minimum) validation logloss observed during training.
    """
    # 1. Sample hyperparameters
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 200, 400, 600]),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0),
        "eval_metric": "logloss",
        "random_state": SEED,
        "verbosity": 0,
    }

    # 2. Init a distinct ClearML task for this exact trial
    task = Task.init(
        project_name=PROJECT_NAME,
        task_name=f"{BASE_TASK_NAME} #{trial.number}",
        tags=["sweep", "xgboost", "optuna_manual"],
        reuse_last_task_id=False,  # Crucial: forces a NEW task every time
    )
    # Task exposes its logger via get_logger(); there is no public `.logger`
    # attribute on Task.
    logger = task.get_logger()
    task.connect(params)
    print(f"\n[Trial #{trial.number}] Starting with params: {params}")

    # Ensure the task is closed even when the trial raises; a dangling open
    # task would prevent the next trial from creating its own.
    try:
        # 3. Load Data (cached per process)
        splits, num_features, num_classes, scaler = get_numpy_splits(
            model_name="face_orientation",
            split_ratios=DATA_SPLITS,
            seed=SEED,
            scale=False,
        )
        X_train, y_train = splits["X_train"], splits["y_train"]
        X_val, y_val = splits["X_val"], splits["y_val"]

        # 4. Train (eval_set order defines validation_0=train, validation_1=val)
        model = XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=False,
        )

        # 5. Extract Train/Val loss curves
        evals = model.evals_result()
        train_losses = evals["validation_0"]["logloss"]
        val_losses = evals["validation_1"]["logloss"]
        best_val_loss = min(val_losses)

        # 6. Evaluate on Validation Set (sweep target)
        val_preds = model.predict(X_val)
        val_f1 = float(f1_score(y_val, val_preds, average='weighted'))
        val_acc = float(accuracy_score(y_val, val_preds))

        # Log to ClearML: per-iteration curves, then scalar summaries
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            logger.report_scalar("Loss", "Train", tl, iteration=i)
            logger.report_scalar("Loss", "Val", vl, iteration=i)
        logger.report_single_value("val_f1", val_f1)
        logger.report_single_value("val_accuracy", val_acc)
        logger.report_single_value("val_loss", best_val_loss)

        # We minimize validation loss
        return best_val_loss
    finally:
        # Close task so the next trial can create its own
        task.close()
def main():
    """Run the local 40-trial Optuna sweep and print the top-5 trials.

    Each trial is logged as its own ClearML task by ``objective``; this
    function only drives the study and summarizes the results.
    """
    # NOTE(review): force_requirements_env_freeze() freezes the pip
    # requirements recorded on tasks; it does not by itself prevent task
    # nesting — confirm the intent here.
    Task.force_requirements_env_freeze()

    study = optuna.create_study(direction="minimize", study_name="xgboost_sweep")
    print("[SWEEP] Starting local 40-trial Optuna + ClearML sweep...")
    study.optimize(objective, n_trials=40)

    print("\n[SWEEP] ── Top-5 trials by Validation Loss ──────────────────────────")
    # Failed/pruned trials have value=None; drop them so the ':.4f' format
    # below cannot raise TypeError when fewer than 5 trials succeeded.
    completed = [t for t in study.trials if t.value is not None]
    completed.sort(key=lambda t: t.value)
    for rank, t in enumerate(completed[:5], 1):
        print(f"  #{rank}  Val_Loss={t.value:.4f}  Trial_Number={t.number}")
        print(f"        {t.params}")


if __name__ == "__main__":
    main()