test_final/models/xgboost/train.py
k22056537
feat: ClearML project name, artifacts, repro snapshots, model metadata
479c932
import csv
import json
import os
import random
import sys
import numpy as np
from sklearn.metrics import (
classification_report,
confusion_matrix,
f1_score,
precision_recall_fscore_support,
roc_auc_score,
roc_curve,
)
from data_preparation.prepare_dataset import get_numpy_splits, SELECTED_FEATURES
from models.xgboost.config import XGB_BASE_PARAMS, build_xgb_classifier
# Absolute path two directory levels above this file — used as the project
# root for checkpoint and evaluation-log paths below.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
def _load_cfg():
    """Build the training configuration dict.

    Starts from built-in defaults, then overrides them with values from the
    project-level ``config`` module when it is importable. Any failure while
    reading config (missing module, bad keys) leaves the defaults intact so
    the script stays runnable outside the full project.

    Returns:
        dict with keys: model_name, seed, split_ratios, scale,
        checkpoints_dir, logs_dir, xgb_params.
    """
    # Single source of truth for defaults (the original duplicated this dict
    # in both the try and except branches).
    cfg = {
        "model_name": "face_orientation",
        "seed": 42,
        "split_ratios": (0.7, 0.15, 0.15),
        "scale": False,
        "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
        "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
        "xgb_params": dict(XGB_BASE_PARAMS),
    }
    try:
        from config import get

        data = get("data") or {}
        # `x or default` would discard falsy-but-valid config values
        # (e.g. seed=0), so only treat None as "not configured".
        model_name = get("mlp.model_name")
        if model_name is not None:
            cfg["model_name"] = model_name
        seed = get("mlp.seed")
        if seed is not None:
            cfg["seed"] = seed
        cfg["split_ratios"] = tuple(data.get("split_ratios", cfg["split_ratios"]))
    except Exception:
        # Best effort: config module unavailable or malformed — keep defaults.
        pass
    return cfg
# Materialized once at import time; main() reads all settings from this dict.
CFG = _load_cfg()
# ClearML is opt-in via USE_CLEARML=1, or implied when already running inside
# a ClearML-managed task (the agent sets CLEARML_TASK_ID).
USE_CLEARML = os.environ.get("USE_CLEARML", "0") == "1" or bool(os.environ.get("CLEARML_TASK_ID"))
CLEARML_QUEUE = os.environ.get("CLEARML_QUEUE", "")
task = None
if USE_CLEARML:
    try:
        from clearml import Task
        from config import CLEARML_PROJECT_NAME, flatten_for_clearml

        task = Task.init(
            project_name=CLEARML_PROJECT_NAME,
            task_name="XGBoost Model Training",
            tags=["training", "xgboost"],
        )
        from config.clearml_enrich import enrich_task, upload_repro_artifacts

        enrich_task(task, role="train_xgboost")
        # Flatten the nested project config into "a/b"-style keys, then add
        # this run's hyperparameters so ClearML records them individually.
        flat = flatten_for_clearml()
        for k, v in CFG.get("xgb_params", {}).items():
            flat[f"xgb_params/{k}"] = v
        flat["model_name"] = CFG["model_name"]
        flat["seed"] = CFG["seed"]
        flat["split_ratios"] = str(CFG["split_ratios"])
        task.connect(flat)
        upload_repro_artifacts(task)
        if CLEARML_QUEUE:
            # Remote execution: hand the task to an agent queue and stop the
            # local process instead of training here.
            print(f"[ClearML] Enqueuing to queue '{CLEARML_QUEUE}'.")
            task.execute_remotely(queue_name=CLEARML_QUEUE)
            sys.exit(0)
    except ImportError:
        # clearml (or the project config helpers) not installed — degrade to
        # a plain local run with no experiment tracking.
        task = None
        USE_CLEARML = False
def set_seed(seed: int):
    """Seed Python's and NumPy's global RNGs for reproducible runs."""
    np.random.seed(seed)
    random.seed(seed)
def main() -> None:
    """Train an XGBoost classifier, evaluate it on the test split, and persist
    the checkpoint, JSON logs, per-sample predictions, and feature importances.

    All settings come from the module-level CFG dict. When a ClearML task is
    active (module-level `task` is not None), loss curves, test metrics, and
    plots are reported and every artifact plus the output model is uploaded.
    """
    set_seed(CFG["seed"])
    print(f"[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")
    # ── Data ──────────────────────────────────────────────────────
    # NOTE(review): `num_features` and `scaler` are unused below (scale is
    # False in CFG) — presumably kept for signature symmetry with the MLP
    # pipeline; confirm before removing.
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]
    # ── Model ─────────────────────────────────────────────────────
    model = build_xgb_classifier(CFG["seed"], verbosity=1, early_stopping_rounds=30)
    model.fit(
        X_train, y_train,
        # Both splits tracked so train/val curves can be logged below.
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    # best_iteration is only meaningful when early stopping fired; None-safe.
    best_it = getattr(model, "best_iteration", None)
    print(f"[TRAIN] Best iteration: {best_it} / {CFG['xgb_params']['n_estimators']}")
    # ── Evaluation ────────────────────────────────────────────────
    # evals_result keys follow eval_set order above:
    # validation_0 = train split, validation_1 = val split.
    evals = model.evals_result()
    eval_metric_name = CFG["xgb_params"]["eval_metric"]
    train_losses = evals["validation_0"][eval_metric_name]
    val_losses = evals["validation_1"][eval_metric_name]
    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average='weighted'))
    if num_classes > 2:
        # Multi-class AUC: one-vs-rest, weighted by class support.
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        # Binary AUC uses the positive-class probability column.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")
    # Dataset stats
    dataset_stats = {
        "train_size": len(y_train),
        "val_size": len(y_val),
        "test_size": len(y_test),
        # minlength pads classes absent from a split so lists align.
        "train_class_counts": np.bincount(y_train.astype(int), minlength=num_classes).tolist(),
        "val_class_counts": np.bincount(y_val.astype(int), minlength=num_classes).tolist(),
        "test_class_counts": np.bincount(y_test.astype(int), minlength=num_classes).tolist(),
    }
    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    cm = confusion_matrix(y_test, test_preds)
    y_test_i = y_test.astype(int)
    # Per-sample predictions + class probabilities, for offline analysis.
    pred_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_predictions.csv")
    with open(pred_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["y_true", "y_pred"] + [f"prob_{j}" for j in range(num_classes)])
        for i in range(len(y_test_i)):
            w.writerow(
                [int(y_test_i[i]), int(test_preds[i])]
                + [float(x) for x in test_probs[i]]
            )
    summary_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_metrics_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": "xgboost",
                "model_name": CFG["model_name"],
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
                "confusion_matrix": cm.tolist(),
                # Stored as the formatted report string, not a dict.
                "classification_report": classification_report(
                    y_test, test_preds, digits=4
                ),
            },
            f,
            indent=2,
        )
    # Pair importances with configured feature names; unknown model names
    # fall back to the face_orientation feature list.
    feat_names = list(
        SELECTED_FEATURES.get(CFG["model_name"], SELECTED_FEATURES["face_orientation"])
    )
    imp_vals = model.feature_importances_
    # min() guards against a name/importance length mismatch.
    imp_rows = [
        {"feature": feat_names[i], "importance": float(imp_vals[i])}
        for i in range(min(len(feat_names), len(imp_vals)))
    ]
    imp_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_feature_importance.json")
    with open(imp_path, "w", encoding="utf-8") as f:
        json.dump(imp_rows, f, indent=2)
    print(f"[LOG] Test predictions β†’ {pred_path}")
    # ── ClearML reporting (skipped entirely on plain local runs) ──
    if task is not None:
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
            task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
        task.logger.report_single_value("test/accuracy", test_acc)
        task.logger.report_single_value("test/f1_weighted", test_f1)
        task.logger.report_single_value("test/roc_auc", test_auc)
        # Scalar stats reported directly; per-class count lists are fanned
        # out to one value per class index.
        for key, val in dataset_stats.items():
            if isinstance(val, list):
                for i, v in enumerate(val):
                    task.logger.report_single_value(f"dataset/{key}/{i}", float(v))
            else:
                task.logger.report_single_value(f"dataset/{key}", float(val))
        prec, rec, f1_per_class, _ = precision_recall_fscore_support(
            y_test, test_preds, average=None, zero_division=0
        )
        for c in range(num_classes):
            task.logger.report_single_value(f"test/class_{c}_precision", float(prec[c]))
            task.logger.report_single_value(f"test/class_{c}_recall", float(rec[c]))
            task.logger.report_single_value(f"test/class_{c}_f1", float(f1_per_class[c]))
        # Lazy matplotlib import: only needed for ClearML plot uploads, and
        # the Agg backend avoids requiring a display.
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.imshow(cm, cmap="Blues")
        ax.set_xticks(range(num_classes))
        ax.set_yticks(range(num_classes))
        ax.set_xticklabels([f"Class {i}" for i in range(num_classes)])
        ax.set_yticklabels([f"Class {i}" for i in range(num_classes)])
        # Annotate each cell with its raw count.
        for i in range(num_classes):
            for j in range(num_classes):
                ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="black")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title("Test set confusion matrix")
        fig.tight_layout()
        task.logger.report_matplotlib_figure(title="Confusion Matrix", series="test", figure=fig, iteration=0)
        plt.close(fig)
        # ROC curve only makes sense for the binary case.
        if num_classes == 2:
            fpr, tpr, _ = roc_curve(y_test, test_probs[:, 1])
            fig_r, ax_r = plt.subplots(figsize=(6, 5))
            ax_r.plot(fpr, tpr, label=f"ROC-AUC = {test_auc:.4f}")
            ax_r.plot([0, 1], [0, 1], "k--", lw=1)
            ax_r.set_xlabel("False positive rate")
            ax_r.set_ylabel("True positive rate")
            ax_r.set_title("Test ROC (XGBoost)")
            ax_r.legend(loc="lower right")
            fig_r.tight_layout()
            task.logger.report_matplotlib_figure(
                title="ROC", series="test", figure=fig_r, iteration=0
            )
            plt.close(fig_r)
        task.logger.flush()
    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")
    # ── Write JSON log (same schema as MLP) ───────────────────────
    # pandas-free tree/node count (trees_to_dataframe() needs pandas)
    booster = model.get_booster()
    tree_count = int(booster.num_boosted_rounds())
    # Each line of a dumped tree is one node, so nodes = newline count + 1.
    node_count = int(sum(tree.count("\n") + 1 for tree in booster.get_dump()))
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        # "param_count" mirrors the MLP schema; for trees it is node count.
        "param_count": node_count,
        "tree_count": tree_count,
        "xgb_params": CFG["xgb_params"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
        "dataset_stats": dataset_stats,
    }
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")
    # ── ClearML artifacts + registered output model ───────────────
    if task is not None:
        from clearml import OutputModel
        from config.clearml_enrich import attach_output_metrics, task_done_summary
        task.upload_artifact(name="xgboost_model", artifact_object=model_path)
        task.upload_artifact(name="training_log", artifact_object=log_path)
        task.upload_artifact(name="test_predictions", artifact_object=pred_path)
        task.upload_artifact(name="test_metrics_summary", artifact_object=summary_path)
        task.upload_artifact(name="feature_importance", artifact_object=imp_path)
        out_model = OutputModel(
            task=task, name=f"XGBoost_{CFG['model_name']}", framework="XGBoost"
        )
        # Keep the local checkpoint after upload.
        out_model.update_weights(weights_filename=model_path, auto_delete_file=False)
        attach_output_metrics(
            out_model,
            {
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
            },
        )
        task_done_summary(
            task,
            f"XGBoost {CFG['model_name']}: test acc={test_acc:.4f}, F1={test_f1:.4f}, ROC-AUC={test_auc:.4f}",
        )
# Script entry point.
if __name__ == "__main__":
    main()