Spaces:
Sleeping
Sleeping
| import csv | |
| import json | |
| import os | |
| import random | |
| import sys | |
| import numpy as np | |
| from sklearn.metrics import ( | |
| classification_report, | |
| confusion_matrix, | |
| f1_score, | |
| precision_recall_fscore_support, | |
| roc_auc_score, | |
| roc_curve, | |
| ) | |
| from data_preparation.prepare_dataset import get_numpy_splits, SELECTED_FEATURES | |
| from models.xgboost.config import XGB_BASE_PARAMS, build_xgb_classifier | |
# Repository root, resolved two levels up from this file (models/xgboost/ -> root).
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
def _load_cfg():
    """Build the run configuration for XGBoost training.

    Prefers values from the project-level ``config`` module; falls back to
    the hard-coded defaults below when that module (or any lookup) fails,
    so the script stays runnable standalone.

    Returns:
        dict with keys ``model_name``, ``seed``, ``split_ratios``, ``scale``,
        ``checkpoints_dir``, ``logs_dir`` and ``xgb_params``.
    """
    # Single source of truth for the fallback values (previously duplicated
    # in both the try and except branches).
    defaults = {
        "model_name": "face_orientation",
        "seed": 42,
        "split_ratios": (0.7, 0.15, 0.15),
        "scale": False,
        "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
        "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
        "xgb_params": dict(XGB_BASE_PARAMS),
    }
    try:
        from config import get

        data = get("data") or {}
        cfg = dict(defaults)
        cfg["model_name"] = get("mlp.model_name") or defaults["model_name"]
        # Treat only None as "unset" — a configured seed of 0 is valid and
        # must not be replaced by the default (the old `or 42` did that).
        seed = get("mlp.seed")
        cfg["seed"] = defaults["seed"] if seed is None else seed
        cfg["split_ratios"] = tuple(data.get("split_ratios", list(defaults["split_ratios"])))
        return cfg
    except Exception:
        # Best-effort fallback: config module missing or lookup failed.
        return defaults
# Resolved run configuration (project config values or built-in fallbacks).
CFG = _load_cfg()
# ClearML is opt-in: enabled via USE_CLEARML=1, or implicitly when this
# process is already running inside a ClearML agent (CLEARML_TASK_ID set).
USE_CLEARML = os.environ.get("USE_CLEARML", "0") == "1" or bool(os.environ.get("CLEARML_TASK_ID"))
# Non-empty queue name requests remote execution instead of a local run.
CLEARML_QUEUE = os.environ.get("CLEARML_QUEUE", "")
# ClearML experiment bootstrap. `task` stays None when tracking is disabled
# or the clearml package is not installed; main() checks it before logging.
task = None
if USE_CLEARML:
    try:
        from clearml import Task
        from config import CLEARML_PROJECT_NAME, flatten_for_clearml
        task = Task.init(
            project_name=CLEARML_PROJECT_NAME,
            task_name="XGBoost Model Training",
            tags=["training", "xgboost"],
        )
        from config.clearml_enrich import enrich_task, upload_repro_artifacts
        enrich_task(task, role="train_xgboost")
        # Flatten the project config and fold in the XGBoost hyperparameters
        # so everything shows up as editable task parameters in the UI.
        flat = flatten_for_clearml()
        for k, v in CFG.get("xgb_params", {}).items():
            flat[f"xgb_params/{k}"] = v
        flat["model_name"] = CFG["model_name"]
        flat["seed"] = CFG["seed"]
        flat["split_ratios"] = str(CFG["split_ratios"])
        task.connect(flat)
        upload_repro_artifacts(task)
        if CLEARML_QUEUE:
            # Hand the task off to a remote agent; the local process must
            # exit immediately so training does not also run here.
            print(f"[ClearML] Enqueuing to queue '{CLEARML_QUEUE}'.")
            task.execute_remotely(queue_name=CLEARML_QUEUE)
            sys.exit(0)
    except ImportError:
        # clearml (or the config helpers) not installed — run without tracking.
        task = None
        USE_CLEARML = False
def set_seed(seed: int):
    """Seed Python's and NumPy's global RNGs so runs are reproducible."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
def main():
    """Train an XGBoost classifier and persist metrics, logs and checkpoint.

    Pipeline: seed RNGs, load numpy train/val/test splits, fit with early
    stopping monitored on the validation set, evaluate on the held-out test
    set, write prediction/metric/feature-importance files under the logs
    directory, save the model under the checkpoints directory, and mirror
    scalars, plots and artifacts to ClearML when a task is active.
    """
    set_seed(CFG["seed"])
    print(f"[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")
    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]
    # ── Model ─────────────────────────────────────────────────────
    model = build_xgb_classifier(CFG["seed"], verbosity=1, early_stopping_rounds=30)
    # eval_set order fixes the names used later: validation_0 = train,
    # validation_1 = val (early stopping tracks the last eval_set entry).
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    # best_iteration is only present when early stopping actually triggered.
    best_it = getattr(model, "best_iteration", None)
    print(f"[TRAIN] Best iteration: {best_it} / {CFG['xgb_params']['n_estimators']}")
    # ── Evaluation ────────────────────────────────────────────────
    evals = model.evals_result()
    # NOTE(review): assumes eval_metric in XGB_BASE_PARAMS is a single
    # metric name (str), not a list — confirm against the config module.
    eval_metric_name = CFG["xgb_params"]["eval_metric"]
    train_losses = evals["validation_0"][eval_metric_name]
    val_losses = evals["validation_1"][eval_metric_name]
    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average='weighted'))
    # Multiclass AUC needs the full probability matrix (one-vs-rest);
    # the binary case takes only the positive-class column.
    if num_classes > 2:
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")
    # Dataset stats (sizes + per-class counts for each split, used both in
    # the JSON training log and as ClearML single values).
    dataset_stats = {
        "train_size": len(y_train),
        "val_size": len(y_val),
        "test_size": len(y_test),
        "train_class_counts": np.bincount(y_train.astype(int), minlength=num_classes).tolist(),
        "val_class_counts": np.bincount(y_val.astype(int), minlength=num_classes).tolist(),
        "test_class_counts": np.bincount(y_test.astype(int), minlength=num_classes).tolist(),
    }
    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    cm = confusion_matrix(y_test, test_preds)
    y_test_i = y_test.astype(int)
    # Per-row test predictions: true label, predicted label, per-class probs.
    pred_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_predictions.csv")
    with open(pred_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["y_true", "y_pred"] + [f"prob_{j}" for j in range(num_classes)])
        for i in range(len(y_test_i)):
            w.writerow(
                [int(y_test_i[i]), int(test_preds[i])]
                + [float(x) for x in test_probs[i]]
            )
    # Compact JSON summary of headline test metrics.
    summary_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_metrics_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": "xgboost",
                "model_name": CFG["model_name"],
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
                "confusion_matrix": cm.tolist(),
                "classification_report": classification_report(
                    y_test, test_preds, digits=4
                ),
            },
            f,
            indent=2,
        )
    # Feature importances, paired with the feature names used for this task
    # (falls back to the face_orientation feature list if the task is unknown).
    feat_names = list(
        SELECTED_FEATURES.get(CFG["model_name"], SELECTED_FEATURES["face_orientation"])
    )
    imp_vals = model.feature_importances_
    # min() guards against a name/importance length mismatch.
    imp_rows = [
        {"feature": feat_names[i], "importance": float(imp_vals[i])}
        for i in range(min(len(feat_names), len(imp_vals)))
    ]
    imp_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_feature_importance.json")
    with open(imp_path, "w", encoding="utf-8") as f:
        json.dump(imp_rows, f, indent=2)
    print(f"[LOG] Test predictions β {pred_path}")
    # ── ClearML reporting (skipped when tracking is disabled) ─────
    if task is not None:
        # Replay the per-round eval history as Loss curves.
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
            task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
        task.logger.report_single_value("test/accuracy", test_acc)
        task.logger.report_single_value("test/f1_weighted", test_f1)
        task.logger.report_single_value("test/roc_auc", test_auc)
        for key, val in dataset_stats.items():
            if isinstance(val, list):
                for i, v in enumerate(val):
                    task.logger.report_single_value(f"dataset/{key}/{i}", float(v))
            else:
                task.logger.report_single_value(f"dataset/{key}", float(val))
        prec, rec, f1_per_class, _ = precision_recall_fscore_support(
            y_test, test_preds, average=None, zero_division=0
        )
        for c in range(num_classes):
            task.logger.report_single_value(f"test/class_{c}_precision", float(prec[c]))
            task.logger.report_single_value(f"test/class_{c}_recall", float(rec[c]))
            task.logger.report_single_value(f"test/class_{c}_f1", float(f1_per_class[c]))
        # Import matplotlib lazily and force the headless backend so this
        # works on agents without a display.
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.imshow(cm, cmap="Blues")
        ax.set_xticks(range(num_classes))
        ax.set_yticks(range(num_classes))
        ax.set_xticklabels([f"Class {i}" for i in range(num_classes)])
        ax.set_yticklabels([f"Class {i}" for i in range(num_classes)])
        for i in range(num_classes):
            for j in range(num_classes):
                ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="black")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title("Test set confusion matrix")
        fig.tight_layout()
        task.logger.report_matplotlib_figure(title="Confusion Matrix", series="test", figure=fig, iteration=0)
        plt.close(fig)
        # ROC curve only makes sense for the binary case.
        if num_classes == 2:
            fpr, tpr, _ = roc_curve(y_test, test_probs[:, 1])
            fig_r, ax_r = plt.subplots(figsize=(6, 5))
            ax_r.plot(fpr, tpr, label=f"ROC-AUC = {test_auc:.4f}")
            ax_r.plot([0, 1], [0, 1], "k--", lw=1)
            ax_r.set_xlabel("False positive rate")
            ax_r.set_ylabel("True positive rate")
            ax_r.set_title("Test ROC (XGBoost)")
            ax_r.legend(loc="lower right")
            fig_r.tight_layout()
            task.logger.report_matplotlib_figure(
                title="ROC", series="test", figure=fig_r, iteration=0
            )
            plt.close(fig_r)
        task.logger.flush()
    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")
    # ── Write JSON log (same schema as MLP) ───────────────────────
    # pandas-free tree/node count (trees_to_dataframe() needs pandas)
    booster = model.get_booster()
    tree_count = int(booster.num_boosted_rounds())
    # Each dumped tree is one node per line; total nodes stands in for
    # "param_count" to mirror the MLP log schema.
    node_count = int(sum(tree.count("\n") + 1 for tree in booster.get_dump()))
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": node_count,
        "tree_count": tree_count,
        "xgb_params": CFG["xgb_params"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
        "dataset_stats": dataset_stats,
    }
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")
    # ── ClearML artifacts + output model registration ─────────────
    if task is not None:
        from clearml import OutputModel
        from config.clearml_enrich import attach_output_metrics, task_done_summary
        task.upload_artifact(name="xgboost_model", artifact_object=model_path)
        task.upload_artifact(name="training_log", artifact_object=log_path)
        task.upload_artifact(name="test_predictions", artifact_object=pred_path)
        task.upload_artifact(name="test_metrics_summary", artifact_object=summary_path)
        task.upload_artifact(name="feature_importance", artifact_object=imp_path)
        out_model = OutputModel(
            task=task, name=f"XGBoost_{CFG['model_name']}", framework="XGBoost"
        )
        out_model.update_weights(weights_filename=model_path, auto_delete_file=False)
        attach_output_metrics(
            out_model,
            {
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
            },
        )
        task_done_summary(
            task,
            f"XGBoost {CFG['model_name']}: test acc={test_acc:.4f}, F1={test_f1:.4f}, ROC-AUC={test_auc:.4f}",
        )
# Script entry point — keeps import of this module side-effect free
# beyond config loading and optional ClearML task init above.
if __name__ == "__main__":
    main()