Production UI/telemetry upgrade + monochrome theme + safety hardening
Browse files- .env.example +14 -0
- PRODUCTION.md +51 -0
- README.md +37 -1
- app.py +704 -143
- requirements.txt +1 -0
- scripts/eval_sota.py +7 -1
- scripts/preflight_check.py +204 -0
- scripts/train_sota.py +30 -6
- tests/test_core_utils.py +246 -0
.env.example
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face authentication (set at runtime; do not commit real tokens)
|
| 2 |
+
HF_TOKEN=
|
| 3 |
+
# HUGGINGFACE_HUB_TOKEN=
|
| 4 |
+
|
| 5 |
+
# Continuous mode safety controls
|
| 6 |
+
# Seconds to wait between auto-restart cycles.
|
| 7 |
+
CONTINUOUS_RESTART_DELAY_SECONDS=15
|
| 8 |
+
# Stop continuous mode after this many consecutive non-success cycles.
|
| 9 |
+
CONTINUOUS_MAX_CONSECUTIVE_FAILURES=3
|
| 10 |
+
|
| 11 |
+
# Max retained characters in live app log buffer.
|
| 12 |
+
APP_LOG_MAX_CHARS=200000
|
| 13 |
+
# Max number of entries retained in workspace/runtime/run_history.json.
|
| 14 |
+
RUN_HISTORY_LIMIT=80
|
PRODUCTION.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Production Runbook
|
| 2 |
+
|
| 3 |
+
## 1. Pre-Deploy Checks
|
| 4 |
+
|
| 5 |
+
Run all checks from `space_trainer/`:
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
python scripts/preflight_check.py
|
| 9 |
+
python -m unittest discover -s tests -v
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
Optional deeper check (loads tokenizer/model dependencies and runs stage-1 dry-run):
|
| 13 |
+
|
| 14 |
+
```bash
|
| 15 |
+
python scripts/preflight_check.py --run-training-dry-run
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## 2. Runtime Configuration
|
| 19 |
+
|
| 20 |
+
Set runtime secrets in Hugging Face Space settings:
|
| 21 |
+
|
| 22 |
+
- `HF_TOKEN` (or `HUGGINGFACE_HUB_TOKEN`)
|
| 23 |
+
|
| 24 |
+
Optional safety overrides:
|
| 25 |
+
|
| 26 |
+
- `CONTINUOUS_RESTART_DELAY_SECONDS` (default `15`)
|
| 27 |
+
- `CONTINUOUS_MAX_CONSECUTIVE_FAILURES` (default `3`)
|
| 28 |
+
- `APP_LOG_MAX_CHARS` (default `200000`)
|
| 29 |
+
- `RUN_HISTORY_LIMIT` (default `80`)
|
| 30 |
+
|
| 31 |
+
## 3. Release Checklist
|
| 32 |
+
|
| 33 |
+
1. Ensure pre-deploy checks are green.
|
| 34 |
+
2. Ensure `requirements.txt` includes all runtime dependencies.
|
| 35 |
+
3. Deploy Space files (exclude `workspace/` artifacts).
|
| 36 |
+
4. Wait for Space runtime stage to reach `RUNNING`.
|
| 37 |
+
5. Trigger a UI preflight run (`Validation Mode (No Training)`).
|
| 38 |
+
6. Trigger one non-autonomous single-stage run before enabling continuous autonomous mode.
|
| 39 |
+
7. Confirm `workspace/runtime/run_history.json` is being updated and recent run cards render in telemetry.
|
| 40 |
+
|
| 41 |
+
## 4. Rollback Strategy
|
| 42 |
+
|
| 43 |
+
1. Re-deploy the last known good commit to the Space.
|
| 44 |
+
2. Disable `Continuous Auto-Restart`.
|
| 45 |
+
3. Run preflight mode only until health is restored.
|
| 46 |
+
4. Re-enable autonomous/continuous mode after one successful full run.
|
| 47 |
+
|
| 48 |
+
## 5. Operational Notes
|
| 49 |
+
|
| 50 |
+
- Full run records are persisted under `workspace/runtime/run_records/`.
|
| 51 |
+
- The compact run index at `workspace/runtime/run_history.json` is capped by `RUN_HISTORY_LIMIT`.
|
README.md
CHANGED
|
@@ -39,7 +39,7 @@ Credentials and publish permissions are handled by deployment runtime settings.
|
|
| 39 |
- `Run Evaluation After Training`: toggles post-train eval in runtime config.
|
| 40 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 41 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 42 |
-
- `Live Telemetry`: real-time stage progression, runtime posture,
|
| 43 |
- `Run Log (Live Log + Summary JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured run summary.
|
| 44 |
- `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
|
| 45 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
|
@@ -52,8 +52,44 @@ Credentials and publish permissions are handled by deployment runtime settings.
|
|
| 52 |
- final adapter: `workspace/runs/math-conjecture-sota/final_adapter`
|
| 53 |
- training summary: `workspace/runs/math-conjecture-sota/training_summary.json`
|
| 54 |
- post-eval report: `workspace/runs/math-conjecture-sota/post_eval_report.json`
|
|
|
|
|
|
|
| 55 |
|
| 56 |
## Notes
|
| 57 |
|
| 58 |
- Full training runs on GPU when available and automatically falls back to CPU mode when CUDA is unavailable.
|
| 59 |
- App handles Gradio copy-button compatibility across versions automatically.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
- `Run Evaluation After Training`: toggles post-train eval in runtime config.
|
| 40 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 41 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 42 |
+
- `Live Telemetry`: real-time stage progression, runtime posture, training-loss graph (sparkline), artifact index, and recent-run outcomes.
|
| 43 |
- `Run Log (Live Log + Summary JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured run summary.
|
| 44 |
- `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
|
| 45 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
|
|
|
| 52 |
- final adapter: `workspace/runs/math-conjecture-sota/final_adapter`
|
| 53 |
- training summary: `workspace/runs/math-conjecture-sota/training_summary.json`
|
| 54 |
- post-eval report: `workspace/runs/math-conjecture-sota/post_eval_report.json`
|
| 55 |
+
- run history index: `workspace/runtime/run_history.json`
|
| 56 |
+
- per-run records: `workspace/runtime/run_records/<run_label>.json`
|
| 57 |
|
| 58 |
## Notes
|
| 59 |
|
| 60 |
- Full training runs on GPU when available and automatically falls back to CPU mode when CUDA is unavailable.
|
| 61 |
- App handles Gradio copy-button compatibility across versions automatically.
|
| 62 |
+
|
| 63 |
+
## Production Hardening
|
| 64 |
+
|
| 65 |
+
Before deployment, run:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
python scripts/preflight_check.py
|
| 69 |
+
python -m unittest discover -s tests -v
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
Optional deeper validation (runs stage-1 dry-run training check):
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
python scripts/preflight_check.py --run-training-dry-run
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Continuous mode now includes two production fail-safes:
|
| 79 |
+
|
| 80 |
+
- restart cooldown between cycles (default `15s`)
|
| 81 |
+
- circuit breaker that stops after consecutive non-success cycles (default `3`)
|
| 82 |
+
|
| 83 |
+
Environment overrides:
|
| 84 |
+
|
| 85 |
+
- `CONTINUOUS_RESTART_DELAY_SECONDS` (integer, `>=0`)
|
| 86 |
+
- `CONTINUOUS_MAX_CONSECUTIVE_FAILURES` (integer, `>=1`)
|
| 87 |
+
- `APP_LOG_MAX_CHARS` (integer, `>=20000`; default `200000`)
|
| 88 |
+
- `RUN_HISTORY_LIMIT` (integer, `>=5`; default `80`)
|
| 89 |
+
|
| 90 |
+
Recommended runtime secrets posture:
|
| 91 |
+
|
| 92 |
+
- set `HF_TOKEN` / `HUGGINGFACE_HUB_TOKEN` from Space secrets
|
| 93 |
+
- avoid storing long-lived API tokens in repository files
|
| 94 |
+
|
| 95 |
+
Detailed deployment/rollback steps are documented in `PRODUCTION.md`.
|
app.py
CHANGED
|
@@ -16,6 +16,7 @@ import subprocess
|
|
| 16 |
import sys
|
| 17 |
import threading
|
| 18 |
import time
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Any, Dict, Generator, List, Optional, Tuple
|
| 21 |
|
|
@@ -29,6 +30,9 @@ ROOT = Path(__file__).resolve().parent
|
|
| 29 |
WORKSPACE_DIR = ROOT / "workspace"
|
| 30 |
DATA_DIR = WORKSPACE_DIR / "data" / "releases" / "v1"
|
| 31 |
RUNTIME_DIR = WORKSPACE_DIR / "runtime"
|
|
|
|
|
|
|
|
|
|
| 32 |
CONFIG_TEMPLATE = ROOT / "configs" / "deepseek_math_sota.yaml"
|
| 33 |
TRAIN_SCRIPT = ROOT / "scripts" / "train_sota.py"
|
| 34 |
EVAL_SCRIPT = ROOT / "scripts" / "eval_sota.py"
|
|
@@ -48,6 +52,26 @@ CANCEL_REQUESTED = False
|
|
| 48 |
ACTIVE_PROCESS: Optional[subprocess.Popen] = None
|
| 49 |
ACTIVE_RUN_LABEL = ""
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
PROJECT_DESCRIPTION = """
|
| 52 |
## Pipeline Scope
|
| 53 |
This app runs the full training pipeline for the `maths-conjuncture-solutions` project.
|
|
@@ -104,131 +128,131 @@ UI_CSS = """
|
|
| 104 |
@import url("https://fonts.googleapis.com/css2?family=Rajdhani:wght@500;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap");
|
| 105 |
|
| 106 |
:root {
|
| 107 |
-
--nt-bg: #
|
| 108 |
-
--nt-bg-2: #
|
| 109 |
-
--nt-bg-radial: rgba(
|
| 110 |
-
--nt-panel: #
|
| 111 |
-
--nt-panel-2: #
|
| 112 |
-
--nt-border: #
|
| 113 |
-
--nt-border-strong: #
|
| 114 |
-
--nt-text: #
|
| 115 |
-
--nt-muted: #
|
| 116 |
-
--nt-accent: #
|
| 117 |
-
--nt-accent-2: #
|
| 118 |
-
--nt-shadow: rgba(0, 0, 0, 0.
|
| 119 |
-
--nt-hero-border: #
|
| 120 |
-
--nt-hero-bg-1: #
|
| 121 |
-
--nt-hero-bg-2: #
|
| 122 |
-
--nt-hero-bg-3: #
|
| 123 |
-
--nt-hero-text: #
|
| 124 |
-
--nt-hero-topline: #
|
| 125 |
-
--nt-hero-body: #
|
| 126 |
-
--nt-hero-grid: rgba(
|
| 127 |
-
--nt-chip-border: rgba(
|
| 128 |
-
--nt-chip-bg: rgba(
|
| 129 |
-
--nt-chip-text: #
|
| 130 |
-
--nt-input-bg: #
|
| 131 |
-
--nt-focus-ring: rgba(
|
| 132 |
-
--nt-btn-border: #
|
| 133 |
-
--nt-btn-bg-1: #
|
| 134 |
-
--nt-btn-bg-2: #
|
| 135 |
-
--nt-btn-text: #
|
| 136 |
-
--nt-primary-btn-border: #
|
| 137 |
-
--nt-primary-btn-bg-1: #
|
| 138 |
-
--nt-primary-btn-bg-2: #
|
| 139 |
-
--nt-primary-btn-text: #
|
| 140 |
-
--nt-stop-btn-border: #
|
| 141 |
-
--nt-stop-btn-bg-1: #
|
| 142 |
-
--nt-stop-btn-bg-2: #
|
| 143 |
-
--nt-stop-btn-text: #
|
| 144 |
-
--nt-ops-panel-1: #
|
| 145 |
-
--nt-ops-panel-2: #
|
| 146 |
-
--nt-card-border: #
|
| 147 |
-
--nt-card-bg-1: #
|
| 148 |
-
--nt-card-bg-2: #
|
| 149 |
-
--nt-metric-text: #
|
| 150 |
-
--nt-meter-border: #
|
| 151 |
-
--nt-meter-bg: #
|
| 152 |
-
--nt-meter-fill-1: #
|
| 153 |
-
--nt-meter-fill-2: #
|
| 154 |
-
--nt-spark-border: #
|
| 155 |
-
--nt-spark-bg: #
|
| 156 |
-
--nt-spark-stroke: #
|
| 157 |
-
--nt-settings-border: rgba(
|
| 158 |
-
--nt-settings-bg: rgba(
|
| 159 |
-
--nt-settings-bg-hover: rgba(
|
| 160 |
-
--nt-settings-text: #
|
| 161 |
-
--nt-settings-menu-bg: #
|
| 162 |
-
--nt-settings-menu-border: #
|
| 163 |
-
--nt-settings-menu-shadow: rgba(0, 0, 0, 0.
|
| 164 |
-
--nt-settings-menu-btn: #
|
| 165 |
-
--nt-settings-menu-btn-hover: #
|
| 166 |
-
--nt-settings-menu-btn-active-bg: #
|
| 167 |
-
--nt-settings-menu-btn-active-text: #
|
| 168 |
}
|
| 169 |
|
| 170 |
:root[data-nt-theme="light"] {
|
| 171 |
-
--nt-bg: #
|
| 172 |
-
--nt-bg-2: #
|
| 173 |
-
--nt-bg-radial: rgba(255, 255, 255, 0.
|
| 174 |
-
--nt-panel: #
|
| 175 |
-
--nt-panel-2: #
|
| 176 |
-
--nt-border: #
|
| 177 |
-
--nt-border-strong: #
|
| 178 |
-
--nt-text: #
|
| 179 |
-
--nt-muted: #
|
| 180 |
-
--nt-accent: #
|
| 181 |
-
--nt-accent-2: #
|
| 182 |
-
--nt-shadow: rgba(
|
| 183 |
-
--nt-hero-border: #
|
| 184 |
-
--nt-hero-bg-1: #
|
| 185 |
-
--nt-hero-bg-2: #
|
| 186 |
-
--nt-hero-bg-3: #
|
| 187 |
-
--nt-hero-text: #
|
| 188 |
-
--nt-hero-topline: #
|
| 189 |
-
--nt-hero-body: #
|
| 190 |
-
--nt-hero-grid: rgba(
|
| 191 |
-
--nt-chip-border: rgba(
|
| 192 |
-
--nt-chip-bg: rgba(
|
| 193 |
-
--nt-chip-text: #
|
| 194 |
-
--nt-input-bg: #
|
| 195 |
-
--nt-focus-ring: rgba(
|
| 196 |
-
--nt-btn-border: #
|
| 197 |
-
--nt-btn-bg-1: #
|
| 198 |
-
--nt-btn-bg-2: #
|
| 199 |
-
--nt-btn-text: #
|
| 200 |
-
--nt-primary-btn-border: #
|
| 201 |
-
--nt-primary-btn-bg-1: #
|
| 202 |
-
--nt-primary-btn-bg-2: #
|
| 203 |
-
--nt-primary-btn-text: #
|
| 204 |
-
--nt-stop-btn-border: #
|
| 205 |
-
--nt-stop-btn-bg-1: #
|
| 206 |
-
--nt-stop-btn-bg-2: #
|
| 207 |
-
--nt-stop-btn-text: #
|
| 208 |
-
--nt-ops-panel-1: #
|
| 209 |
-
--nt-ops-panel-2: #
|
| 210 |
-
--nt-card-border: #
|
| 211 |
-
--nt-card-bg-1: #
|
| 212 |
-
--nt-card-bg-2: #
|
| 213 |
-
--nt-metric-text: #
|
| 214 |
-
--nt-meter-border: #
|
| 215 |
-
--nt-meter-bg: #
|
| 216 |
-
--nt-meter-fill-1: #
|
| 217 |
-
--nt-meter-fill-2: #
|
| 218 |
-
--nt-spark-border: #
|
| 219 |
-
--nt-spark-bg: #
|
| 220 |
-
--nt-spark-stroke: #
|
| 221 |
-
--nt-settings-border: rgba(
|
| 222 |
-
--nt-settings-bg: rgba(
|
| 223 |
-
--nt-settings-bg-hover: rgba(
|
| 224 |
-
--nt-settings-text: #
|
| 225 |
-
--nt-settings-menu-bg: #
|
| 226 |
-
--nt-settings-menu-border: #
|
| 227 |
-
--nt-settings-menu-shadow: rgba(
|
| 228 |
-
--nt-settings-menu-btn: #
|
| 229 |
-
--nt-settings-menu-btn-hover: #
|
| 230 |
-
--nt-settings-menu-btn-active-bg: #
|
| 231 |
-
--nt-settings-menu-btn-active-text: #
|
| 232 |
}
|
| 233 |
|
| 234 |
@keyframes ntFadeUp {
|
|
@@ -246,8 +270,8 @@ UI_CSS = """
|
|
| 246 |
color: var(--nt-text) !important;
|
| 247 |
font-family: "Rajdhani", "Segoe UI", sans-serif !important;
|
| 248 |
background:
|
| 249 |
-
linear-gradient(rgba(
|
| 250 |
-
linear-gradient(90deg, rgba(
|
| 251 |
radial-gradient(circle at 12% 10%, var(--nt-bg-radial) 0%, rgba(255, 255, 255, 0) 44%),
|
| 252 |
linear-gradient(145deg, var(--nt-bg) 0%, var(--nt-bg-2) 100%) !important;
|
| 253 |
background-size: 24px 24px, 24px 24px, 100% 100%, 100% 100% !important;
|
|
@@ -266,7 +290,7 @@ UI_CSS = """
|
|
| 266 |
background: linear-gradient(180deg, var(--nt-panel) 0%, var(--nt-panel-2) 100%) !important;
|
| 267 |
border: 1px solid var(--nt-border) !important;
|
| 268 |
border-radius: 8px !important;
|
| 269 |
-
box-shadow: inset 0 0 0 1px rgba(
|
| 270 |
}
|
| 271 |
|
| 272 |
.app-hero {
|
|
@@ -277,7 +301,7 @@ UI_CSS = """
|
|
| 277 |
background: linear-gradient(132deg, var(--nt-hero-bg-1) 0%, var(--nt-hero-bg-2) 62%, var(--nt-hero-bg-3) 100%);
|
| 278 |
color: var(--nt-hero-text);
|
| 279 |
padding: 22px 24px 20px;
|
| 280 |
-
box-shadow: inset 0 0 0 1px rgba(
|
| 281 |
animation: ntFadeUp 500ms ease-out both;
|
| 282 |
}
|
| 283 |
|
|
@@ -425,7 +449,7 @@ UI_CSS = """
|
|
| 425 |
.nt-settings-menu .nt-mode-btn.active {
|
| 426 |
background: var(--nt-settings-menu-btn-active-bg) !important;
|
| 427 |
color: var(--nt-settings-menu-btn-active-text) !important;
|
| 428 |
-
box-shadow: inset 0 0 0 1px rgba(
|
| 429 |
}
|
| 430 |
|
| 431 |
.section-copy {
|
|
@@ -524,7 +548,7 @@ UI_CSS = """
|
|
| 524 |
|
| 525 |
.gradio-container button:hover {
|
| 526 |
transform: translateY(-1px);
|
| 527 |
-
box-shadow: 0 0 0 1px rgba(
|
| 528 |
filter: brightness(1.03);
|
| 529 |
}
|
| 530 |
|
|
@@ -535,7 +559,7 @@ UI_CSS = """
|
|
| 535 |
background: linear-gradient(180deg, var(--nt-ops-panel-1) 0%, var(--nt-ops-panel-2) 100%);
|
| 536 |
padding: 14px;
|
| 537 |
animation: ntFadeUp 420ms ease-out both;
|
| 538 |
-
box-shadow: inset 0 0 0 1px rgba(
|
| 539 |
}
|
| 540 |
|
| 541 |
.ops-visual-head {
|
|
@@ -575,7 +599,11 @@ UI_CSS = """
|
|
| 575 |
padding: 9px 10px;
|
| 576 |
min-height: 86px;
|
| 577 |
animation: ntFadeUp 460ms ease-out both;
|
| 578 |
-
box-shadow: inset 0 0 0 1px rgba(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
}
|
| 580 |
|
| 581 |
.ops-k {
|
|
@@ -636,6 +664,128 @@ UI_CSS = """
|
|
| 636 |
display: block;
|
| 637 |
}
|
| 638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
.gradio-container [data-testid="footer"],
|
| 640 |
.gradio-container .built-with {
|
| 641 |
display: none !important;
|
|
@@ -691,6 +841,10 @@ UI_CSS = """
|
|
| 691 |
.ops-grid {
|
| 692 |
grid-template-columns: 1fr;
|
| 693 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
}
|
| 695 |
"""
|
| 696 |
|
|
@@ -963,14 +1117,14 @@ DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
|
|
| 963 |
|
| 964 |
|
| 965 |
def now_ts() -> str:
|
| 966 |
-
return dt.datetime.
|
| 967 |
|
| 968 |
|
| 969 |
def append_log(lines: List[str], message: str) -> str:
|
| 970 |
lines.append(f"[{now_ts()}] {message}")
|
| 971 |
text = "\n".join(lines)
|
| 972 |
-
if len(text) >
|
| 973 |
-
text = text[-
|
| 974 |
return text
|
| 975 |
|
| 976 |
|
|
@@ -991,7 +1145,7 @@ def compose_ops_console(log_text: str, summary_json: str) -> str:
|
|
| 991 |
return payload
|
| 992 |
|
| 993 |
|
| 994 |
-
def _merge_log_chunk(existing: str, chunk: str, max_chars: int =
|
| 995 |
if not chunk:
|
| 996 |
return existing
|
| 997 |
merged = existing
|
|
@@ -1031,6 +1185,246 @@ def _fmt_float(value: Any, digits: int = 3) -> str:
|
|
| 1031 |
return "--"
|
| 1032 |
|
| 1033 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
|
| 1035 |
losses: List[float] = []
|
| 1036 |
for line in log_text.splitlines():
|
|
@@ -1137,18 +1531,67 @@ def _infer_stage_snapshot(summary: Dict[str, Any], log_text: str) -> Dict[str, A
|
|
| 1137 |
}
|
| 1138 |
|
| 1139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1140 |
def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str) -> str:
|
| 1141 |
safe_summary = _as_dict(summary)
|
| 1142 |
runtime = _as_dict(safe_summary.get("runtime"))
|
| 1143 |
quality_gate = _as_dict(safe_summary.get("quality_gate"))
|
| 1144 |
evaluation = _as_dict(safe_summary.get("evaluation"))
|
| 1145 |
push_report = _as_dict(safe_summary.get("push"))
|
|
|
|
| 1146 |
|
| 1147 |
run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
|
| 1148 |
status_value = html.escape(status_text or "Idle")
|
| 1149 |
runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU FALLBACK"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1150 |
runtime_mode = html.escape(runtime_mode)
|
| 1151 |
device_count = _safe_int(runtime.get("cuda_device_count"), 0)
|
|
|
|
|
|
|
| 1152 |
|
| 1153 |
gate_enabled = bool(quality_gate.get("enabled"))
|
| 1154 |
gate_passed = quality_gate.get("passed")
|
|
@@ -1177,6 +1620,7 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 1177 |
pass_k = _fmt_pct(evaluation.get("pass_at_k"))
|
| 1178 |
pass_1 = _fmt_pct(evaluation.get("pass_at_1"))
|
| 1179 |
exact_k = _fmt_pct(evaluation.get("exact_at_k"))
|
|
|
|
| 1180 |
|
| 1181 |
push_state = "Pending"
|
| 1182 |
if push_report:
|
|
@@ -1189,6 +1633,30 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 1189 |
else:
|
| 1190 |
push_state = "Blocked"
|
| 1191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1192 |
return f"""
|
| 1193 |
<div class="ops-visual">
|
| 1194 |
<div class="ops-visual-head">
|
|
@@ -1206,6 +1674,11 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 1206 |
<div class="ops-v">{runtime_mode}</div>
|
| 1207 |
<div class="ops-v-small">cuda devices: {device_count}</div>
|
| 1208 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1209 |
<div class="ops-card">
|
| 1210 |
<div class="ops-k">Stage Progress</div>
|
| 1211 |
<div class="ops-v">{stage_meta['completed']} / {stage_meta['stage_count']}</div>
|
|
@@ -1220,9 +1693,31 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 1220 |
<div class="ops-card">
|
| 1221 |
<div class="ops-k">Eval pass@k</div>
|
| 1222 |
<div class="ops-v">{pass_k}</div>
|
| 1223 |
-
<div class="ops-v-small">pass@1 {pass_1} | exact@k {exact_k}</div>
|
| 1224 |
</div>
|
| 1225 |
<div class="ops-card">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1226 |
<div class="ops-k">Loss Stream</div>
|
| 1227 |
{sparkline_html}
|
| 1228 |
</div>
|
|
@@ -1271,9 +1766,26 @@ def validate_repo_id(repo_id: str, field_name: str) -> str:
|
|
| 1271 |
return value
|
| 1272 |
|
| 1273 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1274 |
def ensure_workspace() -> None:
|
| 1275 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 1276 |
RUNTIME_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1277 |
|
| 1278 |
|
| 1279 |
def run_runtime_snapshot() -> Dict[str, Any]:
|
|
@@ -1575,7 +2087,7 @@ def run_pipeline_core(
|
|
| 1575 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 1576 |
log_lines: List[str] = []
|
| 1577 |
summary: Dict[str, Any] = {}
|
| 1578 |
-
run_label = dt.datetime.
|
| 1579 |
|
| 1580 |
if not begin_run(run_label):
|
| 1581 |
append_log(log_lines, "A run is already in progress. Wait for it to finish or click Stop.")
|
|
@@ -1618,12 +2130,7 @@ def run_pipeline_core(
|
|
| 1618 |
force_redownload = False
|
| 1619 |
preflight_only = False
|
| 1620 |
|
| 1621 |
-
|
| 1622 |
-
raise ValueError("Start stage must be >= 1.")
|
| 1623 |
-
if stage_start > TEMPLATE_STAGE_COUNT:
|
| 1624 |
-
raise ValueError(f"Start stage must be <= {TEMPLATE_STAGE_COUNT}.")
|
| 1625 |
-
if stage_count < 1:
|
| 1626 |
-
raise ValueError("How many stages must be >= 1.")
|
| 1627 |
if eval_k < 1:
|
| 1628 |
raise ValueError("Eval K must be >= 1.")
|
| 1629 |
if eval_samples < 1:
|
|
@@ -1663,6 +2170,7 @@ def run_pipeline_core(
|
|
| 1663 |
"force_redownload": bool(force_redownload),
|
| 1664 |
"preflight_only": bool(preflight_only),
|
| 1665 |
"runtime": runtime,
|
|
|
|
| 1666 |
}
|
| 1667 |
)
|
| 1668 |
|
|
@@ -1746,6 +2254,9 @@ def run_pipeline_core(
|
|
| 1746 |
env.pop("HF_TOKEN", None)
|
| 1747 |
env.pop("HUGGINGFACE_HUB_TOKEN", None)
|
| 1748 |
env["PYTHONUNBUFFERED"] = "1"
|
|
|
|
|
|
|
|
|
|
| 1749 |
|
| 1750 |
train_cmd = [
|
| 1751 |
sys.executable,
|
|
@@ -1782,6 +2293,7 @@ def run_pipeline_core(
|
|
| 1782 |
summary["result"] = "cancelled"
|
| 1783 |
summary["finished_at_utc"] = now_ts()
|
| 1784 |
append_log(log_lines, "Run cancelled by user.")
|
|
|
|
| 1785 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 1786 |
return
|
| 1787 |
|
|
@@ -1789,6 +2301,7 @@ def run_pipeline_core(
|
|
| 1789 |
summary["result"] = "failed"
|
| 1790 |
summary["failure_stage"] = "training"
|
| 1791 |
summary["finished_at_utc"] = now_ts()
|
|
|
|
| 1792 |
yield "\n".join(log_lines), "Failed", summary_text(summary)
|
| 1793 |
return
|
| 1794 |
|
|
@@ -1796,6 +2309,7 @@ def run_pipeline_core(
|
|
| 1796 |
summary["result"] = "preflight_passed"
|
| 1797 |
summary["finished_at_utc"] = now_ts()
|
| 1798 |
append_log(log_lines, "Validation mode completed successfully.")
|
|
|
|
| 1799 |
yield "\n".join(log_lines), "Preflight complete", summary_text(summary)
|
| 1800 |
return
|
| 1801 |
|
|
@@ -1884,6 +2398,7 @@ def run_pipeline_core(
|
|
| 1884 |
summary["result"] = "cancelled"
|
| 1885 |
summary["finished_at_utc"] = now_ts()
|
| 1886 |
append_log(log_lines, "Run cancelled by user.")
|
|
|
|
| 1887 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 1888 |
return
|
| 1889 |
|
|
@@ -1891,6 +2406,7 @@ def run_pipeline_core(
|
|
| 1891 |
summary["result"] = "failed"
|
| 1892 |
summary["failure_stage"] = "evaluation"
|
| 1893 |
summary["finished_at_utc"] = now_ts()
|
|
|
|
| 1894 |
yield "\n".join(log_lines), "Failed", summary_text(summary)
|
| 1895 |
return
|
| 1896 |
|
|
@@ -1911,16 +2427,25 @@ def run_pipeline_core(
|
|
| 1911 |
summary["result"] = "completed"
|
| 1912 |
summary["finished_at_utc"] = now_ts()
|
| 1913 |
append_log(log_lines, "Pipeline completed.")
|
|
|
|
| 1914 |
yield "\n".join(log_lines), "Completed", summary_text(summary)
|
| 1915 |
except Exception as exc:
|
| 1916 |
cancelled = is_cancel_requested() or str(exc) == "Run cancelled by user."
|
|
|
|
| 1917 |
summary["result"] = "cancelled" if cancelled else "failed"
|
| 1918 |
-
summary["error"] = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1919 |
summary["finished_at_utc"] = now_ts()
|
| 1920 |
append_log(
|
| 1921 |
log_lines,
|
| 1922 |
f"Pipeline {'cancelled' if cancelled else 'failed'}: {type(exc).__name__}: {exc}",
|
| 1923 |
)
|
|
|
|
|
|
|
|
|
|
| 1924 |
yield "\n".join(log_lines), "Cancelled" if cancelled else "Failed", summary_text(summary)
|
| 1925 |
finally:
|
| 1926 |
finish_run()
|
|
@@ -1946,6 +2471,7 @@ def run_pipeline(
|
|
| 1946 |
preflight_only: bool,
|
| 1947 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 1948 |
cycle_index = 1
|
|
|
|
| 1949 |
continuous_mode = bool(continuous_mode)
|
| 1950 |
if preflight_only and continuous_mode:
|
| 1951 |
continuous_mode = False
|
|
@@ -2013,6 +2539,19 @@ def run_pipeline(
|
|
| 2013 |
yield compose_ops_console(session_logs, final_summary_json), stop_status, final_visual
|
| 2014 |
break
|
| 2015 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2016 |
session_logs = _merge_log_chunk(
|
| 2017 |
session_logs,
|
| 2018 |
f"[{now_ts()}] Continuous mode: cycle {cycle_index} finished with result="
|
|
@@ -2020,6 +2559,28 @@ def run_pipeline(
|
|
| 2020 |
)
|
| 2021 |
restart_status = f"Cycle {cycle_index}: restarting"
|
| 2022 |
yield compose_ops_console(session_logs, final_summary_json), restart_status, final_visual
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2023 |
cycle_index += 1
|
| 2024 |
|
| 2025 |
|
|
|
|
| 16 |
import sys
|
| 17 |
import threading
|
| 18 |
import time
|
| 19 |
+
import traceback
|
| 20 |
from pathlib import Path
|
| 21 |
from typing import Any, Dict, Generator, List, Optional, Tuple
|
| 22 |
|
|
|
|
| 30 |
WORKSPACE_DIR = ROOT / "workspace"
|
| 31 |
DATA_DIR = WORKSPACE_DIR / "data" / "releases" / "v1"
|
| 32 |
RUNTIME_DIR = WORKSPACE_DIR / "runtime"
|
| 33 |
+
HF_HOME_DIR = WORKSPACE_DIR / ".hf_home"
|
| 34 |
+
HF_DATASETS_CACHE_DIR = HF_HOME_DIR / "datasets"
|
| 35 |
+
HF_HUB_CACHE_DIR = HF_HOME_DIR / "hub"
|
| 36 |
CONFIG_TEMPLATE = ROOT / "configs" / "deepseek_math_sota.yaml"
|
| 37 |
TRAIN_SCRIPT = ROOT / "scripts" / "train_sota.py"
|
| 38 |
EVAL_SCRIPT = ROOT / "scripts" / "eval_sota.py"
|
|
|
|
| 52 |
ACTIVE_PROCESS: Optional[subprocess.Popen] = None
|
| 53 |
ACTIVE_RUN_LABEL = ""
|
| 54 |
|
| 55 |
+
|
| 56 |
+
def _env_int(name: str, default: int, min_value: int = 0) -> int:
|
| 57 |
+
raw = (os.environ.get(name) or "").strip()
|
| 58 |
+
if not raw:
|
| 59 |
+
return default
|
| 60 |
+
try:
|
| 61 |
+
value = int(raw)
|
| 62 |
+
except ValueError:
|
| 63 |
+
return default
|
| 64 |
+
return max(min_value, value)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
CONTINUOUS_RESTART_DELAY_SECONDS = _env_int("CONTINUOUS_RESTART_DELAY_SECONDS", default=15, min_value=0)
|
| 68 |
+
CONTINUOUS_MAX_CONSECUTIVE_FAILURES = _env_int("CONTINUOUS_MAX_CONSECUTIVE_FAILURES", default=3, min_value=1)
|
| 69 |
+
APP_LOG_MAX_CHARS = _env_int("APP_LOG_MAX_CHARS", default=200_000, min_value=20_000)
|
| 70 |
+
RUN_HISTORY_LIMIT = _env_int("RUN_HISTORY_LIMIT", default=80, min_value=5)
|
| 71 |
+
RECENT_RUNS_VISUAL_LIMIT = 6
|
| 72 |
+
RUN_HISTORY_PATH = RUNTIME_DIR / "run_history.json"
|
| 73 |
+
RUN_RECORDS_DIR = RUNTIME_DIR / "run_records"
|
| 74 |
+
|
| 75 |
PROJECT_DESCRIPTION = """
|
| 76 |
## Pipeline Scope
|
| 77 |
This app runs the full training pipeline for the `maths-conjuncture-solutions` project.
|
|
|
|
| 128 |
@import url("https://fonts.googleapis.com/css2?family=Rajdhani:wght@500;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap");
|
| 129 |
|
| 130 |
:root {
|
| 131 |
+
--nt-bg: #0a0a0a;
|
| 132 |
+
--nt-bg-2: #131313;
|
| 133 |
+
--nt-bg-radial: rgba(255, 255, 255, 0.08);
|
| 134 |
+
--nt-panel: #151515;
|
| 135 |
+
--nt-panel-2: #1c1c1c;
|
| 136 |
+
--nt-border: #313131;
|
| 137 |
+
--nt-border-strong: #515151;
|
| 138 |
+
--nt-text: #ececec;
|
| 139 |
+
--nt-muted: #adadad;
|
| 140 |
+
--nt-accent: #f5f5f5;
|
| 141 |
+
--nt-accent-2: #ffffff;
|
| 142 |
+
--nt-shadow: rgba(0, 0, 0, 0.58);
|
| 143 |
+
--nt-hero-border: #5a5a5a;
|
| 144 |
+
--nt-hero-bg-1: #181818;
|
| 145 |
+
--nt-hero-bg-2: #252525;
|
| 146 |
+
--nt-hero-bg-3: #333333;
|
| 147 |
+
--nt-hero-text: #f7f7f7;
|
| 148 |
+
--nt-hero-topline: #cecece;
|
| 149 |
+
--nt-hero-body: #e1e1e1;
|
| 150 |
+
--nt-hero-grid: rgba(255, 255, 255, 0.09);
|
| 151 |
+
--nt-chip-border: rgba(212, 212, 212, 0.35);
|
| 152 |
+
--nt-chip-bg: rgba(24, 24, 24, 0.72);
|
| 153 |
+
--nt-chip-text: #f0f0f0;
|
| 154 |
+
--nt-input-bg: #121212;
|
| 155 |
+
--nt-focus-ring: rgba(220, 220, 220, 0.2);
|
| 156 |
+
--nt-btn-border: #5a5a5a;
|
| 157 |
+
--nt-btn-bg-1: #2b2b2b;
|
| 158 |
+
--nt-btn-bg-2: #1e1e1e;
|
| 159 |
+
--nt-btn-text: #efefef;
|
| 160 |
+
--nt-primary-btn-border: #8a8a8a;
|
| 161 |
+
--nt-primary-btn-bg-1: #505050;
|
| 162 |
+
--nt-primary-btn-bg-2: #3a3a3a;
|
| 163 |
+
--nt-primary-btn-text: #ffffff;
|
| 164 |
+
--nt-stop-btn-border: #6e6e6e;
|
| 165 |
+
--nt-stop-btn-bg-1: #464646;
|
| 166 |
+
--nt-stop-btn-bg-2: #303030;
|
| 167 |
+
--nt-stop-btn-text: #f8f8f8;
|
| 168 |
+
--nt-ops-panel-1: #171717;
|
| 169 |
+
--nt-ops-panel-2: #101010;
|
| 170 |
+
--nt-card-border: #3c3c3c;
|
| 171 |
+
--nt-card-bg-1: #232323;
|
| 172 |
+
--nt-card-bg-2: #1a1a1a;
|
| 173 |
+
--nt-metric-text: #f2f2f2;
|
| 174 |
+
--nt-meter-border: #5a5a5a;
|
| 175 |
+
--nt-meter-bg: #2d2d2d;
|
| 176 |
+
--nt-meter-fill-1: #d4d4d4;
|
| 177 |
+
--nt-meter-fill-2: #8c8c8c;
|
| 178 |
+
--nt-spark-border: #555555;
|
| 179 |
+
--nt-spark-bg: #161616;
|
| 180 |
+
--nt-spark-stroke: #dddddd;
|
| 181 |
+
--nt-settings-border: rgba(200, 200, 200, 0.42);
|
| 182 |
+
--nt-settings-bg: rgba(20, 20, 20, 0.72);
|
| 183 |
+
--nt-settings-bg-hover: rgba(32, 32, 32, 0.9);
|
| 184 |
+
--nt-settings-text: #f2f2f2;
|
| 185 |
+
--nt-settings-menu-bg: #1d1d1d;
|
| 186 |
+
--nt-settings-menu-border: #545454;
|
| 187 |
+
--nt-settings-menu-shadow: rgba(0, 0, 0, 0.62);
|
| 188 |
+
--nt-settings-menu-btn: #f0f0f0;
|
| 189 |
+
--nt-settings-menu-btn-hover: #303030;
|
| 190 |
+
--nt-settings-menu-btn-active-bg: #6a6a6a;
|
| 191 |
+
--nt-settings-menu-btn-active-text: #101010;
|
| 192 |
}
|
| 193 |
|
| 194 |
:root[data-nt-theme="light"] {
|
| 195 |
+
--nt-bg: #f2f2f2;
|
| 196 |
+
--nt-bg-2: #e5e5e5;
|
| 197 |
+
--nt-bg-radial: rgba(255, 255, 255, 0.8);
|
| 198 |
+
--nt-panel: #ffffff;
|
| 199 |
+
--nt-panel-2: #f4f4f4;
|
| 200 |
+
--nt-border: #c2c2c2;
|
| 201 |
+
--nt-border-strong: #9f9f9f;
|
| 202 |
+
--nt-text: #1a1a1a;
|
| 203 |
+
--nt-muted: #5c5c5c;
|
| 204 |
+
--nt-accent: #111111;
|
| 205 |
+
--nt-accent-2: #000000;
|
| 206 |
+
--nt-shadow: rgba(0, 0, 0, 0.14);
|
| 207 |
+
--nt-hero-border: #4d4d4d;
|
| 208 |
+
--nt-hero-bg-1: #3a3a3a;
|
| 209 |
+
--nt-hero-bg-2: #2c2c2c;
|
| 210 |
+
--nt-hero-bg-3: #1f1f1f;
|
| 211 |
+
--nt-hero-text: #fafafa;
|
| 212 |
+
--nt-hero-topline: #d8d8d8;
|
| 213 |
+
--nt-hero-body: #ececec;
|
| 214 |
+
--nt-hero-grid: rgba(255, 255, 255, 0.16);
|
| 215 |
+
--nt-chip-border: rgba(236, 236, 236, 0.52);
|
| 216 |
+
--nt-chip-bg: rgba(20, 20, 20, 0.68);
|
| 217 |
+
--nt-chip-text: #f5f5f5;
|
| 218 |
+
--nt-input-bg: #ffffff;
|
| 219 |
+
--nt-focus-ring: rgba(110, 110, 110, 0.24);
|
| 220 |
+
--nt-btn-border: #8d8d8d;
|
| 221 |
+
--nt-btn-bg-1: #f8f8f8;
|
| 222 |
+
--nt-btn-bg-2: #e8e8e8;
|
| 223 |
+
--nt-btn-text: #1f1f1f;
|
| 224 |
+
--nt-primary-btn-border: #505050;
|
| 225 |
+
--nt-primary-btn-bg-1: #6d6d6d;
|
| 226 |
+
--nt-primary-btn-bg-2: #505050;
|
| 227 |
+
--nt-primary-btn-text: #ffffff;
|
| 228 |
+
--nt-stop-btn-border: #595959;
|
| 229 |
+
--nt-stop-btn-bg-1: #7a7a7a;
|
| 230 |
+
--nt-stop-btn-bg-2: #5b5b5b;
|
| 231 |
+
--nt-stop-btn-text: #ffffff;
|
| 232 |
+
--nt-ops-panel-1: #fbfbfb;
|
| 233 |
+
--nt-ops-panel-2: #efefef;
|
| 234 |
+
--nt-card-border: #b8b8b8;
|
| 235 |
+
--nt-card-bg-1: #ffffff;
|
| 236 |
+
--nt-card-bg-2: #f2f2f2;
|
| 237 |
+
--nt-metric-text: #1d1d1d;
|
| 238 |
+
--nt-meter-border: #9d9d9d;
|
| 239 |
+
--nt-meter-bg: #dadada;
|
| 240 |
+
--nt-meter-fill-1: #5e5e5e;
|
| 241 |
+
--nt-meter-fill-2: #3f3f3f;
|
| 242 |
+
--nt-spark-border: #a9a9a9;
|
| 243 |
+
--nt-spark-bg: #f8f8f8;
|
| 244 |
+
--nt-spark-stroke: #404040;
|
| 245 |
+
--nt-settings-border: rgba(120, 120, 120, 0.4);
|
| 246 |
+
--nt-settings-bg: rgba(45, 45, 45, 0.62);
|
| 247 |
+
--nt-settings-bg-hover: rgba(30, 30, 30, 0.8);
|
| 248 |
+
--nt-settings-text: #f7f7f7;
|
| 249 |
+
--nt-settings-menu-bg: #f1f1f1;
|
| 250 |
+
--nt-settings-menu-border: #9d9d9d;
|
| 251 |
+
--nt-settings-menu-shadow: rgba(0, 0, 0, 0.18);
|
| 252 |
+
--nt-settings-menu-btn: #1f1f1f;
|
| 253 |
+
--nt-settings-menu-btn-hover: #dcdcdc;
|
| 254 |
+
--nt-settings-menu-btn-active-bg: #4e4e4e;
|
| 255 |
+
--nt-settings-menu-btn-active-text: #ffffff;
|
| 256 |
}
|
| 257 |
|
| 258 |
@keyframes ntFadeUp {
|
|
|
|
| 270 |
color: var(--nt-text) !important;
|
| 271 |
font-family: "Rajdhani", "Segoe UI", sans-serif !important;
|
| 272 |
background:
|
| 273 |
+
linear-gradient(rgba(255, 255, 255, 0.05) 1px, transparent 1px),
|
| 274 |
+
linear-gradient(90deg, rgba(255, 255, 255, 0.05) 1px, transparent 1px),
|
| 275 |
radial-gradient(circle at 12% 10%, var(--nt-bg-radial) 0%, rgba(255, 255, 255, 0) 44%),
|
| 276 |
linear-gradient(145deg, var(--nt-bg) 0%, var(--nt-bg-2) 100%) !important;
|
| 277 |
background-size: 24px 24px, 24px 24px, 100% 100%, 100% 100% !important;
|
|
|
|
| 290 |
background: linear-gradient(180deg, var(--nt-panel) 0%, var(--nt-panel-2) 100%) !important;
|
| 291 |
border: 1px solid var(--nt-border) !important;
|
| 292 |
border-radius: 8px !important;
|
| 293 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.04), 0 12px 26px var(--nt-shadow) !important;
|
| 294 |
}
|
| 295 |
|
| 296 |
.app-hero {
|
|
|
|
| 301 |
background: linear-gradient(132deg, var(--nt-hero-bg-1) 0%, var(--nt-hero-bg-2) 62%, var(--nt-hero-bg-3) 100%);
|
| 302 |
color: var(--nt-hero-text);
|
| 303 |
padding: 22px 24px 20px;
|
| 304 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.05), 0 18px 38px rgba(0, 0, 0, 0.38);
|
| 305 |
animation: ntFadeUp 500ms ease-out both;
|
| 306 |
}
|
| 307 |
|
|
|
|
| 449 |
.nt-settings-menu .nt-mode-btn.active {
|
| 450 |
background: var(--nt-settings-menu-btn-active-bg) !important;
|
| 451 |
color: var(--nt-settings-menu-btn-active-text) !important;
|
| 452 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.2) !important;
|
| 453 |
}
|
| 454 |
|
| 455 |
.section-copy {
|
|
|
|
| 548 |
|
| 549 |
.gradio-container button:hover {
|
| 550 |
transform: translateY(-1px);
|
| 551 |
+
box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.2), 0 10px 20px rgba(0, 0, 0, 0.35);
|
| 552 |
filter: brightness(1.03);
|
| 553 |
}
|
| 554 |
|
|
|
|
| 559 |
background: linear-gradient(180deg, var(--nt-ops-panel-1) 0%, var(--nt-ops-panel-2) 100%);
|
| 560 |
padding: 14px;
|
| 561 |
animation: ntFadeUp 420ms ease-out both;
|
| 562 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.04), 0 12px 28px rgba(0, 0, 0, 0.34);
|
| 563 |
}
|
| 564 |
|
| 565 |
.ops-visual-head {
|
|
|
|
| 599 |
padding: 9px 10px;
|
| 600 |
min-height: 86px;
|
| 601 |
animation: ntFadeUp 460ms ease-out both;
|
| 602 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.04);
|
| 603 |
+
}
|
| 604 |
+
|
| 605 |
+
.ops-card-wide {
|
| 606 |
+
grid-column: span 2;
|
| 607 |
}
|
| 608 |
|
| 609 |
.ops-k {
|
|
|
|
| 664 |
display: block;
|
| 665 |
}
|
| 666 |
|
| 667 |
+
.ops-chip-list {
|
| 668 |
+
margin-top: 6px;
|
| 669 |
+
display: flex;
|
| 670 |
+
flex-wrap: wrap;
|
| 671 |
+
gap: 6px;
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
.ops-chip {
|
| 675 |
+
border-radius: 999px;
|
| 676 |
+
border: 1px solid var(--nt-chip-border);
|
| 677 |
+
background: var(--nt-chip-bg);
|
| 678 |
+
color: var(--nt-chip-text);
|
| 679 |
+
padding: 2px 8px;
|
| 680 |
+
font-size: 0.66rem;
|
| 681 |
+
text-transform: uppercase;
|
| 682 |
+
letter-spacing: 0.1em;
|
| 683 |
+
font-family: "IBM Plex Mono", "Courier New", monospace;
|
| 684 |
+
}
|
| 685 |
+
|
| 686 |
+
.ops-chip.off {
|
| 687 |
+
opacity: 0.72;
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
.ops-stage-list {
|
| 691 |
+
margin: 8px 0 0;
|
| 692 |
+
padding: 0;
|
| 693 |
+
list-style: none;
|
| 694 |
+
display: grid;
|
| 695 |
+
gap: 6px;
|
| 696 |
+
}
|
| 697 |
+
|
| 698 |
+
.ops-stage-item {
|
| 699 |
+
display: flex;
|
| 700 |
+
align-items: center;
|
| 701 |
+
gap: 8px;
|
| 702 |
+
}
|
| 703 |
+
|
| 704 |
+
.ops-stage-dot {
|
| 705 |
+
width: 12px;
|
| 706 |
+
min-width: 12px;
|
| 707 |
+
height: 12px;
|
| 708 |
+
border-radius: 999px;
|
| 709 |
+
border: 1px solid var(--nt-border-strong);
|
| 710 |
+
background: var(--nt-meter-bg);
|
| 711 |
+
color: transparent;
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
+
.ops-stage-item.done .ops-stage-dot {
|
| 715 |
+
background: linear-gradient(90deg, var(--nt-meter-fill-1) 0%, var(--nt-meter-fill-2) 100%);
|
| 716 |
+
}
|
| 717 |
+
|
| 718 |
+
.ops-stage-item.active .ops-stage-dot {
|
| 719 |
+
background: var(--nt-accent);
|
| 720 |
+
box-shadow: 0 0 0 3px rgba(255, 255, 255, 0.22);
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.ops-stage-label {
|
| 724 |
+
color: var(--nt-muted);
|
| 725 |
+
font-size: 0.78rem;
|
| 726 |
+
font-family: "IBM Plex Mono", "Courier New", monospace;
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
.ops-stage-item.done .ops-stage-label,
|
| 730 |
+
.ops-stage-item.active .ops-stage-label {
|
| 731 |
+
color: var(--nt-text);
|
| 732 |
+
}
|
| 733 |
+
|
| 734 |
+
.ops-run-list {
|
| 735 |
+
margin: 8px 0 0;
|
| 736 |
+
padding: 0;
|
| 737 |
+
list-style: none;
|
| 738 |
+
display: grid;
|
| 739 |
+
gap: 8px;
|
| 740 |
+
}
|
| 741 |
+
|
| 742 |
+
.ops-run-item {
|
| 743 |
+
border: 1px solid var(--nt-card-border);
|
| 744 |
+
border-radius: 8px;
|
| 745 |
+
background: rgba(255, 255, 255, 0.03);
|
| 746 |
+
padding: 8px;
|
| 747 |
+
}
|
| 748 |
+
|
| 749 |
+
.ops-run-head {
|
| 750 |
+
display: flex;
|
| 751 |
+
justify-content: space-between;
|
| 752 |
+
align-items: center;
|
| 753 |
+
gap: 10px;
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
.ops-run-label {
|
| 757 |
+
color: var(--nt-text);
|
| 758 |
+
font-size: 0.78rem;
|
| 759 |
+
font-family: "IBM Plex Mono", "Courier New", monospace;
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
.ops-run-badge {
|
| 763 |
+
border-radius: 999px;
|
| 764 |
+
border: 1px solid var(--nt-chip-border);
|
| 765 |
+
background: var(--nt-chip-bg);
|
| 766 |
+
color: var(--nt-chip-text);
|
| 767 |
+
font-size: 0.62rem;
|
| 768 |
+
text-transform: uppercase;
|
| 769 |
+
letter-spacing: 0.08em;
|
| 770 |
+
padding: 2px 7px;
|
| 771 |
+
font-family: "IBM Plex Mono", "Courier New", monospace;
|
| 772 |
+
}
|
| 773 |
+
|
| 774 |
+
.ops-run-badge.ok {
|
| 775 |
+
border-color: #8b8b8b;
|
| 776 |
+
color: #f0f0f0;
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
.ops-run-badge.fail {
|
| 780 |
+
border-color: #6f6f6f;
|
| 781 |
+
color: #d8d8d8;
|
| 782 |
+
}
|
| 783 |
+
|
| 784 |
+
.ops-run-badge.cancel {
|
| 785 |
+
border-color: #7d7d7d;
|
| 786 |
+
color: #e2e2e2;
|
| 787 |
+
}
|
| 788 |
+
|
| 789 |
.gradio-container [data-testid="footer"],
|
| 790 |
.gradio-container .built-with {
|
| 791 |
display: none !important;
|
|
|
|
| 841 |
.ops-grid {
|
| 842 |
grid-template-columns: 1fr;
|
| 843 |
}
|
| 844 |
+
|
| 845 |
+
.ops-card-wide {
|
| 846 |
+
grid-column: auto;
|
| 847 |
+
}
|
| 848 |
}
|
| 849 |
"""
|
| 850 |
|
|
|
|
| 1117 |
|
| 1118 |
|
| 1119 |
def now_ts() -> str:
    """Return the current UTC time as 'YYYY-MM-DD HH:MM:SS UTC'."""
    moment = dt.datetime.now(dt.timezone.utc)
    return moment.strftime("%Y-%m-%d %H:%M:%S UTC")
|
| 1121 |
|
| 1122 |
|
| 1123 |
def append_log(lines: List[str], message: str) -> str:
    """Append a timestamped *message* to *lines* and return the capped log text.

    Fix: previously only the returned string was truncated to
    APP_LOG_MAX_CHARS while the backing ``lines`` list kept growing without
    bound, leaking memory during long continuous runs. Now the oldest whole
    entries are dropped from ``lines`` in place so list and rendered text
    stay within the same budget (the newest entry is always kept).
    """
    lines.append(f"[{now_ts()}] {message}")
    text = "\n".join(lines)
    if len(text) > APP_LOG_MAX_CHARS:
        # Count how many leading entries must go; +1 accounts for the
        # joining newline each dropped entry carried.
        excess = len(text) - APP_LOG_MAX_CHARS
        drop = 0
        while drop < len(lines) - 1 and excess > 0:
            excess -= len(lines[drop]) + 1
            drop += 1
        if drop:
            del lines[:drop]
            text = "\n".join(lines)
        # A single oversized entry can still exceed the budget: hard-cap it.
        if len(text) > APP_LOG_MAX_CHARS:
            text = text[-APP_LOG_MAX_CHARS:]
    return text
|
| 1129 |
|
| 1130 |
|
|
|
|
| 1145 |
return payload
|
| 1146 |
|
| 1147 |
|
| 1148 |
+
def _merge_log_chunk(existing: str, chunk: str, max_chars: int = APP_LOG_MAX_CHARS) -> str:
|
| 1149 |
if not chunk:
|
| 1150 |
return existing
|
| 1151 |
merged = existing
|
|
|
|
| 1185 |
return "--"
|
| 1186 |
|
| 1187 |
|
| 1188 |
+
def _parse_utc_stamp(value: Any) -> Optional[dt.datetime]:
|
| 1189 |
+
text = str(value or "").strip()
|
| 1190 |
+
if not text:
|
| 1191 |
+
return None
|
| 1192 |
+
try:
|
| 1193 |
+
parsed = dt.datetime.strptime(text, "%Y-%m-%d %H:%M:%S UTC")
|
| 1194 |
+
except ValueError:
|
| 1195 |
+
return None
|
| 1196 |
+
return parsed.replace(tzinfo=dt.timezone.utc)
|
| 1197 |
+
|
| 1198 |
+
|
| 1199 |
+
def _format_duration(seconds: float) -> str:
|
| 1200 |
+
total = max(0, int(seconds))
|
| 1201 |
+
hours, rem = divmod(total, 3600)
|
| 1202 |
+
minutes, sec = divmod(rem, 60)
|
| 1203 |
+
if hours > 0:
|
| 1204 |
+
return f"{hours}h {minutes:02d}m {sec:02d}s"
|
| 1205 |
+
return f"{minutes:02d}m {sec:02d}s"
|
| 1206 |
+
|
| 1207 |
+
|
| 1208 |
+
def _runtime_duration_text(summary: Dict[str, Any]) -> str:
    """Describe elapsed run time as 'Xh MMm SSs (done|running)', or '--'.

    A run without a parseable start stamp yields '--'; without a finish
    stamp it is reported as still running against the current UTC clock.
    """
    begin = _parse_utc_stamp(summary.get("started_at_utc"))
    if begin is None:
        return "--"
    end = _parse_utc_stamp(summary.get("finished_at_utc"))
    if end is None:
        label = "running"
        elapsed = (dt.datetime.now(dt.timezone.utc) - begin).total_seconds()
    else:
        label = "done"
        elapsed = (end - begin).total_seconds()
    return f"{_format_duration(elapsed)} ({label})"
|
| 1217 |
+
|
| 1218 |
+
|
| 1219 |
+
def _maybe_float(value: Any) -> Optional[float]:
|
| 1220 |
+
try:
|
| 1221 |
+
return float(value)
|
| 1222 |
+
except (TypeError, ValueError):
|
| 1223 |
+
return None
|
| 1224 |
+
|
| 1225 |
+
|
| 1226 |
+
def _duration_seconds(summary: Dict[str, Any]) -> Optional[int]:
    """Whole-second run duration, or None unless both stamps parse.

    Clock skew producing a negative delta is clamped to zero.
    """
    begin = _parse_utc_stamp(summary.get("started_at_utc"))
    end = _parse_utc_stamp(summary.get("finished_at_utc"))
    if begin is None or end is None:
        return None
    delta = int((end - begin).total_seconds())
    return delta if delta > 0 else 0
|
| 1234 |
+
|
| 1235 |
+
|
| 1236 |
+
def _normalize_history_entry(entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Coerce a raw persisted history record into the canonical entry shape.

    Returns None when the record lacks a run label or a result, which marks
    it as unusable. All nested sections are rebuilt with defensive defaults
    so downstream renderers never see missing keys.
    """
    run_label = str(entry.get("run_label") or "").strip()
    result = str(entry.get("result") or "").strip().lower()
    if not run_label or not result:
        return None
    # Nested payloads may be absent or the wrong type on disk; _as_dict
    # normalizes them to dicts before key access.
    evaluation = _as_dict(entry.get("evaluation"))
    quality_gate = _as_dict(entry.get("quality_gate"))
    push_report = _as_dict(entry.get("push"))
    return {
        "run_label": run_label,
        "result": result,
        "started_at_utc": str(entry.get("started_at_utc") or "").strip(),
        "finished_at_utc": str(entry.get("finished_at_utc") or "").strip(),
        # -1 is the "unknown duration" sentinel used by the renderers.
        "duration_seconds": _safe_int(entry.get("duration_seconds"), -1),
        "compute_mode": str(entry.get("compute_mode") or "").strip(),
        "evaluation": {
            "pass_at_1": _maybe_float(evaluation.get("pass_at_1")),
            "pass_at_k": _maybe_float(evaluation.get("pass_at_k")),
            "evaluated_rows": _safe_int(evaluation.get("evaluated_rows"), 0),
        },
        "quality_gate": {
            "enabled": bool(quality_gate.get("enabled")),
            # Deliberately not coerced to bool: None means "not evaluated".
            "passed": quality_gate.get("passed"),
        },
        "push": {
            "requested": bool(push_report.get("requested")),
            "performed": bool(push_report.get("performed")),
        },
        "error_message": str(entry.get("error_message") or "").strip(),
    }
|
| 1266 |
+
|
| 1267 |
+
|
| 1268 |
+
def _read_history_entries() -> List[Dict[str, Any]]:
    """Load and sanitize run-history entries from disk.

    Best-effort: a missing, unreadable, corrupt, or wrongly-typed history
    file yields an empty list rather than an exception.
    """
    if not RUN_HISTORY_PATH.exists():
        return []
    try:
        payload = json.loads(RUN_HISTORY_PATH.read_text(encoding="utf-8"))
    except Exception:
        # Corrupt history must never take the UI down.
        return []
    if not isinstance(payload, list):
        return []
    entries: List[Dict[str, Any]] = []
    for raw in payload:
        if not isinstance(raw, dict):
            continue
        cleaned = _normalize_history_entry(raw)
        if cleaned is not None:
            entries.append(cleaned)
    return entries
|
| 1285 |
+
|
| 1286 |
+
|
| 1287 |
+
def load_run_history(limit: int = RECENT_RUNS_VISUAL_LIMIT) -> List[Dict[str, Any]]:
    """Return at most *limit* (floored at 1) recent normalized history entries."""
    capped = _safe_int(limit, RECENT_RUNS_VISUAL_LIMIT)
    if capped < 1:
        capped = 1
    return _read_history_entries()[:capped]
|
| 1290 |
+
|
| 1291 |
+
|
| 1292 |
+
def _build_run_history_entry(summary: Dict[str, Any]) -> Dict[str, Any]:
    """Project a full run summary down to the compact history-entry schema.

    This is the write-side counterpart of ``_normalize_history_entry``:
    the shape produced here must remain readable by that normalizer.
    """
    # Nested sections may be missing from an early-failed run; _as_dict
    # guarantees dict access below is safe.
    evaluation = _as_dict(summary.get("evaluation"))
    quality_gate = _as_dict(summary.get("quality_gate"))
    push_report = _as_dict(summary.get("push"))
    error_payload = _as_dict(summary.get("error"))
    return {
        "run_label": str(summary.get("run_label") or "").strip(),
        "result": str(summary.get("result") or "").strip().lower(),
        "started_at_utc": str(summary.get("started_at_utc") or "").strip(),
        "finished_at_utc": str(summary.get("finished_at_utc") or "").strip(),
        # May be None when either timestamp is missing/unparseable.
        "duration_seconds": _duration_seconds(summary),
        "compute_mode": str(summary.get("compute_mode") or "").strip(),
        "evaluation": {
            "pass_at_1": _maybe_float(evaluation.get("pass_at_1")),
            "pass_at_k": _maybe_float(evaluation.get("pass_at_k")),
            "evaluated_rows": _safe_int(evaluation.get("evaluated_rows"), 0),
        },
        "quality_gate": {
            "enabled": bool(quality_gate.get("enabled")),
            # Left un-coerced: None distinguishes "gate not evaluated".
            "passed": quality_gate.get("passed"),
        },
        "push": {
            "requested": bool(push_report.get("requested")),
            "performed": bool(push_report.get("performed")),
        },
        # Capped so one huge traceback cannot bloat the history file.
        "error_message": str(error_payload.get("message") or "").strip()[:280],
    }
|
| 1319 |
+
|
| 1320 |
+
|
| 1321 |
+
def persist_run_artifacts(summary: Dict[str, Any]) -> Optional[str]:
    """Persist a finished run's full record and update the history file.

    Writes two artifacts: a per-run JSON record under RUN_RECORDS_DIR, and
    the deduplicated, size-capped run_history.json (written via a temp file
    plus rename so readers never see a half-written history).

    Returns None on success or when the summary is skippable ("busy" or
    missing label/result); returns a warning string on any failure —
    persistence is deliberately best-effort and must not abort the run.
    """
    run_label = str(summary.get("run_label") or "").strip()
    result = str(summary.get("result") or "").strip().lower()
    if not run_label or not result or result == "busy":
        return None

    try:
        ensure_workspace()
        record_path = RUN_RECORDS_DIR / f"{run_label}.json"
        # Shallow copy so the recorded_at stamp never leaks into the live summary.
        record_payload = dict(summary)
        record_payload["recorded_at_utc"] = now_ts()
        record_path.write_text(json.dumps(record_payload, ensure_ascii=True, indent=2), encoding="utf-8")

        entry = _build_run_history_entry(summary)
        history = _read_history_entries()
        # Re-running the same label replaces, rather than duplicates, its entry.
        history = [item for item in history if str(item.get("run_label") or "").strip() != run_label]
        history.insert(0, entry)
        history = history[:RUN_HISTORY_LIMIT]
        # Temp-file + replace keeps the history read path crash-safe.
        tmp_path = RUN_HISTORY_PATH.with_suffix(".json.tmp")
        tmp_path.write_text(json.dumps(history, ensure_ascii=True, indent=2), encoding="utf-8")
        tmp_path.replace(RUN_HISTORY_PATH)
        return None
    except Exception as exc:
        return f"Warning: run artifact persistence failed: {type(exc).__name__}: {exc}"
|
| 1345 |
+
|
| 1346 |
+
|
| 1347 |
+
def _refresh_recent_runs(summary: Dict[str, Any], log_lines: List[str]) -> None:
    """Persist the run's artifacts, surface any warning, refresh the runs cache."""
    problem = persist_run_artifacts(summary)
    if problem:
        append_log(log_lines, problem)
    summary["recent_runs"] = load_run_history(limit=RECENT_RUNS_VISUAL_LIMIT)
|
| 1352 |
+
|
| 1353 |
+
|
| 1354 |
+
def _run_result_badge_class(result_text: str) -> str:
|
| 1355 |
+
normalized = (result_text or "").strip().lower()
|
| 1356 |
+
if normalized in {"completed", "preflight_passed"}:
|
| 1357 |
+
return "ok"
|
| 1358 |
+
if normalized in {"failed", "error"}:
|
| 1359 |
+
return "fail"
|
| 1360 |
+
if normalized in {"cancelled", "canceled"}:
|
| 1361 |
+
return "cancel"
|
| 1362 |
+
return "neutral"
|
| 1363 |
+
|
| 1364 |
+
|
| 1365 |
+
def _build_recent_runs_panel(summary: Dict[str, Any]) -> str:
    """Render the 'Recent Runs' HTML list from the summary's cache or disk.

    Fix: the badge class was previously computed from the *display* text,
    which has underscores replaced by spaces — so "preflight_passed" became
    "preflight passed" and fell through to the unstyled "neutral" badge
    instead of "ok". The class is now derived from the raw result value
    while the displayed label is unchanged.
    """
    raw_runs = summary.get("recent_runs")
    if not isinstance(raw_runs, list) or not raw_runs:
        raw_runs = load_run_history(limit=RECENT_RUNS_VISUAL_LIMIT)
    entries = [item for item in raw_runs if isinstance(item, dict)]
    if not entries:
        return "<div class='ops-v-small ops-empty'>No completed cycles recorded yet.</div>"

    lines: List[str] = []
    for entry in entries[:RECENT_RUNS_VISUAL_LIMIT]:
        run_label = html.escape(str(entry.get("run_label") or "--"))
        # Classify on the raw result; prettify (underscores -> spaces) only
        # for display.
        raw_result = str(entry.get("result") or "unknown").strip().lower()
        badge_cls = _run_result_badge_class(raw_result)
        badge_label = html.escape(raw_result.replace("_", " "))
        evaluation = _as_dict(entry.get("evaluation"))
        pass_1 = _fmt_pct(evaluation.get("pass_at_1"))
        pass_k = _fmt_pct(evaluation.get("pass_at_k"))
        rows = _safe_int(evaluation.get("evaluated_rows"), 0)
        # -1 is the "unknown duration" sentinel written by the normalizer.
        duration_seconds = _safe_int(entry.get("duration_seconds"), -1)
        duration_text = "--" if duration_seconds < 0 else _format_duration(duration_seconds)
        finished_at = html.escape(str(entry.get("finished_at_utc") or "--"))
        lines.append(
            "<li class='ops-run-item'>"
            "<div class='ops-run-head'>"
            f"<span class='ops-run-label'>{run_label}</span>"
            f"<span class='ops-run-badge {badge_cls}'>{badge_label}</span>"
            "</div>"
            f"<div class='ops-v-small'>duration {html.escape(duration_text)} | pass@1 {pass_1} | "
            f"pass@k {pass_k} | rows {rows}</div>"
            f"<div class='ops-v-small'>finished {finished_at}</div>"
            "</li>"
        )
    return f"<ul class='ops-run-list'>{''.join(lines)}</ul>"
|
| 1398 |
+
|
| 1399 |
+
|
| 1400 |
+
def _build_artifact_index(summary: Dict[str, Any]) -> str:
    """Render the 'Artifact Index' HTML: paths to config, summaries, adapter.

    Missing paths are shown as '--'; present ones are HTML-escaped verbatim.
    """
    runtime_cfg = str(summary.get("runtime_config") or "").strip()
    training_summary_path = str(summary.get("training_summary_path") or "").strip()
    evaluation = _as_dict(summary.get("evaluation"))
    eval_report_path = str(evaluation.get("report_path") or "").strip()
    # The adapter directory is a fixed location, not taken from the summary.
    adapter_path = str(TRAIN_OUTPUT_DIR / "final_adapter")

    def _display_path(path_text: str) -> str:
        # Empty path -> placeholder; otherwise escape for safe HTML embedding.
        if not path_text:
            return "--"
        return html.escape(path_text)

    return (
        "<div class='ops-v-small'>"
        f"config: {_display_path(runtime_cfg)}"
        "</div>"
        "<div class='ops-v-small'>"
        f"training summary: {_display_path(training_summary_path)}"
        "</div>"
        "<div class='ops-v-small'>"
        f"eval report: {_display_path(eval_report_path)}"
        "</div>"
        "<div class='ops-v-small'>"
        f"adapter dir: {_display_path(adapter_path)}"
        "</div>"
    )
|
| 1426 |
+
|
| 1427 |
+
|
| 1428 |
def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
|
| 1429 |
losses: List[float] = []
|
| 1430 |
for line in log_text.splitlines():
|
|
|
|
| 1531 |
}
|
| 1532 |
|
| 1533 |
|
| 1534 |
+
def _build_stage_timeline(summary: Dict[str, Any], stage_meta: Dict[str, Any]) -> str:
    """Render the stage-timeline HTML list for the configured stage window.

    Each stage in [start_stage, start_stage + stage_count) is shown as
    done / active / pending based on *stage_meta*. Stage names come from
    TEMPLATE_CFG["stages"] when available, else a "stage_NN" fallback.

    NOTE(review): the *summary* parameter is currently unread by this body;
    kept for call-site compatibility.
    """
    start_stage = max(1, _safe_int(stage_meta.get("start_stage"), 1))
    stage_count = max(1, _safe_int(stage_meta.get("stage_count"), TEMPLATE_STAGE_COUNT))
    completed = max(0, _safe_int(stage_meta.get("completed"), 0))
    active_stage = _safe_int(stage_meta.get("active_stage"), 0)

    stage_cfg = TEMPLATE_CFG.get("stages", [])
    if not isinstance(stage_cfg, list):
        stage_cfg = []
    lines: List[str] = []

    for stage_index in range(start_stage, start_stage + stage_count):
        # Fallback label; replaced by the configured stage name when present.
        stage_name = f"stage_{stage_index:02d}"
        if 0 <= stage_index - 1 < len(stage_cfg):
            stage_data = stage_cfg[stage_index - 1]
            if isinstance(stage_data, dict):
                candidate = str(stage_data.get("name") or "").strip()
                if candidate:
                    stage_name = candidate

        # Position of this stage within the selected window (1-based).
        relative = stage_index - start_stage + 1
        if relative <= completed:
            cls = "done"
            marker = "done"
        elif active_stage == stage_index:
            cls = "active"
            marker = "active"
        else:
            cls = "pending"
            marker = "pending"

        # The marker text inside .ops-stage-dot is invisible (CSS sets
        # color: transparent); state is conveyed by the dot's styling.
        lines.append(
            f"<li class='ops-stage-item {cls}'><span class='ops-stage-dot'>{marker}</span>"
            f"<span class='ops-stage-label'>{stage_index}. {html.escape(stage_name)}</span></li>"
        )

    if not lines:
        return "<div class='ops-v-small ops-empty'>No stage layout available.</div>"
    return f"<ul class='ops-stage-list'>{''.join(lines)}</ul>"
|
| 1573 |
+
|
| 1574 |
+
|
| 1575 |
def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str) -> str:
|
| 1576 |
safe_summary = _as_dict(summary)
|
| 1577 |
runtime = _as_dict(safe_summary.get("runtime"))
|
| 1578 |
quality_gate = _as_dict(safe_summary.get("quality_gate"))
|
| 1579 |
evaluation = _as_dict(safe_summary.get("evaluation"))
|
| 1580 |
push_report = _as_dict(safe_summary.get("push"))
|
| 1581 |
+
error_payload = _as_dict(safe_summary.get("error"))
|
| 1582 |
|
| 1583 |
run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
|
| 1584 |
status_value = html.escape(status_text or "Idle")
|
| 1585 |
runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU FALLBACK"
|
| 1586 |
+
compute_mode = str(safe_summary.get("compute_mode") or "").strip().lower()
|
| 1587 |
+
if compute_mode == "gpu":
|
| 1588 |
+
runtime_mode = "GPU TRAINING"
|
| 1589 |
+
elif compute_mode == "cpu_fallback":
|
| 1590 |
+
runtime_mode = "CPU FALLBACK"
|
| 1591 |
runtime_mode = html.escape(runtime_mode)
|
| 1592 |
device_count = _safe_int(runtime.get("cuda_device_count"), 0)
|
| 1593 |
+
duration_text = html.escape(_runtime_duration_text(safe_summary))
|
| 1594 |
+
started_at = html.escape(str(safe_summary.get("started_at_utc") or "--"))
|
| 1595 |
|
| 1596 |
gate_enabled = bool(quality_gate.get("enabled"))
|
| 1597 |
gate_passed = quality_gate.get("passed")
|
|
|
|
| 1620 |
pass_k = _fmt_pct(evaluation.get("pass_at_k"))
|
| 1621 |
pass_1 = _fmt_pct(evaluation.get("pass_at_1"))
|
| 1622 |
exact_k = _fmt_pct(evaluation.get("exact_at_k"))
|
| 1623 |
+
evaluated_rows = _safe_int(evaluation.get("evaluated_rows"), 0)
|
| 1624 |
|
| 1625 |
push_state = "Pending"
|
| 1626 |
if push_report:
|
|
|
|
| 1633 |
else:
|
| 1634 |
push_state = "Blocked"
|
| 1635 |
|
| 1636 |
+
profile_flags = [
|
| 1637 |
+
("autonomous", bool(safe_summary.get("autonomous_mode"))),
|
| 1638 |
+
("continuous", "Cycle" in (status_text or "")),
|
| 1639 |
+
("preflight", bool(safe_summary.get("preflight_only"))),
|
| 1640 |
+
("post_eval", bool(safe_summary.get("run_eval"))),
|
| 1641 |
+
("quality_gate", bool(safe_summary.get("enforce_quality_gate"))),
|
| 1642 |
+
("push", bool(safe_summary.get("push_to_hub"))),
|
| 1643 |
+
]
|
| 1644 |
+
profile_chips = "".join(
|
| 1645 |
+
f"<span class='ops-chip {'on' if enabled else 'off'}'>{name}:{'on' if enabled else 'off'}</span>"
|
| 1646 |
+
for name, enabled in profile_flags
|
| 1647 |
+
)
|
| 1648 |
+
stage_timeline = _build_stage_timeline(safe_summary, stage_meta)
|
| 1649 |
+
recent_runs = _build_recent_runs_panel(safe_summary)
|
| 1650 |
+
artifact_index = _build_artifact_index(safe_summary)
|
| 1651 |
+
|
| 1652 |
+
error_message = str(error_payload.get("message") or "").strip()
|
| 1653 |
+
if error_message:
|
| 1654 |
+
if len(error_message) > 130:
|
| 1655 |
+
error_message = error_message[:127] + "..."
|
| 1656 |
+
error_text = html.escape(error_message)
|
| 1657 |
+
else:
|
| 1658 |
+
error_text = "none"
|
| 1659 |
+
|
| 1660 |
return f"""
|
| 1661 |
<div class="ops-visual">
|
| 1662 |
<div class="ops-visual-head">
|
|
|
|
| 1674 |
<div class="ops-v">{runtime_mode}</div>
|
| 1675 |
<div class="ops-v-small">cuda devices: {device_count}</div>
|
| 1676 |
</div>
|
| 1677 |
+
<div class="ops-card">
|
| 1678 |
+
<div class="ops-k">Runtime Window</div>
|
| 1679 |
+
<div class="ops-v">{duration_text}</div>
|
| 1680 |
+
<div class="ops-v-small">start: {started_at}</div>
|
| 1681 |
+
</div>
|
| 1682 |
<div class="ops-card">
|
| 1683 |
<div class="ops-k">Stage Progress</div>
|
| 1684 |
<div class="ops-v">{stage_meta['completed']} / {stage_meta['stage_count']}</div>
|
|
|
|
| 1693 |
<div class="ops-card">
|
| 1694 |
<div class="ops-k">Eval pass@k</div>
|
| 1695 |
<div class="ops-v">{pass_k}</div>
|
| 1696 |
+
<div class="ops-v-small">pass@1 {pass_1} | exact@k {exact_k} | rows {evaluated_rows}</div>
|
| 1697 |
</div>
|
| 1698 |
<div class="ops-card">
|
| 1699 |
+
<div class="ops-k">Run Profile</div>
|
| 1700 |
+
<div class="ops-chip-list">{profile_chips}</div>
|
| 1701 |
+
<div class="ops-v-small">result: {html.escape(str(safe_summary.get('result') or 'pending'))}</div>
|
| 1702 |
+
</div>
|
| 1703 |
+
<div class="ops-card">
|
| 1704 |
+
<div class="ops-k">Last Error</div>
|
| 1705 |
+
<div class="ops-v-small">{error_text}</div>
|
| 1706 |
+
<div class="ops-v-small">status: {status_value}</div>
|
| 1707 |
+
</div>
|
| 1708 |
+
<div class="ops-card ops-card-wide">
|
| 1709 |
+
<div class="ops-k">Stage Timeline</div>
|
| 1710 |
+
{stage_timeline}
|
| 1711 |
+
</div>
|
| 1712 |
+
<div class="ops-card ops-card-wide">
|
| 1713 |
+
<div class="ops-k">Recent Runs</div>
|
| 1714 |
+
{recent_runs}
|
| 1715 |
+
</div>
|
| 1716 |
+
<div class="ops-card ops-card-wide">
|
| 1717 |
+
<div class="ops-k">Artifact Index</div>
|
| 1718 |
+
{artifact_index}
|
| 1719 |
+
</div>
|
| 1720 |
+
<div class="ops-card ops-card-wide">
|
| 1721 |
<div class="ops-k">Loss Stream</div>
|
| 1722 |
{sparkline_html}
|
| 1723 |
</div>
|
|
|
|
| 1766 |
return value
|
| 1767 |
|
| 1768 |
|
| 1769 |
+
def validate_stage_window(stage_start: int, stage_count: int) -> Tuple[int, int]:
|
| 1770 |
+
if stage_start < 1:
|
| 1771 |
+
raise ValueError("Start stage must be >= 1.")
|
| 1772 |
+
if stage_start > TEMPLATE_STAGE_COUNT:
|
| 1773 |
+
raise ValueError(f"Start stage must be <= {TEMPLATE_STAGE_COUNT}.")
|
| 1774 |
+
if stage_count < 1:
|
| 1775 |
+
raise ValueError("How many stages must be >= 1.")
|
| 1776 |
+
max_count = TEMPLATE_STAGE_COUNT - stage_start + 1
|
| 1777 |
+
if stage_count > max_count:
|
| 1778 |
+
raise ValueError(f"Stage count exceeds available stages from start stage (max {max_count}).")
|
| 1779 |
+
return stage_start, stage_count
|
| 1780 |
+
|
| 1781 |
+
|
| 1782 |
def ensure_workspace() -> None:
    """Create every workspace directory the pipeline writes to.

    Idempotent: each mkdir uses parents=True/exist_ok=True, so repeated
    calls (and partially-created trees) are safe.
    """
    workspace_dirs = (
        DATA_DIR,
        RUNTIME_DIR,
        HF_HOME_DIR,
        HF_DATASETS_CACHE_DIR,
        HF_HUB_CACHE_DIR,
        RUN_RECORDS_DIR,
    )
    for directory in workspace_dirs:
        directory.mkdir(parents=True, exist_ok=True)
|
| 1789 |
|
| 1790 |
|
| 1791 |
def run_runtime_snapshot() -> Dict[str, Any]:
|
|
|
|
| 2087 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 2088 |
log_lines: List[str] = []
|
| 2089 |
summary: Dict[str, Any] = {}
|
| 2090 |
+
run_label = dt.datetime.now(dt.timezone.utc).strftime("run-%Y%m%d-%H%M%S")
|
| 2091 |
|
| 2092 |
if not begin_run(run_label):
|
| 2093 |
append_log(log_lines, "A run is already in progress. Wait for it to finish or click Stop.")
|
|
|
|
| 2130 |
force_redownload = False
|
| 2131 |
preflight_only = False
|
| 2132 |
|
| 2133 |
+
stage_start, stage_count = validate_stage_window(stage_start, stage_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2134 |
if eval_k < 1:
|
| 2135 |
raise ValueError("Eval K must be >= 1.")
|
| 2136 |
if eval_samples < 1:
|
|
|
|
| 2170 |
"force_redownload": bool(force_redownload),
|
| 2171 |
"preflight_only": bool(preflight_only),
|
| 2172 |
"runtime": runtime,
|
| 2173 |
+
"recent_runs": load_run_history(limit=RECENT_RUNS_VISUAL_LIMIT),
|
| 2174 |
}
|
| 2175 |
)
|
| 2176 |
|
|
|
|
| 2254 |
env.pop("HF_TOKEN", None)
|
| 2255 |
env.pop("HUGGINGFACE_HUB_TOKEN", None)
|
| 2256 |
env["PYTHONUNBUFFERED"] = "1"
|
| 2257 |
+
env.setdefault("HF_HOME", str(HF_HOME_DIR))
|
| 2258 |
+
env.setdefault("HF_DATASETS_CACHE", str(HF_DATASETS_CACHE_DIR))
|
| 2259 |
+
env.setdefault("HUGGINGFACE_HUB_CACHE", str(HF_HUB_CACHE_DIR))
|
| 2260 |
|
| 2261 |
train_cmd = [
|
| 2262 |
sys.executable,
|
|
|
|
| 2293 |
summary["result"] = "cancelled"
|
| 2294 |
summary["finished_at_utc"] = now_ts()
|
| 2295 |
append_log(log_lines, "Run cancelled by user.")
|
| 2296 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2297 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 2298 |
return
|
| 2299 |
|
|
|
|
| 2301 |
summary["result"] = "failed"
|
| 2302 |
summary["failure_stage"] = "training"
|
| 2303 |
summary["finished_at_utc"] = now_ts()
|
| 2304 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2305 |
yield "\n".join(log_lines), "Failed", summary_text(summary)
|
| 2306 |
return
|
| 2307 |
|
|
|
|
| 2309 |
summary["result"] = "preflight_passed"
|
| 2310 |
summary["finished_at_utc"] = now_ts()
|
| 2311 |
append_log(log_lines, "Validation mode completed successfully.")
|
| 2312 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2313 |
yield "\n".join(log_lines), "Preflight complete", summary_text(summary)
|
| 2314 |
return
|
| 2315 |
|
|
|
|
| 2398 |
summary["result"] = "cancelled"
|
| 2399 |
summary["finished_at_utc"] = now_ts()
|
| 2400 |
append_log(log_lines, "Run cancelled by user.")
|
| 2401 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2402 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 2403 |
return
|
| 2404 |
|
|
|
|
| 2406 |
summary["result"] = "failed"
|
| 2407 |
summary["failure_stage"] = "evaluation"
|
| 2408 |
summary["finished_at_utc"] = now_ts()
|
| 2409 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2410 |
yield "\n".join(log_lines), "Failed", summary_text(summary)
|
| 2411 |
return
|
| 2412 |
|
|
|
|
| 2427 |
summary["result"] = "completed"
|
| 2428 |
summary["finished_at_utc"] = now_ts()
|
| 2429 |
append_log(log_lines, "Pipeline completed.")
|
| 2430 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2431 |
yield "\n".join(log_lines), "Completed", summary_text(summary)
|
| 2432 |
except Exception as exc:
|
| 2433 |
cancelled = is_cancel_requested() or str(exc) == "Run cancelled by user."
|
| 2434 |
+
trace = traceback.format_exc()
|
| 2435 |
summary["result"] = "cancelled" if cancelled else "failed"
|
| 2436 |
+
summary["error"] = {
|
| 2437 |
+
"type": type(exc).__name__,
|
| 2438 |
+
"message": str(exc),
|
| 2439 |
+
"traceback": trace[-12000:],
|
| 2440 |
+
}
|
| 2441 |
summary["finished_at_utc"] = now_ts()
|
| 2442 |
append_log(
|
| 2443 |
log_lines,
|
| 2444 |
f"Pipeline {'cancelled' if cancelled else 'failed'}: {type(exc).__name__}: {exc}",
|
| 2445 |
)
|
| 2446 |
+
if trace.strip():
|
| 2447 |
+
append_log(log_lines, trace.rstrip())
|
| 2448 |
+
_refresh_recent_runs(summary, log_lines)
|
| 2449 |
yield "\n".join(log_lines), "Cancelled" if cancelled else "Failed", summary_text(summary)
|
| 2450 |
finally:
|
| 2451 |
finish_run()
|
|
|
|
| 2471 |
preflight_only: bool,
|
| 2472 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 2473 |
cycle_index = 1
|
| 2474 |
+
consecutive_failures = 0
|
| 2475 |
continuous_mode = bool(continuous_mode)
|
| 2476 |
if preflight_only and continuous_mode:
|
| 2477 |
continuous_mode = False
|
|
|
|
| 2539 |
yield compose_ops_console(session_logs, final_summary_json), stop_status, final_visual
|
| 2540 |
break
|
| 2541 |
|
| 2542 |
+
if final_result in {"completed", "preflight_passed"}:
|
| 2543 |
+
consecutive_failures = 0
|
| 2544 |
+
else:
|
| 2545 |
+
consecutive_failures += 1
|
| 2546 |
+
if consecutive_failures >= CONTINUOUS_MAX_CONSECUTIVE_FAILURES:
|
| 2547 |
+
session_logs = _merge_log_chunk(
|
| 2548 |
+
session_logs,
|
| 2549 |
+
f"[{now_ts()}] Continuous mode halted after {consecutive_failures} consecutive non-success cycles.",
|
| 2550 |
+
)
|
| 2551 |
+
stop_status = f"Cycle {cycle_index}: halted"
|
| 2552 |
+
yield compose_ops_console(session_logs, final_summary_json), stop_status, final_visual
|
| 2553 |
+
break
|
| 2554 |
+
|
| 2555 |
session_logs = _merge_log_chunk(
|
| 2556 |
session_logs,
|
| 2557 |
f"[{now_ts()}] Continuous mode: cycle {cycle_index} finished with result="
|
|
|
|
| 2559 |
)
|
| 2560 |
restart_status = f"Cycle {cycle_index}: restarting"
|
| 2561 |
yield compose_ops_console(session_logs, final_summary_json), restart_status, final_visual
|
| 2562 |
+
if CONTINUOUS_RESTART_DELAY_SECONDS > 0:
|
| 2563 |
+
cooldown_s = CONTINUOUS_RESTART_DELAY_SECONDS
|
| 2564 |
+
deadline = time.monotonic() + cooldown_s
|
| 2565 |
+
session_logs = _merge_log_chunk(
|
| 2566 |
+
session_logs,
|
| 2567 |
+
f"[{now_ts()}] Continuous mode cooldown: waiting {cooldown_s}s before next cycle.",
|
| 2568 |
+
)
|
| 2569 |
+
while True:
|
| 2570 |
+
remaining = int(max(0, round(deadline - time.monotonic())))
|
| 2571 |
+
cooldown_status = f"Cycle {cycle_index}: cooldown {remaining}s"
|
| 2572 |
+
yield compose_ops_console(session_logs, final_summary_json), cooldown_status, final_visual
|
| 2573 |
+
if remaining <= 0:
|
| 2574 |
+
break
|
| 2575 |
+
if is_cancel_requested():
|
| 2576 |
+
session_logs = _merge_log_chunk(
|
| 2577 |
+
session_logs,
|
| 2578 |
+
f"[{now_ts()}] Continuous mode cooldown interrupted by user cancellation.",
|
| 2579 |
+
)
|
| 2580 |
+
stop_status = f"Cycle {cycle_index}: stopped"
|
| 2581 |
+
yield compose_ops_console(session_logs, final_summary_json), stop_status, final_visual
|
| 2582 |
+
return
|
| 2583 |
+
time.sleep(1)
|
| 2584 |
cycle_index += 1
|
| 2585 |
|
| 2586 |
|
requirements.txt
CHANGED
|
@@ -8,3 +8,4 @@ bitsandbytes>=0.45.0,<1
|
|
| 8 |
huggingface_hub>=0.26.0,<1
|
| 9 |
pyyaml>=6.0.2,<7
|
| 10 |
sentencepiece>=0.2.0,<1
|
|
|
|
|
|
| 8 |
huggingface_hub>=0.26.0,<1
|
| 9 |
pyyaml>=6.0.2,<7
|
| 10 |
sentencepiece>=0.2.0,<1
|
| 11 |
+
protobuf>=4.25.0,<6
|
scripts/eval_sota.py
CHANGED
|
@@ -326,7 +326,13 @@ def load_model_and_tokenizer(
|
|
| 326 |
adapter_path: Optional[Path],
|
| 327 |
trust_remote_code: bool,
|
| 328 |
) -> Tuple[Any, AutoTokenizer]:
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
if tokenizer.pad_token is None:
|
| 331 |
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
|
| 332 |
if tokenizer.pad_token is None:
|
|
|
|
| 326 |
adapter_path: Optional[Path],
|
| 327 |
trust_remote_code: bool,
|
| 328 |
) -> Tuple[Any, AutoTokenizer]:
|
| 329 |
+
try:
|
| 330 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code, use_fast=True)
|
| 331 |
+
except ImportError as exc:
|
| 332 |
+
if "protobuf" not in str(exc).lower():
|
| 333 |
+
raise
|
| 334 |
+
print("protobuf missing for fast tokenizer. Retrying with use_fast=False.")
|
| 335 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code, use_fast=False)
|
| 336 |
if tokenizer.pad_token is None:
|
| 337 |
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
|
| 338 |
if tokenizer.pad_token is None:
|
scripts/preflight_check.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Production preflight checks for the Math Conjecture Trainer Space."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import importlib
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Any, Callable, Dict, List
|
| 15 |
+
|
| 16 |
+
import yaml
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 20 |
+
CONFIG_PATH = ROOT / "configs" / "deepseek_math_sota.yaml"
|
| 21 |
+
HF_HOME_DIR = ROOT / "workspace" / ".hf_home"
|
| 22 |
+
HF_DATASETS_CACHE_DIR = HF_HOME_DIR / "datasets"
|
| 23 |
+
HF_HUB_CACHE_DIR = HF_HOME_DIR / "hub"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class CheckResult:
    """Outcome of a single preflight check."""

    name: str  # check identifier, e.g. "required_files"
    ok: bool  # True when the check callable returned without raising
    detail: str  # success message, or "ExceptionType: message" on failure
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def check_required_files() -> str:
    """Verify that every file the Space needs at runtime exists on disk.

    Raises:
        FileNotFoundError: listing all missing paths, when any are absent.
    """
    required = (
        ROOT / "app.py",
        ROOT / "scripts" / "train_sota.py",
        ROOT / "scripts" / "eval_sota.py",
        CONFIG_PATH,
        ROOT / "requirements.txt",
    )
    missing: List[str] = []
    for candidate in required:
        if not candidate.exists():
            missing.append(str(candidate))
    if missing:
        raise FileNotFoundError("Missing required files: " + ", ".join(missing))
    return f"{len(required)} required files present."
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def check_config_shape() -> str:
    """Parse the training config and assert its top-level structure.

    Raises:
        ValueError: when the YAML root is not a mapping, a required section
            is absent, or the stage list is empty.
    """
    cfg = yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
    if not isinstance(cfg, dict):
        raise ValueError("Config root must be a mapping.")
    for section in ("model", "data", "stages"):
        if section not in cfg:
            raise ValueError(f"Missing config section: {section}")
    stages = cfg["stages"]
    if not (isinstance(stages, list) and stages):
        raise ValueError("Config must contain at least one stage.")
    return f"Config valid with {len(stages)} stage(s)."
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def check_python_imports() -> str:
    """Import every runtime dependency and report its version.

    Improvement over fail-fast probing: all modules are attempted before
    failing, so one run surfaces every missing dependency instead of only
    the first — callers (``run_checks``) still see a single exception.

    Raises:
        ImportError: when one or more required modules cannot be imported;
            the message lists each failure.
    """
    modules = [
        "gradio",
        "torch",
        "yaml",
        "huggingface_hub",
        "datasets",
        "transformers",
        "peft",
    ]
    versions: Dict[str, str] = {}
    failures: List[str] = []
    for module_name in modules:
        try:
            mod = importlib.import_module(module_name)
        except ImportError as exc:
            # Keep probing the rest so the report is complete.
            failures.append(f"{module_name} ({exc})")
        else:
            # Some modules (rarely) omit __version__; report "unknown" rather than fail.
            versions[module_name] = str(getattr(mod, "__version__", "unknown"))
    if failures:
        raise ImportError("Unimportable modules: " + "; ".join(failures))
    return "Imports OK: " + ", ".join(f"{k}={v}" for k, v in versions.items())
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def check_module_integrity() -> str:
    """Import the app/train/eval modules and probe their key entry points.

    Ensures the repository root is importable, then exercises
    ``app.run_runtime_snapshot`` and both config loaders to confirm they
    return well-formed dictionaries.
    """
    repo_root = str(ROOT)
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

    app_mod = importlib.import_module("app")
    train_mod = importlib.import_module("scripts.train_sota")
    eval_mod = importlib.import_module("scripts.eval_sota")

    snapshot = app_mod.run_runtime_snapshot()
    if not isinstance(snapshot, dict):
        raise ValueError("Runtime snapshot is not a dictionary.")
    # Both keys must be present; set comparison mirrors the pairwise check.
    if not {"python", "torch"} <= snapshot.keys():
        raise ValueError("Runtime snapshot missing expected keys.")

    loaded_cfgs = (train_mod.load_config(CONFIG_PATH), eval_mod.load_config(CONFIG_PATH))
    if not all(isinstance(cfg, dict) for cfg in loaded_cfgs):
        raise ValueError("Config loaders did not return dictionaries.")
    return "App/train/eval module imports and config loaders are healthy."
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def run_optional_training_dry_run(timeout_seconds: int) -> str:
    """Execute stage 1 of train_sota.py in --dry-run mode as a smoke test.

    HF caches are redirected into the workspace so the check never touches
    the user's home directory. Output is captured and the last 30 lines are
    attached to any failure for diagnosis.

    Args:
        timeout_seconds: hard wall-clock limit for the subprocess.

    Raises:
        RuntimeError: when the dry-run exits non-zero or exceeds the
            timeout; the message carries the tail of captured output.
    """
    for cache_dir in (HF_HOME_DIR, HF_DATASETS_CACHE_DIR, HF_HUB_CACHE_DIR):
        cache_dir.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env.setdefault("HF_HOME", str(HF_HOME_DIR))
    env.setdefault("HF_DATASETS_CACHE", str(HF_DATASETS_CACHE_DIR))
    env.setdefault("HUGGINGFACE_HUB_CACHE", str(HF_HUB_CACHE_DIR))

    cmd = [
        sys.executable,
        str(ROOT / "scripts" / "train_sota.py"),
        "--config",
        str(CONFIG_PATH),
        "--start-stage",
        "1",
        "--max-stages",
        "1",
        "--dry-run",
    ]
    try:
        completed = subprocess.run(
            cmd,
            cwd=str(ROOT),
            check=False,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # Fix: previously TimeoutExpired escaped raw; wrap it so the check
        # report includes the partial output of the hung dry-run.
        tail = "\n".join(str(exc.output or "").splitlines()[-30:])
        raise RuntimeError(
            f"Dry-run timed out after {timeout_seconds}s.\n{tail}"
        ) from exc
    if completed.returncode != 0:
        tail = "\n".join((completed.stdout or "").splitlines()[-30:])
        raise RuntimeError(f"Dry-run failed with exit code {completed.returncode}.\n{tail}")
    return "Optional training dry-run passed."
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def run_checks(checks: List[tuple[str, Callable[[], str]]]) -> List[CheckResult]:
    """Run each named check, converting any exception into a failed result.

    Args:
        checks: (name, callable) pairs; each callable returns a detail
            string on success or raises on failure.

    Returns:
        One CheckResult per input pair, in order.
    """
    results: List[CheckResult] = []
    for check_name, check_fn in checks:
        try:
            message = check_fn()
        except Exception as exc:
            # A failing check must not abort the run; record type + message.
            results.append(CheckResult(name=check_name, ok=False, detail=f"{type(exc).__name__}: {exc}"))
        else:
            results.append(CheckResult(name=check_name, ok=True, detail=message))
    return results
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def parse_args() -> argparse.Namespace:
    """Parse the preflight CLI flags from ``sys.argv``."""
    cli = argparse.ArgumentParser(description="Run production preflight checks for the Space trainer.")
    cli.add_argument(
        "--run-training-dry-run",
        action="store_true",
        help="Also execute scripts/train_sota.py in --dry-run mode (stage 1 only).",
    )
    cli.add_argument(
        "--dry-run-timeout-seconds",
        type=int,
        default=1800,
        help="Timeout for optional training dry-run step.",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        help="Print machine-readable JSON output.",
    )
    return cli.parse_args()
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def main() -> None:
    """Entry point: run all configured checks and report PASS/FAIL.

    Exits with status 1 (via SystemExit) when any check fails.
    """
    opts = parse_args()
    check_plan: List[tuple[str, Callable[[], str]]] = [
        ("required_files", check_required_files),
        ("config_shape", check_config_shape),
        ("python_imports", check_python_imports),
        ("module_integrity", check_module_integrity),
    ]
    if opts.run_training_dry_run:
        # Clamp to a 30s floor so a pathological "0" cannot neuter the step.
        timeout = max(30, opts.dry_run_timeout_seconds)
        check_plan.append(
            ("training_dry_run", lambda: run_optional_training_dry_run(timeout_seconds=timeout))
        )

    results = run_checks(check_plan)
    all_ok = all(r.ok for r in results)
    report: Dict[str, Any] = {
        "ok": all_ok,
        "checks": [{"name": r.name, "ok": r.ok, "detail": r.detail} for r in results],
    }

    if opts.json:
        print(json.dumps(report, ensure_ascii=True, indent=2))
    else:
        for r in results:
            print(f"[{'PASS' if r.ok else 'FAIL'}] {r.name}: {r.detail}")
        print("Overall:", "PASS" if all_ok else "FAIL")

    if not all_ok:
        raise SystemExit(1)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
if __name__ == "__main__":
|
| 204 |
+
main()
|
scripts/train_sota.py
CHANGED
|
@@ -430,11 +430,22 @@ def build_tokenizer(model_cfg: Dict[str, Any]) -> AutoTokenizer:
|
|
| 430 |
base_model = as_text(model_cfg.get("base_model"))
|
| 431 |
if not base_model:
|
| 432 |
raise ValueError("model.base_model is required.")
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
if tokenizer.pad_token is None:
|
| 439 |
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
|
| 440 |
if tokenizer.pad_token is None:
|
|
@@ -512,6 +523,12 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
|
|
| 512 |
return model, tokenizer
|
| 513 |
|
| 514 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
class WeightedLossCollator:
|
| 516 |
def __init__(self, tokenizer: AutoTokenizer, model: Any) -> None:
|
| 517 |
self.base = DataCollatorForSeq2Seq(
|
|
@@ -909,7 +926,14 @@ def main() -> None:
|
|
| 909 |
raise ValueError("Hub push requested but repo_id is missing.")
|
| 910 |
|
| 911 |
if args.dry_run:
|
| 912 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 913 |
model = None
|
| 914 |
else:
|
| 915 |
model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
|
|
|
|
| 430 |
base_model = as_text(model_cfg.get("base_model"))
|
| 431 |
if not base_model:
|
| 432 |
raise ValueError("model.base_model is required.")
|
| 433 |
+
trust_remote_code = bool(model_cfg.get("trust_remote_code", False))
|
| 434 |
+
try:
|
| 435 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 436 |
+
base_model,
|
| 437 |
+
trust_remote_code=trust_remote_code,
|
| 438 |
+
use_fast=True,
|
| 439 |
+
)
|
| 440 |
+
except ImportError as exc:
|
| 441 |
+
if "protobuf" not in str(exc).lower():
|
| 442 |
+
raise
|
| 443 |
+
print("protobuf missing for fast tokenizer. Retrying with use_fast=False.")
|
| 444 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 445 |
+
base_model,
|
| 446 |
+
trust_remote_code=trust_remote_code,
|
| 447 |
+
use_fast=False,
|
| 448 |
+
)
|
| 449 |
if tokenizer.pad_token is None:
|
| 450 |
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
|
| 451 |
if tokenizer.pad_token is None:
|
|
|
|
| 523 |
return model, tokenizer
|
| 524 |
|
| 525 |
|
| 526 |
+
class DryRunTokenizerFallback:
    """Minimal tokenizer-like object for dry-run prompt-format checks."""

    # No chat template: formatting code takes the plain-prompt fallback path.
    chat_template = None
|
| 530 |
+
|
| 531 |
+
|
| 532 |
class WeightedLossCollator:
|
| 533 |
def __init__(self, tokenizer: AutoTokenizer, model: Any) -> None:
|
| 534 |
self.base = DataCollatorForSeq2Seq(
|
|
|
|
| 926 |
raise ValueError("Hub push requested but repo_id is missing.")
|
| 927 |
|
| 928 |
if args.dry_run:
|
| 929 |
+
try:
|
| 930 |
+
tokenizer = build_tokenizer(cfg["model"])
|
| 931 |
+
except Exception as exc:
|
| 932 |
+
print(
|
| 933 |
+
"Dry-run warning: tokenizer load failed; using formatting-only fallback tokenizer. "
|
| 934 |
+
f"Reason: {type(exc).__name__}: {exc}"
|
| 935 |
+
)
|
| 936 |
+
tokenizer = DryRunTokenizerFallback()
|
| 937 |
model = None
|
| 938 |
else:
|
| 939 |
model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
|
tests/test_core_utils.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Production safety tests for key pipeline utilities."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import tempfile
|
| 8 |
+
import unittest
|
| 9 |
+
from unittest import mock
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import app
|
| 13 |
+
from scripts import eval_sota
|
| 14 |
+
from scripts import train_sota
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AppUtilityTests(unittest.TestCase):
    """Unit tests for app.py helper functions (validation, logging, HTML panels)."""

    def test_validate_repo_id_accepts_valid(self) -> None:
        # A well-formed "owner/name" repo id passes through unchanged.
        self.assertEqual(
            app.validate_repo_id("NorthernTribe-Research/math_trainer", "Model repo"),
            "NorthernTribe-Research/math_trainer",
        )

    def test_validate_repo_id_rejects_invalid(self) -> None:
        # Whitespace is not allowed in a repo id.
        with self.assertRaises(ValueError):
            app.validate_repo_id("invalid repo id", "Model repo")

    def test_merge_log_chunk_truncates(self) -> None:
        # Merged log must be capped at max_chars, keeping the newest text.
        merged = app._merge_log_chunk("a" * 9, "b" * 9, max_chars=10)
        self.assertEqual(len(merged), 10)
        self.assertTrue(merged.endswith("b" * 9))

    def test_build_stage_timeline_returns_list_markup(self) -> None:
        stage_meta = {"start_stage": 1, "stage_count": 2, "completed": 1, "active_stage": 2}
        html = app._build_stage_timeline({}, stage_meta)
        # The panel renders as a list with one item per stage.
        self.assertIn("ops-stage-list", html)
        self.assertIn("ops-stage-item", html)

    def test_validate_stage_window_rejects_overflow(self) -> None:
        # Starting at the last stage with count 2 exceeds the template.
        with self.assertRaises(ValueError):
            app.validate_stage_window(app.TEMPLATE_STAGE_COUNT, 2)

    def test_build_recent_runs_panel_markup(self) -> None:
        summary = {
            "recent_runs": [
                {
                    "run_label": "run-20260101-000000",
                    "result": "completed",
                    "duration_seconds": 42,
                    "finished_at_utc": "2026-01-01 00:00:42 UTC",
                    "evaluation": {"pass_at_1": 0.11, "pass_at_k": 0.27, "evaluated_rows": 128},
                }
            ]
        }
        html = app._build_recent_runs_panel(summary)
        # The run label and result should appear verbatim in the markup.
        self.assertIn("ops-run-list", html)
        self.assertIn("run-20260101-000000", html)
        self.assertIn("completed", html)

    def test_persist_run_artifacts_updates_history(self) -> None:
        # Redirect history/record paths into a temp dir so no workspace
        # state is touched, then verify both artifacts are written.
        with tempfile.TemporaryDirectory() as tmpdir:
            history_path = Path(tmpdir) / "run_history.json"
            records_dir = Path(tmpdir) / "run_records"
            summary = {
                "run_label": "run-20260102-030405",
                "result": "completed",
                "started_at_utc": "2026-01-02 03:04:05 UTC",
                "finished_at_utc": "2026-01-02 03:04:35 UTC",
                "evaluation": {"pass_at_1": 0.1, "pass_at_k": 0.2, "evaluated_rows": 64},
            }

            with mock.patch.object(app, "RUN_HISTORY_PATH", history_path):
                with mock.patch.object(app, "RUN_RECORDS_DIR", records_dir):
                    warning = app.persist_run_artifacts(summary)

            # None means persistence succeeded without a warning message.
            self.assertIsNone(warning)
            self.assertTrue(history_path.exists())
            payload = json.loads(history_path.read_text(encoding="utf-8"))
            self.assertEqual(payload[0]["run_label"], "run-20260102-030405")
            self.assertEqual(payload[0]["result"], "completed")
            self.assertTrue((records_dir / "run-20260102-030405.json").exists())
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class EvalUtilityTests(unittest.TestCase):
    """Unit tests for eval_sota answer-parsing/matching helpers."""

    def test_parse_numeric_fraction(self) -> None:
        # Fractions like "3/4" should parse to their float value.
        value = eval_sota.parse_numeric_value("3/4")
        self.assertIsNotNone(value)
        assert value is not None  # narrows Optional for the comparison below
        self.assertAlmostEqual(value, 0.75, places=8)

    def test_match_candidate_boxed(self) -> None:
        # A \boxed{...} answer should match the reference "42".
        result = eval_sota.match_candidate(r"\boxed{42}", ["42"])
        self.assertTrue(result["match"])
        self.assertTrue(result["boxed"] or result["exact"])
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TrainUtilityTests(unittest.TestCase):
    """Unit tests for train_sota config coercion and tokenizer fallback."""

    def test_as_bool_conversions(self) -> None:
        self.assertTrue(train_sota.as_bool("yes"))
        self.assertFalse(train_sota.as_bool("no"))
        self.assertTrue(train_sota.as_bool(True))
        self.assertFalse(train_sota.as_bool(None, default=False))

    def test_build_tokenizer_falls_back_when_protobuf_missing(self) -> None:
        # Stand-in tokenizer exposing just the attributes build_tokenizer touches.
        class DummyTokenizer:
            def __init__(self) -> None:
                self.pad_token = None
                self.eos_token = "<eos>"
                self.unk_token = "<unk>"

            def add_special_tokens(self, tokens):
                self.pad_token = tokens.get("pad_token")

        calls = []

        # First call (use_fast=True) fails like a missing-protobuf env;
        # the retry (use_fast=False) succeeds.
        def fake_from_pretrained(*args, **kwargs):
            calls.append(kwargs.get("use_fast"))
            if kwargs.get("use_fast"):
                raise ImportError("requires the protobuf library")
            return DummyTokenizer()

        with mock.patch.object(train_sota.AutoTokenizer, "from_pretrained", side_effect=fake_from_pretrained):
            tok = train_sota.build_tokenizer({"base_model": "dummy/model", "trust_remote_code": False})

        # Exactly one fast attempt followed by one slow retry.
        self.assertEqual(calls, [True, False])
        self.assertEqual(tok.pad_token, "<eos>")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class EvalTokenizerFallbackTests(unittest.TestCase):
    """Verifies eval_sota's slow-tokenizer fallback when protobuf is absent."""

    def test_eval_tokenizer_falls_back_when_protobuf_missing(self) -> None:
        # Stand-in tokenizer exposing only the attributes the loader touches.
        class DummyTokenizer:
            def __init__(self) -> None:
                self.pad_token = None
                self.eos_token = "<eos>"
                self.unk_token = "<unk>"

            def add_special_tokens(self, tokens):
                self.pad_token = tokens.get("pad_token")

        class DummyModel:
            def eval(self):
                return None

        calls = []

        # Fast load raises like a missing-protobuf env; slow load succeeds.
        def fake_tok_from_pretrained(*args, **kwargs):
            calls.append(kwargs.get("use_fast"))
            if kwargs.get("use_fast"):
                raise ImportError("requires the protobuf library")
            return DummyTokenizer()

        with mock.patch.object(eval_sota.AutoTokenizer, "from_pretrained", side_effect=fake_tok_from_pretrained):
            with mock.patch.object(eval_sota.AutoModelForCausalLM, "from_pretrained", return_value=DummyModel()):
                model, tok = eval_sota.load_model_and_tokenizer(
                    base_model="dummy/model",
                    adapter_path=None,
                    trust_remote_code=False,
                )

        self.assertIsNotNone(model)
        # One fast attempt, then one use_fast=False retry.
        self.assertEqual(calls, [True, False])
        self.assertEqual(tok.pad_token, "<eos>")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class ContinuousModeSafetyTests(unittest.TestCase):
    """End-to-end tests of continuous-mode safety rails in app.run_pipeline."""

    def test_continuous_mode_halts_after_consecutive_failures(self) -> None:
        # Tighten the failure budget and remove cooldown so the halt
        # triggers quickly; restore module globals afterwards.
        original_max = app.CONTINUOUS_MAX_CONSECUTIVE_FAILURES
        original_delay = app.CONTINUOUS_RESTART_DELAY_SECONDS
        app.CONTINUOUS_MAX_CONSECUTIVE_FAILURES = 2
        app.CONTINUOUS_RESTART_DELAY_SECONDS = 0
        self.addCleanup(setattr, app, "CONTINUOUS_MAX_CONSECUTIVE_FAILURES", original_max)
        self.addCleanup(setattr, app, "CONTINUOUS_RESTART_DELAY_SECONDS", original_delay)

        # Every cycle reports "failed" to drive the consecutive-failure counter.
        def fake_pipeline_core(**kwargs):
            summary = json.dumps({"result": "failed"})
            yield "line-1", "Failed", summary

        with mock.patch.object(app, "run_pipeline_core", side_effect=fake_pipeline_core):
            outputs = list(
                app.run_pipeline(
                    dataset_repo_id="owner/dataset",
                    model_repo_id="owner/model",
                    base_model_id="model/base",
                    autonomous_mode=False,
                    continuous_mode=True,
                    start_stage=1,
                    max_stages=1,
                    run_eval=False,
                    eval_k=1,
                    eval_samples=50,
                    enforce_quality_gate=False,
                    gate_min_pass_at_1=0.0,
                    gate_min_pass_at_k=0.0,
                    gate_min_rows=10,
                    push_to_hub=False,
                    force_redownload=False,
                    preflight_only=False,
                )
            )

        self.assertGreaterEqual(len(outputs), 3)
        # The final status line must announce the halt.
        last_status = outputs[-1][1]
        self.assertIn("halted", last_status.lower())

    def test_continuous_mode_cooldown_stops_on_cancel(self) -> None:
        # Non-zero cooldown so the cancel check inside the cooldown loop runs.
        original_max = app.CONTINUOUS_MAX_CONSECUTIVE_FAILURES
        original_delay = app.CONTINUOUS_RESTART_DELAY_SECONDS
        app.CONTINUOUS_MAX_CONSECUTIVE_FAILURES = 3
        app.CONTINUOUS_RESTART_DELAY_SECONDS = 1
        self.addCleanup(setattr, app, "CONTINUOUS_MAX_CONSECUTIVE_FAILURES", original_max)
        self.addCleanup(setattr, app, "CONTINUOUS_RESTART_DELAY_SECONDS", original_delay)

        # Cycles succeed, so only the user cancellation can stop the loop.
        def fake_pipeline_core(**kwargs):
            summary = json.dumps({"result": "completed"})
            yield "line-1", "Completed", summary

        with mock.patch.object(app, "run_pipeline_core", side_effect=fake_pipeline_core):
            with mock.patch.object(app, "is_cancel_requested", return_value=True):
                outputs = list(
                    app.run_pipeline(
                        dataset_repo_id="owner/dataset",
                        model_repo_id="owner/model",
                        base_model_id="model/base",
                        autonomous_mode=False,
                        continuous_mode=True,
                        start_stage=1,
                        max_stages=1,
                        run_eval=False,
                        eval_k=1,
                        eval_samples=50,
                        enforce_quality_gate=False,
                        gate_min_pass_at_1=0.0,
                        gate_min_pass_at_k=0.0,
                        gate_min_rows=10,
                        push_to_hub=False,
                        force_redownload=False,
                        preflight_only=False,
                    )
                )

        self.assertGreaterEqual(len(outputs), 3)
        # Cancellation during cooldown must surface as a "stopped" status.
        self.assertIn("stopped", outputs[-1][1].lower())
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
if __name__ == "__main__":
|
| 246 |
+
unittest.main(verbosity=2)
|