# /// script
# requires-python = ">=3.11"
# dependencies = [
# "httpx",
# "huggingface_hub",
# ]
# ///
"""
Regenerate data.json and upload to the elevow/benchmarks Space.
Source template: duplicated from davanstrien/benchmark-race
https://huggingface.co/spaces/elevow/benchmarks
**Single file:** All Aligned race branding, axis relabeling, optional org-groq tagging, and
offline ``patch_output_dict`` live here (no separate inject script).
1. Add HF ``model_id`` strings to ``MODEL_IDS_ALIGNED_ON_RACE`` (exact strings — use
``DUMP_MODEL_IDS=1`` once to list them). That rewrites ``short_name`` and sets ``race_logo_key``.
2. **Upload the forked** ``scripts/elevow-benchmarks/index.html`` **to your Space** (same folder as
``data.json``). Upstream benchmark-race ignores ``race_logo_key``; without this file you will
not see the Aligned logo or Aligned bar color.
Run locally (from repo root or this folder):
export HF_TOKEN=hf_...
uv run scripts/elevow-benchmarks/update_data.py
Or copy this file to your Space repo root on Hugging Face and run there.
Schedule on HF Jobs (example — point to YOUR raw file):
hf jobs scheduled uv run "0 8,20 * * *" \\
--secrets HF_TOKEN \\
https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py
Upload the forked UI in the same run as the data (a second commit, made right after data.json):
    UPLOAD_INDEX_HTML=1 uv run scripts/elevow-benchmarks/update_data.py
"""
from __future__ import annotations
import json
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
from huggingface_hub import HfApi
# Space that receives the generated files. Defaults to this fork (upstream used
# davanstrien/benchmark-race); override with the BENCHMARK_SPACE_REPO env var.
SPACE_REPO = os.getenv("BENCHMARK_SPACE_REPO", "elevow/benchmarks")

# Aligned branding: the logo URL (a Google s2 favicon lookup for tryaligned.ai),
# the key it is stored under in the `logos` / `colors` maps, and the bar color.
ALIGNED_LOGO_URL = "https://www.google.com/s2/favicons?sz=128&domain_url=https%3A%2F%2Ftryaligned.ai"
ALIGNED_LOGOS_KEY = "AlignedAI"
ALIGNED_COLOR = "#059669"
# Preferred switch: every model_id listed here gets the Aligned bar label, the
# Aligned race_logo_key and the Aligned bar color.
# Run once with DUMP_MODEL_IDS=1 to print every model_id the script saw, then
# paste the exact strings below.
MODEL_IDS_ALIGNED_ON_RACE: frozenset[str] = frozenset(
    (
        # "meta-llama/Llama-3.3-70B-Instruct",
        # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    )
)

# Legacy sets: they are unioned with MODEL_IDS_ALIGNED_ON_RACE, so an id placed
# in any of the three sets is treated the same way.
MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset()
MODEL_IDS_ALIGNED_AXIS_LABEL: frozenset[str] = frozenset()
def _all_branded_model_ids() -> frozenset[str]:
    """Return every model_id that receives Aligned branding.

    The result is the union of the preferred set and the two legacy sets.
    """
    return frozenset().union(
        MODEL_IDS_ALIGNED_ON_RACE,
        MODEL_IDS_USE_ALIGNED_LOGO,
        MODEL_IDS_ALIGNED_AXIS_LABEL,
    )
# Opt-in switch read by inject_aligned_race_branding: when True, every row whose
# HF org (the part of model_id before "/") is literally "groq" gets race_logo_key
# set to the Aligned key. Such rows are rare on leaderboards, so it defaults to off.
USE_ALIGNED_FOR_ORG_GROQ = False
# Copy-paste example for adding a synthetic Aligned row by hand (make sure the
# logos/colors maps cover the provider). This string is documentation only: no
# code visible in this file reads it.
SYNTHETIC_ALIGNED_ROW_EXAMPLE = r"""
# After building `models` for one benchmark, you may append:
# models.append({
# "model_id": "tryaligned/Aligned-AI",
# "short_name": "Aligned-AI",
# "provider": "tryaligned",
# "score": 0.0,
# "date": "2026-01-01",
# "race_logo_key": "AlignedAI",
# })
# Then ensure logos["AlignedAI"] is set and colors include "tryaligned".
"""
def aligned_groq_lane_for_model_id(model_id: str) -> str:
    """Classify an HF model_id into an Aligned "lane" name.

    Mirrors the client-side `alignedGroqLaneForRawModel` heuristics. Matching is
    case-insensitive substring matching and the first rule that fits wins:
    "scout" -> Vision, "coder" -> Code, "llama-3.1" together with "8b" -> Fast,
    anything else -> Reasoning.
    """
    lowered = model_id.lower()
    # (lane, substrings that must ALL be present), checked in priority order.
    rules = (
        ("Vision", ("scout",)),
        ("Code", ("coder",)),
        ("Fast", ("llama-3.1", "8b")),
    )
    for lane, needles in rules:
        if all(needle in lowered for needle in needles):
            return lane
    return "Reasoning"
def aligned_axis_label_from_model_id(model_id: str) -> str:
    """Build the bar label stored in `short_name` (benchmark-race reads `m.short_name`).

    The label has the form "Aligned AI — <lane> · <slug>", where the slug is the
    repo part of the model_id with dashes/underscores turned into single spaces.
    The slug is clipped once it exceeds 20 characters and the whole label once it
    exceeds 45; clipped text ends with an ellipsis.
    """

    def _clip(text: str, limit: int) -> str:
        # Keep two characters fewer than the limit, then append the ellipsis.
        return text if len(text) <= limit else f"{text[:limit - 2]}…"

    repo_name = model_id.rpartition("/")[2]
    spaced = repo_name.translate(str.maketrans("-_", "  "))
    slug = _clip(re.sub(r"\s+", " ", spaced).strip(), 20)
    lane = aligned_groq_lane_for_model_id(model_id)
    return _clip(f"Aligned AI — {lane} · {slug}", 45)
# Leaderboards to pull. Each entry carries the HF dataset id, the key used in
# data.json, the display name, and whether the dataset is gated (gated ones are
# only fetched when an HF token is available).
BENCHMARK_CONFIGS = [
    {"dataset": dataset, "key": key, "name": name, "gated": gated}
    for dataset, key, name, gated in (
        ("SWE-bench/SWE-bench_Verified", "sweVerified", "SWE-bench Verified", False),
        ("ScaleAI/SWE-bench_Pro", "swePro", "SWE-bench Pro", False),
        ("TIGER-Lab/MMLU-Pro", "mmluPro", "MMLU-Pro", False),
        ("Idavidrein/gpqa", "gpqa", "GPQA Diamond", True),
        ("cais/hle", "hle", "HLE", True),
        ("MathArena/aime_2026", "aime2026", "AIME 2026", False),
        ("MathArena/hmmt_feb_2026", "hmmt2026", "HMMT Feb 2026", False),
        ("allenai/olmOCR-bench", "olmOcr", "olmOCR-bench", False),
        ("harborframework/terminal-bench-2.0", "terminalBench", "Terminal-Bench 2.0", False),
        ("FutureMa/EvasionBench", "evasionBench", "EvasionBench", False),
    )
]

# Provider bar colors; providers are assigned colors round-robin from this list.
PALETTE = [
    "#6366f1",
    "#0d9488",
    "#d97706",
    "#e11d48",
    "#7c3aed",
    "#16a34a",
    "#2563eb",
    "#ea580c",
    "#8b5cf6",
    "#0891b2",
    "#c026d3",
    "#65a30d",
    "#dc2626",
    "#0284c7",
    "#a21caf",
    "#059669",
    "#9333ea",
    "#ca8a04",
    "#be185d",
    "#0369a1",
]
def inject_aligned_race_branding(
    benchmarks: dict[str, Any],
    logos: dict[str, str],
    color_map: dict[str, str],
) -> tuple[int, int]:
    """Apply Aligned branding to the benchmark tables in place.

    Always registers the Aligned logo URL and bar color under ALIGNED_LOGOS_KEY.
    Then, for each model row:

    * rows whose model_id is in one of the branded-id sets, or (when
      USE_ALIGNED_FOR_ORG_GROQ is on) whose org is "groq", get ``race_logo_key``;
    * rows in the branded-id sets additionally get ``chart_full_name`` (built from
      the original short name) and an Aligned ``short_name``.

    The function is safe to run on data that was already branded: the original
    name is then recovered from the model_id instead of the Aligned label.

    Args:
        benchmarks: mapping of benchmark key -> dict with a "models" list of rows.
        logos: logo-key -> URL map, mutated in place.
        color_map: color-key -> hex color map, mutated in place.

    Returns:
        (logo_tag_count, axis_relabel_count) for logging.
    """
    logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
    color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR

    # Loop-invariant: build the union of the id sets once, not once per row.
    branded_ids = _all_branded_model_ids()

    logo_n = 0
    axis_n = 0
    for bm in benchmarks.values():
        for m in bm.get("models") or []:
            mid = m.get("model_id") or ""
            parts = mid.split("/")
            provider, repo_name = parts[0], parts[-1]

            branded = mid in branded_ids
            use_groq_org = USE_ALIGNED_FOR_ORG_GROQ and provider.lower() == "groq"
            if not (branded or use_groq_org):
                continue

            m["race_logo_key"] = ALIGNED_LOGOS_KEY
            logo_n += 1
            if not branded:
                continue

            aligned_label = aligned_axis_label_from_model_id(mid)
            current_sn = m.get("short_name")
            if current_sn == aligned_label:
                # Row was relabeled by an earlier run; do not leak the Aligned
                # label into chart_full_name, fall back to the repo name instead.
                current_sn = None
            orig_sn = current_sn or repo_name
            m["chart_full_name"] = f"Published HF model: {orig_sn.replace('-', ' ')}"
            m["short_name"] = aligned_label
            axis_n += 1
    return logo_n, axis_n
def _upload_index_html_fork(api: HfApi) -> None:
    """Upload the forked index.html that sits next to this script, when asked to.

    Stock benchmark-race ignores race_logo_key, so the fork's UI has to be pushed
    to the Space as well. Nothing happens unless the UPLOAD_INDEX_HTML env var is
    one of "1", "true" or "yes" (case-insensitive). A missing file is reported
    and skipped.
    """
    enabled = os.environ.get("UPLOAD_INDEX_HTML", "").lower() in {"1", "true", "yes"}
    if not enabled:
        return

    script_dir = Path(__file__).resolve().parent
    index_path = script_dir / "index.html"
    if not index_path.is_file():
        print("UPLOAD_INDEX_HTML set but scripts/elevow-benchmarks/index.html is missing.")
        return

    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
    api.upload_file(
        path_or_fileobj=str(index_path),
        path_in_repo="index.html",
        repo_id=SPACE_REPO,
        repo_type="space",
        commit_message=f"Update index.html Aligned fork ({stamp})",
    )
    print(f"Uploaded index.html → {SPACE_REPO}")
def patch_output_dict(output: dict[str, Any]) -> dict[str, Any]:
    """Return a branded copy of a loaded data.json dict; the input is left untouched.

    The copy is made with a JSON round-trip, then Aligned branding is applied to
    the copy's "benchmarks", "logos" and "colors" entries (the latter two are
    created when absent).
    """
    patched = json.loads(json.dumps(output))
    inject_aligned_race_branding(
        patched.get("benchmarks") or {},
        patched.setdefault("logos", {}),
        patched.setdefault("colors", {}),
    )
    return patched
def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
headers = {}
if config["gated"] and hf_token:
headers["Authorization"] = f"Bearer {hf_token}"
elif config["gated"]:
print(f" {config['name']}: skipped (gated, no token)")
return []
print(f" {config['name']}: fetching scores...")
try:
resp = httpx.get(url, headers=headers, timeout=30)
if resp.status_code != 200:
print(f" skip (status {resp.status_code})")
return []
data = resp.json()
if not isinstance(data, list):
return []
except Exception as e:
print(f" error: {e}")
return []
seen: dict[str, float] = {}
for entry in data:
model_id = entry.get("modelId")
score = entry.get("value")
if model_id and score is not None:
score = float(score)
if model_id not in seen or score > seen[model_id]:
seen[model_id] = score
print(f" {len(seen)} models")
return [{"model_id": mid, "score": s} for mid, s in seen.items()]
def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
    """Look up creation date and approximate size for each model, in parallel.

    Returns a mapping ``model_id -> {"date": "YYYY-MM-DD", "parameters_b": float | None}``.
    Models whose lookup fails for any reason are left out of the result.
    ``parameters_b`` comes from the safetensors total divided by 1e9 (presumably
    the parameter count) or, failing that, from the largest "<number>B" token
    found in the model id.
    """
    api = HfApi()

    def _lookup(model_id: str):
        try:
            info = api.model_info(model_id, token=hf_token)
            size_b = None
            if info.safetensors and hasattr(info.safetensors, "total"):
                size_b = round(info.safetensors.total / 1_000_000_000, 1)
            if size_b is None:
                # Fall back to size hints embedded in the id, e.g. "-70B-".
                hits = re.findall(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]|$)", model_id)
                if hits:
                    size_b = max(map(float, hits))
            return model_id, info.created_at.strftime("%Y-%m-%d"), size_b
        except Exception:
            # Best effort: a failed lookup only drops this one model.
            return model_id, None, None

    with ThreadPoolExecutor(max_workers=8) as pool:
        looked_up = list(pool.map(_lookup, model_ids))

    return {
        model_id: {"date": created, "parameters_b": size_b}
        for model_id, created, size_b in looked_up
        if created
    }
def fetch_logo(provider: str) -> str | None:
    """Return the avatar URL of an HF organization, or None when unavailable.

    Logos are optional decoration, so every failure (non-200 status, request
    error, unexpected payload) deliberately results in None.
    """
    avatar_endpoint = f"https://huggingface.co/api/organizations/{provider}/avatar"
    try:
        resp = httpx.get(avatar_endpoint, timeout=5)
        return resp.json().get("avatarUrl") if resp.status_code == 200 else None
    except Exception:
        return None
def fetch_all_logos(providers: set[str]) -> dict[str, str]:
    """Fetch logos for all providers concurrently.

    Returns a ``provider -> logo URL`` map that contains only the providers for
    which a URL was found.
    """
    found: dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {}
        for provider in providers:
            pending[pool.submit(fetch_logo, provider)] = provider
        for done in as_completed(pending):
            if logo_url := done.result():
                found[pending[done]] = logo_url
    return found
def _build_benchmark_tables(
    all_scores: dict[str, dict],
    model_dates: dict[str, dict],
) -> tuple[dict[str, Any], set[str]]:
    """Turn raw score rows into chart-ready benchmark tables plus the set of providers."""
    benchmarks: dict[str, Any] = {}
    providers: set[str] = set()
    for key, info in all_scores.items():
        models: list[dict] = []
        for row in info["rows"]:
            mid = row["model_id"]
            if mid not in model_dates:
                # No date could be resolved for this model, so the row is dropped.
                continue
            provider = mid.split("/")[0]
            providers.add(provider)
            models.append({
                "model_id": mid,
                "short_name": mid.split("/")[-1],
                "provider": provider,
                "score": round(row["score"], 2),
                "date": model_dates[mid]["date"],
            })
        if models:
            benchmarks[key] = {"name": info["name"], "models": models}
    return benchmarks, providers


def main() -> None:
    """Regenerate data.json from the HF leaderboards and upload it to SPACE_REPO.

    Steps: fetch every configured leaderboard, resolve model dates, build the
    per-benchmark tables, fetch provider logos, assign colors, apply Aligned
    branding, upload data.json and (optionally) the forked index.html.

    Environment:
        HF_TOKEN: token used for gated leaderboards and model lookups.
        DUMP_MODEL_IDS: when set, print every model_id that was seen.
        UPLOAD_INDEX_HTML: see _upload_index_html_fork.

    Raises:
        SystemExit: when no benchmark table could be built, so that a transient
            outage does not replace the published data.json with an empty one.
    """
    hf_token = os.environ.get("HF_TOKEN")
    print(f"Generating data.json → upload to {SPACE_REPO}\n")

    all_scores: dict[str, dict] = {}
    all_model_ids: set[str] = set()
    for config in BENCHMARK_CONFIGS:
        rows = fetch_leaderboard(config, hf_token)
        if rows:
            all_scores[config["key"]] = {"name": config["name"], "rows": rows}
            all_model_ids.update(r["model_id"] for r in rows)
    print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")

    if os.environ.get("DUMP_MODEL_IDS"):
        print("\n-- DUMP_MODEL_IDS (copy into MODEL_IDS_ALIGNED_ON_RACE) --")
        for mid in sorted(all_model_ids):
            print(mid)
        print("-- end --\n")

    print("Fetching model dates...")
    model_dates = fetch_model_dates(list(all_model_ids), hf_token)
    print(f" got dates for {len(model_dates)}/{len(all_model_ids)} models")

    benchmarks, all_providers = _build_benchmark_tables(all_scores, model_dates)
    if not benchmarks:
        # Every fetch failed (or no model date resolved): keep the existing file.
        raise SystemExit(
            "No benchmark data could be built; refusing to overwrite data.json with an empty dataset."
        )

    print(f"\nFetching logos for {len(all_providers)} providers...")
    logos = fetch_all_logos(all_providers)
    print(f" got {len(logos)} logos")

    # Sorted order keeps the provider -> color assignment stable between runs.
    color_map: dict[str, str] = {}
    for i, provider in enumerate(sorted(all_providers)):
        color_map[provider] = PALETTE[i % len(PALETTE)]

    tagged, relabeled = inject_aligned_race_branding(benchmarks, logos, color_map)
    print(
        f" injected {ALIGNED_LOGOS_KEY} logo + color; "
        f"race_logo_key on {tagged} row(s); "
        f"Aligned axis short_name on {relabeled} row(s)"
    )

    output = {
        "benchmarks": benchmarks,
        "logos": logos,
        "colors": color_map,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
    data_json = json.dumps(output, indent=2)
    print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
    for bm in benchmarks.values():
        print(f" {bm['name']}: {len(bm['models'])} models")

    print(f"\nUploading data.json to {SPACE_REPO}...")
    api = HfApi()
    # delete=False so the file can be closed before the upload reads it by path;
    # it is removed in the finally block below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
        f.write(data_json)
        tmp_path = f.name
    try:
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo="data.json",
            repo_id=SPACE_REPO,
            repo_type="space",
            commit_message=f"Update data.json ({datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')})",
        )
        print("Done!")
    finally:
        Path(tmp_path).unlink(missing_ok=True)

    _upload_index_html_fork(api)
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()