Spaces:

cstr
/

LLMProviders

Running

File size: 37,822 Bytes

'use strict';

/**
 * Fetch benchmark data from six sources and merge into data/benchmarks.json.
 *
 * Sources:
 *   1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
 *   2. open-llm-leaderboard/contents on Hugging Face (4500+ open models, standardised evals)
 *   3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
 *   4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
 *   5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
 *   6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks
 *
 * Unified field names (0-1 scale unless noted):
 *   mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
 *   hellaswag, ifeval, arc, drop, mbpp, mgsm, bbh  (from LLMStats)
 *   hf_math_lvl5, hf_musr, hf_avg, params_b        (HF-only)
 *   lb_name, lb_global, lb_reasoning, lb_coding,    (LiveBench, 0-1)
 *   lb_math, lb_language, lb_if, lb_data_analysis
 *   arena_elo, arena_rank, arena_votes               (Chatbot Arena; elo is raw ELO ~800-1500)
 *   aider_pass_rate                                  (Aider edit bench, 0-1)
 *   aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa,    (Artificial Analysis)
 *   aa_livecodebench, aa_tokens_per_s, aa_latency_s
 *
 * Where multiple sources have data for the same benchmark,
 * LLMStats takes priority (it stores self-reported model-card values).
 *
 * Usage:
 *   node scripts/fetch-benchmarks.js             # fetch all sources
 *   node scripts/fetch-benchmarks.js aa          # refresh Artificial Analysis only
 *   node scripts/fetch-benchmarks.js livebench   # refresh LiveBench only
 */

const fs   = require('fs');
const path = require('path');
const yaml = require('js-yaml');
const { getJson, getText } = require('./fetch-utils');
const { loadEnv } = require('./load-env');

loadEnv();

const OUT_FILE = path.join(__dirname, '..', 'data', 'benchmarks.json');

// ─── helpers ────────────────────────────────────────────────────────────────

const normName = (s) =>
  (s || '').toLowerCase().replace(/[-_.]/g, ' ').replace(/[^a-z0-9 ]/g, '').replace(/\s+/g, ' ').trim();

// ─── LLMStats ───────────────────────────────────────────────────────────────

const LLMSTATS_TREE = 'https://api.github.com/repos/AchilleasDrakou/LLMStats/git/trees/main?recursive=1';
const LLMSTATS_RAW  = 'https://raw.githubusercontent.com/AchilleasDrakou/LLMStats/main/';

const LLMSTATS_MAP = {
  mmlu:       ['MMLU', 'MMLU Chat', 'MMLU-Base', 'MMLU (CoT)', 'Multilingual MMLU'],
  mmlu_pro:   ['MMLU-Pro', 'MMLU-STEM', 'Multilingual MMLU-Pro'],
  gpqa:       ['GPQA'],
  human_eval: ['HumanEval', 'Humaneval', 'HumanEval+', 'HumanEval-Average', 'Instruct HumanEval', 'MBPP EvalPlus', 'EvalPlus', 'Evalplus'],
  math:       ['MATH', 'Math', 'MATH (CoT)', 'MATH-500', 'Functional_MATH', 'FunctionalMATH'],
  gsm8k:      ['GSM8K', 'GSM-8K', 'GSM8k', 'GSM8K Chat', 'GSM-8K (CoT)'],
  mmmu:       ['MMMU', 'MMMUval', 'MMMU-Pro'],
  hellaswag:  ['HellaSwag', 'HellaSWAG', 'Hellaswag'],
  ifeval:     ['IFEval', 'IF-Eval'],
  arc:        ['ARC Challenge', 'ARC-C', 'ARC-c', 'ARC-e', 'ARC-Challenge', 'AI2 Reasoning Challenge (ARC)'],
  drop:       ['DROP'],
  mbpp:       ['MBPP', 'MBPP+', 'MBPP++', 'MBPP pass@1', 'MBPP EvalPlus (base)'],
  mgsm:       ['MGSM', 'Multilingual MGSM', 'Multilingual MGSM (CoT)'],
  bbh:        ['BBH', 'BigBench Hard CoT', 'BIG-Bench-Hard', 'BigBench-Hard', 'BIG-Bench Hard', 'BigBench_Hard'],
};

function extractLLMStatsMetrics(qualitative_metrics) {
  const scores = {};
  for (const m of qualitative_metrics || []) {
    for (const [key, names] of Object.entries(LLMSTATS_MAP)) {
      if (names.some((n) => m.dataset_name === n) && scores[key] === undefined) {
        scores[key] = m.score;
      }
    }
  }
  return scores;
}

async function fetchLLMStats() {
  process.stdout.write('LLMStats: fetching file list... ');
  const tree = await getJson(LLMSTATS_TREE);
  const files = tree.tree.filter(
    (f) => f.type === 'blob' && f.path.startsWith('models/') && f.path.endsWith('/model.json')
  );
  console.log(`${files.length} models`);

  const results = [];
  const BATCH = 10;
  for (let i = 0; i < files.length; i += BATCH) {
    const batch = files.slice(i, i + BATCH);
    const rows = await Promise.all(batch.map(async (f) => {
      try {
        const data = await getJson(LLMSTATS_RAW + f.path);
        const slug = f.path.replace(/^models\//, '').replace(/\/model\.json$/, '');
        const metrics = extractLLMStatsMetrics(data.qualitative_metrics);
        const entry = { slug, name: data.name, ...metrics, sources: {} };
        Object.keys(metrics).forEach(k => entry.sources[k] = 'llmstats');
        return entry;
      } catch (e) {
        console.warn(`\n  ⚠ LLMStats ${f.path}: ${e.message}`);
        return null;
      }
    }));
    rows.forEach((r) => { if (r) results.push(r); });
    process.stdout.write(`  LLMStats: ${Math.min(i + BATCH, files.length)}/${files.length}\r`);
  }
  console.log(`  LLMStats: ${results.length} entries fetched            `);
  return results;
}

// ─── HF Leaderboard ─────────────────────────────────────────────────────────

const HF_ROWS_URL = 'https://datasets-server.huggingface.co/rows' +
  '?dataset=open-llm-leaderboard%2Fcontents&config=default&split=train';

async function fetchHFPage(offset, limit = 100) {
  const data = await getJson(`${HF_ROWS_URL}&offset=${offset}&limit=${limit}`);
  return { rows: data.rows.map((r) => r.row), total: data.num_rows_total };
}

async function fetchHFLeaderboard() {
  process.stdout.write('HF Leaderboard: probing total... ');
  const first = await fetchHFPage(0, 1);
  const total = first.total;
  console.log(`${total} rows`);

  const LIMIT = 100;
  const pages = Math.ceil(total / LIMIT);
  const allRows = [...first.rows];

  // Fetch remaining pages in batches of 5 concurrent requests
  const CONCURRENT = 5;
  for (let p = 1; p < pages; p += CONCURRENT) {
    const batch = [];
    for (let q = p; q < Math.min(p + CONCURRENT, pages); q++) {
      batch.push(fetchHFPage(q * LIMIT, LIMIT));
    }
    const results = await Promise.all(batch);
    results.forEach((r) => allRows.push(...r.rows));
    const done = Math.min((p + CONCURRENT) * LIMIT, total);
    process.stdout.write(`  HF: ${done}/${total}\r`);
  }
  console.log(`  HF: ${total}/${total} — filtering...            `);

  // The Average column name has a Unicode emoji
  const AVG_KEY = Object.keys(allRows[0]).find((k) => k.startsWith('Average'));

  const entries = allRows
    .filter((r) => r['Available on the hub'] && !r.Flagged)
    .map((r) => {
      const entry = {
        hf_id: r.fullname,
        name: r.fullname.split('/').pop(),
        sources: {},
      };
      if (r['#Params (B)'])     { entry.params_b      = r['#Params (B)']; entry.sources.params_b = 'hf'; }
      if (r['IFEval Raw'])      { entry.ifeval        = r['IFEval Raw']; entry.sources.ifeval = 'hf'; }
      if (r['BBH Raw'])         { entry.bbh           = r['BBH Raw']; entry.sources.bbh = 'hf'; }
      if (r['GPQA Raw'])        { entry.gpqa          = r['GPQA Raw']; entry.sources.gpqa = 'hf'; }
      if (r['MMLU-PRO Raw'])    { entry.mmlu_pro      = r['MMLU-PRO Raw']; entry.sources.mmlu_pro = 'hf'; }
      if (r['MATH Lvl 5 Raw'])  { entry.hf_math_lvl5  = r['MATH Lvl 5 Raw']; entry.sources.hf_math_lvl5 = 'hf'; }
      if (r['MUSR Raw'])        { entry.hf_musr       = r['MUSR Raw']; entry.sources.hf_musr = 'hf'; }
      if (AVG_KEY && r[AVG_KEY]) { entry.hf_avg       = r[AVG_KEY]; entry.sources.hf_avg = 'hf'; }
      return entry;
    });

  console.log(`  HF: ${entries.length} entries after filtering`);
  return entries;
}

// ─── LiveBench ───────────────────────────────────────────────────────────────

const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
const LB_BASE_URL    = 'https://livebench.ai';

const LB_SUFFIX_RE = new RegExp(
  '(-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?|' +
  '-thinking(?:-(?:64k|32k|auto|minimal))?|' +
  '-(?:high|medium|low)-effort|' +
  '-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking)' +
  '(?:-(?:high|medium|low)-effort)?$'
);

function lbBaseName(name) {
  let prev;
  let cur = name;
  do { prev = cur; cur = cur.replace(LB_SUFFIX_RE, ''); } while (cur !== prev);
  return cur;
}

function parseLiveBenchCsv(csvText, taskToGroup) {
  const avg = (arr) => arr.reduce((a, b) => a + b, 0) / arr.length;
  const lines = csvText.split('\n').filter(Boolean);
  const headers = lines[0].split(',');
  const entries = [];
  for (const line of lines.slice(1)) {
    const vals = line.split(',');
    const modelName = vals[0];
    if (!modelName) continue;
    const taskScores = {};
    for (let i = 1; i < headers.length; i++) {
      const v = parseFloat(vals[i]);
      if (!isNaN(v)) taskScores[headers[i]] = v / 100;
    }
    const groupBuckets = {};
    for (const [task, group] of Object.entries(taskToGroup)) {
      if (taskScores[task] !== undefined) {
        groupBuckets[group] = groupBuckets[group] || [];
        groupBuckets[group].push(taskScores[task]);
      }
    }
    const allScores = Object.values(taskScores);
    const entry = {
      lb_name:          modelName,
      lb_global:        allScores.length ? avg(allScores) : undefined,
      lb_reasoning:     groupBuckets.lb_reasoning    ? avg(groupBuckets.lb_reasoning)    : undefined,
      lb_coding:        groupBuckets.lb_coding        ? avg(groupBuckets.lb_coding)        : undefined,
      lb_math:          groupBuckets.lb_math          ? avg(groupBuckets.lb_math)          : undefined,
      lb_language:      groupBuckets.lb_language      ? avg(groupBuckets.lb_language)      : undefined,
      lb_if:            groupBuckets.lb_if            ? avg(groupBuckets.lb_if)            : undefined,
      lb_data_analysis: groupBuckets.lb_data_analysis ? avg(groupBuckets.lb_data_analysis) : undefined,
      sources: {},
    };
    Object.keys(entry).forEach(k => {
      if (k.startsWith('lb_') && entry[k] !== undefined) entry.sources[k] = 'livebench';
    });
    entries.push(entry);
  }
  return entries;
}

async function fetchLiveBench() {
  process.stdout.write('LiveBench: finding all releases... ');
  const tree = await getJson(LB_GITHUB_TREE);
  const dates = tree.tree
    .filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
    .map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
    .sort();
  console.log(`${dates.length} releases (${dates[0]} → ${dates[dates.length - 1]})`);

  const cats = await getJson(`${LB_BASE_URL}/categories_${dates[dates.length - 1]}.json`);
  const taskToGroup = {};
  for (const [cat, tasks] of Object.entries(cats)) {
    const group =
      cat === 'Coding' || cat === 'Agentic Coding' ? 'lb_coding' :
      cat === 'Reasoning'     ? 'lb_reasoning' :
      cat === 'Mathematics'   ? 'lb_math' :
      cat === 'Language'      ? 'lb_language' :
      cat === 'IF'            ? 'lb_if' :
      cat === 'Data Analysis' ? 'lb_data_analysis' : null;
    if (group) for (const t of tasks) taskToGroup[t] = group;
  }

  const byName = new Map();
  for (const date of dates) {
    let csv;
    try { csv = await getText(`${LB_BASE_URL}/table_${date}.csv`); } 
    catch (e) { console.warn(`\n  ⚠ LiveBench ${date}: ${e.message}`); continue; }
    for (const entry of parseLiveBenchCsv(csv, taskToGroup)) byName.set(entry.lb_name, entry);
    process.stdout.write(`  LiveBench: ${date}\r`);
  }
  const entries = [...byName.values()];
  console.log(`  LiveBench: ${entries.length} unique models across all releases`);
  return entries;
}

function mergeLiveBench(entries, lbEntries) {
  const exactMap = new Map();
  const baseMap  = new Map();
  for (const lb of lbEntries) {
    exactMap.set(normName(lb.lb_name), lb);
    const base = normName(lbBaseName(lb.lb_name));
    if (base !== normName(lb.lb_name)) {
      const prev = baseMap.get(base);
      if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
    }
  }
  const usedLbNames = new Set();
  let matched = 0;
  for (const e of entries) {
    const candidates = [normName(e.name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || '')].filter(Boolean);
    let lb = null;
    for (const c of candidates) { lb = exactMap.get(c) || baseMap.get(c); if (lb) break; }
    if (lb) { 
      Object.assign(e, lb); 
      e.sources = { ...(e.sources || {}), ...(lb.sources || {}) };
      usedLbNames.add(lb.lb_name); 
      matched++; 
    }
  }
  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
  const newEntries = [];
  for (const lb of lbEntries) {
    if (usedLbNames.has(lb.lb_name)) continue;
    const base = normName(lbBaseName(lb.lb_name));
    if (usedBases.has(base)) continue;
    if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
      newEntries.push({ name: lbBaseName(lb.lb_name), ...lb });
      usedBases.add(base);
    }
  }
  console.log(`  LiveBench: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── Chatbot Arena ───────────────────────────────────────────────────────────

async function fetchChatbotArena() {
  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
    headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
  });
  let entries = null;
  for (const line of text.split('\n')) {
    if (!line.includes('"entries":[') || !line.includes('"rating":')) continue;
    const start = line.indexOf('"entries":[') + '"entries":'.length;
    let depth = 0, end = -1;
    for (let i = start; i < line.length; i++) {
      if (line[i] === '[' || line[i] === '{') depth++;
      else if (line[i] === ']' || line[i] === '}') { depth--; if (depth === 0) { end = i + 1; break; } }
    }
    entries = JSON.parse(line.substring(start, end));
    break;
  }
  if (!entries) throw new Error('Could not find entries in RSC payload');
  console.log(`${entries.length} models`);
  return entries.map((e) => {
    const entry = {
      arena_name:  e.modelDisplayName,
      arena_org:   e.modelOrganization,
      arena_elo:   e.rating,
      arena_rank:  e.rank,
      arena_votes: e.votes,
      sources: {},
    };
    Object.keys(entry).forEach(k => {
      if (k.startsWith('arena_') && entry[k] !== undefined) entry.sources[k] = 'arena';
    });
    return entry;
  });
}

function mergeArena(entries, arenaEntries) {
  const arenaMap = new Map();
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);
  let matched = 0;
  for (const e of entries) {
    const candidates = [normName(e.name || ''), normName(e.lb_name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || '')];
    const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
    if (a) {
      e.arena_elo = a.arena_elo; e.arena_rank = a.arena_rank; e.arena_votes = a.arena_votes;
      e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
      arenaMap.delete(normName(a.arena_name)); matched++;
    }
  }
  const newEntries = [];
  for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });
  console.log(`  Arena: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── Aider ───────────────────────────────────────────────────────────────────

const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/website/_data/edit_leaderboard.yml';

async function fetchAider() {
  process.stdout.write('Aider: fetching edit leaderboard... ');
  const text = await getText(AIDER_RAW);
  const rows = yaml.load(text);
  const best = new Map();
  for (const row of rows) {
    if (!row.model || row.pass_rate_1 === undefined) continue;
    const key = normName(row.model);
    const existing = best.get(key);
    if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
  }
  const entries = [];
  for (const row of best.values()) {
    const entry = { aider_model: row.model, aider_pass_rate: row.pass_rate_1 / 100, sources: {} };
    entry.sources.aider_pass_rate = 'aider';
    entries.push(entry);
  }
  console.log(`${entries.length} models (best run each)`);
  return entries;
}

function mergeAider(entries, aiderEntries) {
  const aiderMap = new Map();
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);
  let matched = 0;
  for (const e of entries) {
    const candidates = [normName(e.name || ''), normName(e.lb_name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || ''), normName(e.arena_name || '')];
    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
    if (a) { 
      e.aider_pass_rate = a.aider_pass_rate; 
      e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
      aiderMap.delete(normName(a.aider_model)); 
      matched++; 
    }
  }
  const newEntries = [];
  for (const a of aiderMap.values()) newEntries.push({ name: a.aider_model, ...a });
  console.log(`  Aider: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── Artificial Analysis ───────────────────────────────────────────────────

async function fetchArtificialAnalysis() {
  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
  if (!apiKey) {
    console.log('Artificial Analysis: skipping (no API key found)');
    return [];
  }

  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', {
    headers: { 'x-api-key': apiKey },
  });

  if (!res.data) throw new Error('Invalid response from Artificial Analysis API');
  console.log(`${res.data.length} models`);

  return res.data.map((m) => {
    const ev = m.evaluations || {};
    const entry = {
      aa_id: m.id,
      aa_name: m.name,
      aa_slug: m.slug,
      aa_intelligence: ev.artificial_analysis_intelligence_index, // 0-100
      aa_coding: ev.artificial_analysis_coding_index, // 0-100
      aa_math: ev.artificial_analysis_math_index, // 0-100
      aa_mmlu_pro: ev.mmlu_pro, // 0-1
      aa_gpqa: ev.gpqa, // 0-1
      aa_livecodebench: ev.livecodebench, // 0-1
      aa_hle: ev.hle,
      aa_scicode: ev.scicode,
      aa_math_500: ev.math_500,
      aa_aime: ev.aime,
      aa_tokens_per_s: m.median_output_tokens_per_second,
      aa_latency_s: m.median_time_to_first_token_seconds,
      sources: {},
    };
    Object.keys(entry).forEach(k => {
      if (k.startsWith('aa_') && entry[k] !== undefined) entry.sources[k] = 'aa';
    });
    return entry;
  });
}

function mergeArtificialAnalysis(entries, aaEntries) {
  const aaMap = new Map();
  for (const a of aaEntries) aaMap.set(normName(a.aa_name), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);

    const aa = candidates.map((c) => aaMap.get(c)).find(Boolean);
    if (aa) {
      Object.assign(e, aa);
      e.sources = { ...(e.sources || {}), ...(aa.sources || {}) };
      aaMap.delete(normName(aa.aa_name));
      matched++;
    }
  }

  const newEntries = [];
  for (const a of aaMap.values()) {
    newEntries.push({ name: a.aa_name, ...a });
  }

  console.log(`  AA: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── MTEB ──────────────────────────────────────────────────────────────────

const MTEB_PATHS_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/paths.json';
const MTEB_RAW_BASE_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/';

async function fetchMTEB() {
  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
  if (!fs.existsSync(providersPath)) return [];
  
  process.stdout.write('MTEB: fetching results index... ');
  const paths = await getJson(MTEB_PATHS_URL);
  const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
  const hfIds = new Set();
  providers.forEach(p => p.models.forEach(m => { if (m.type === 'embedding' && m.hf_id) hfIds.add(m.hf_id); }));
  console.log(`${hfIds.size} embedders`);

  const results = [];
  for (const hfId of hfIds) {
    const key = hfId.replace(/\//g, '__');
    // Try original key, then find matching key in paths (case-insensitive)
    let resultPaths = paths[key];
    if (!resultPaths) {
      const match = Object.keys(paths).find(k => k.toLowerCase() === key.toLowerCase());
      if (match) resultPaths = paths[match];
    }
    if (!resultPaths) continue;

    const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
    // Aggregation: we'll take all unique tasks across all revisions, 
    // prioritizing the latest revision for each task.
    const taskPaths = new Map();
    revisions.forEach(rev => {
      const pathsInRev = resultPaths.filter(p => p.includes(`/${rev}/`));
      pathsInRev.forEach(p => {
        const taskName = p.split('/').pop().replace('.json', '');
        taskPaths.set(taskName, p);
      });
    });
    
    const latestPaths = [...taskPaths.values()];
    const fetchPaths = latestPaths.slice(0, 50); // Limit to 50 tasks to prevent hangs
    process.stdout.write(`  MTEB: ${hfId} (fetching ${fetchPaths.length}/${latestPaths.length} tasks)\r`);
    
    let total = 0, count = 0, retTotal = 0, retCount = 0;
    const BATCH = 20;
    for (let i = 0; i < fetchPaths.length; i += BATCH) {
      const batch = await Promise.all(fetchPaths.slice(i, i + BATCH).map(p => getJson(MTEB_RAW_BASE_URL + p).catch(() => null)));
      batch.forEach(res => {
        if (!res) return;
        const scores = res.scores || res;
        const data = scores.test || scores.dev || scores.validation;
        if (!data) return;
        const arr = Array.isArray(data) ? data : [data];
        
        // Find English or default subset
        let targetRes = arr.find(r => r.languages && r.languages.some(l => l.startsWith('eng') || l === 'en'));
        if (!targetRes && arr.length === 1) targetRes = arr[0];
        if (!targetRes) targetRes = arr.find(r => r.hf_subset === 'default');
        if (!targetRes && arr.length > 0) targetRes = arr[0];

        if (targetRes) {
          const s = targetRes.main_score || targetRes.ndcg_at_10 || targetRes.accuracy;
          if (typeof s === 'number' && s > 0) {
            let norm = s <= 1.0 ? s * 100 : s;
            if (norm > 100) norm = 100; // Cap at 100
            total += norm; count++;
            const task = res.mteb_dataset_name || res.task_name || '';
            if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
          }
        }
      });
    }
    if (count > 0) {
      results.push({
        hf_id: hfId,
        name: hfId.split('/').pop(),
        mteb_avg: Math.round(total / count * 100) / 100,
        mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
        sources: { mteb_avg: 'mteb', mteb_retrieval: retCount > 0 ? 'mteb' : undefined }
      });
    }
  }
  console.log(`\n  MTEB: ${results.length} models enriched            `);
  return results;
}

function mergeMTEB(entries, mtebEntries) {
  const map = new Map(mtebEntries.map(m => [m.hf_id.toLowerCase(), m]));
  
  // Manual overrides for famous models not yet in the results repo or needing fixed values
  const overrides = [
    { hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', mteb_avg: 51.98, mteb_retrieval: 39.76, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { name: 'Mistral Embed', mteb_avg: 55.26, sources: { mteb_avg: 'manual' } },
    { name: 'Codestral Embed', mteb_avg: 84.7, mteb_retrieval: 81.0, lb_coding: 0.81, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual', lb_coding: 'manual' } },
  ];
  overrides.forEach(o => {
    const key = (o.hf_id || o.name).toLowerCase();
    map.set(key, o); // Force override
  });

  let matched = 0;
  for (const e of entries) {
    const m = (e.hf_id ? map.get(e.hf_id.toLowerCase()) : null) || (e.name ? map.get(e.name.toLowerCase()) : null);
    if (m) {
      if (m.mteb_avg) e.mteb_avg = m.mteb_avg;
      if (m.mteb_retrieval) e.mteb_retrieval = m.mteb_retrieval;
      if (m.lb_coding) e.lb_coding = m.lb_coding;
      e.sources = { ...(e.sources || {}), ...m.sources };
      const key = (m.hf_id || m.name).toLowerCase();
      map.delete(key); matched++;
    }
  }
  const newEntries = [...map.values()];
  console.log(`  MTEB: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── OCR Benchmarks ────────────────────────────────────────────────────────

function mergeOCR(entries) {
  const ocrData = [
    { name: 'datalab-to/chandra-ocr-2', score: 85.9 },
    { name: 'rednote-hilab/dots.mocr', score: 83.9 },
    { name: 'lightonai/LightOnOCR-2-1B', score: 83.2 },
    { name: 'datalab-to/chandra', score: 83.1 },
    { name: 'infly/Infinity-Parser-7B', score: 82.5 },
    { name: 'allenai/olmOCR-2-7B-1025-FP8', score: 82.4 },
    { name: 'PaddlePaddle/PaddleOCR-VL', score: 80.0 },
    { name: 'baidu/Qianfan-OCR', score: 79.8 },
    { name: 'rednote-hilab/dots.ocr', score: 79.1 },
    { name: 'deepseek-ai/DeepSeek-OCR-2', score: 76.3 },
    { name: 'lightonai/LightOnOCR-1B-1025', score: 76.1 },
    { name: 'deepseek-ai/DeepSeek-OCR', score: 75.7 },
    { name: 'opendatalab/MinerU2.5-2509-1.2B', score: 75.2 },
    { name: 'zai-org/GLM-OCR', score: 75.2 },
    { name: 'FireRedTeam/FireRed-OCR', score: 70.2 },
    { name: 'nanonets/Nanonets-OCR2-3B', score: 69.5 },
  ];

  const ocrMap = new Map();
  ocrData.forEach(d => {
    ocrMap.set(normName(d.name), d);
    const modelPart = d.name.split('/').pop();
    if (modelPart) ocrMap.set(normName(modelPart), d);
  });

  let matched = 0;
  const usedOcr = new Set();
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.hf_id || '')
    ].filter(Boolean);

    const ocr = candidates.map(c => ocrMap.get(c)).find(Boolean);
    if (ocr) {
      e.ocr_avg = ocr.score;
      e.sources = { ...(e.sources || {}), ocr_avg: 'manual' };
      matched++;
      usedOcr.add(ocr.name);
    }
  }

  const newEntries = [];
  ocrData.forEach(d => {
    if (!usedOcr.has(d.name)) {
      newEntries.push({
        hf_id: d.name,
        name: d.name.split('/').pop(),
        ocr_avg: d.score,
        sources: { ocr_avg: 'manual' }
      });
    }
  });
  
  console.log(`  OCR: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}

// ─── Merge ───────────────────────────────────────────────────────────────────

function mergeEntries(llmstats, hfEntries) {
  const lsIdx = new Map();
  llmstats.forEach((e, i) => {
    lsIdx.set(normName(e.name), i);
    const slugModel = e.slug?.split('/').pop() || '';
    if (slugModel) lsIdx.set(normName(slugModel), i);
  });
  const merged = llmstats.map((e) => ({ ...e, sources: { ...(e.sources || {}) } }));
  const hfOnly = [];
  for (const hf of hfEntries) {
    const modelPart = normName(hf.name);
    const modelWords = modelPart.split(' ');
    const modelNoPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
    const idx = lsIdx.get(modelPart) ?? lsIdx.get(modelNoPrefix);
    if (idx !== undefined) {
      const target = merged[idx];
      if (!target.hf_id) target.hf_id = hf.hf_id;
      if (!target.params_b) target.params_b = hf.params_b;
      if (!target.ifeval) target.ifeval = hf.ifeval;
      if (!target.bbh) target.bbh = hf.bbh;
      if (!target.gpqa) target.gpqa = hf.gpqa;
      if (!target.mmlu_pro) target.mmlu_pro = hf.mmlu_pro;
      target.hf_math_lvl5 = hf.hf_math_lvl5;
      target.hf_musr = hf.hf_musr;
      target.hf_avg = hf.hf_avg;
      target.sources = { ...(target.sources || {}), ...(hf.sources || {}) };
    } else hfOnly.push(hf);
  }
  return [...merged, ...hfOnly];
}

// ─── Refresh ─────────────────────────────────────────────────────────────────

const SOURCE_FIELDS = {
  llmstats:  ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
  hf:        ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
  livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
  arena:     ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
  aider:     ['aider_model', 'aider_pass_rate'],
  aa:        ['aa_id', 'aa_intelligence', 'aa_coding', 'aa_math', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_hle', 'aa_scicode', 'aa_math_500', 'aa_aime', 'aa_tokens_per_s', 'aa_latency_s'],
  mteb:      ['mteb_avg', 'mteb_retrieval'],
  ocr:       ['ocr_avg'],
};

const SOURCE_ID_FIELD = {
  llmstats: 'slug', hf: 'hf_id', livebench: 'lb_name', arena: 'arena_elo', aider: 'aider_pass_rate', aa: 'aa_intelligence', mteb: 'mteb_avg', ocr: 'ocr_avg',
};

async function refreshSource(source) {
  if (!SOURCE_FIELDS[source]) {
    console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
    process.exit(1);
  }
  console.log(`Refreshing benchmark source: ${source}\n`);
  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter(f => f !== SOURCE_ID_FIELD[source]);
  const stripped = existing.filter(e => otherIdFields.some(f => e[f] !== undefined)).map(e => {
    const s = { ...e }; for (const f of SOURCE_FIELDS[source]) delete s[f]; return s;
  });
  let result;
  if (source === 'llmstats') result = mergeLLMStatsInto(stripped, await fetchLLMStats());
  else if (source === 'hf') result = mergeHFInto(stripped, await fetchHFLeaderboard());
  else if (source === 'livebench') result = mergeLiveBench(stripped, await fetchLiveBench());
  else if (source === 'arena') result = mergeArena(stripped, await fetchChatbotArena());
  else if (source === 'aider') result = mergeAider(stripped, await fetchAider());
  else if (source === 'aa') result = mergeArtificialAnalysis(stripped, await fetchArtificialAnalysis());
  else if (source === 'mteb') result = mergeMTEB(stripped, await fetchMTEB());
  else if (source === 'ocr') result = mergeOCR(stripped);
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
}

// ─── HF README Evaluation ──────────────────────────────────────────────────

async function fetchHFReadmeBenchmarks() {
  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
  if (!fs.existsSync(providersPath)) return [];
  
  const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
  const hfIds = new Set();
  providers.forEach(p => p.models.forEach(m => { if (m.hf_id) hfIds.add(m.hf_id); }));
  
  process.stdout.write(`HF README: checking ${hfIds.size} models... `);
  const results = [];
  
  const BATCH = 10;
  const ids = Array.from(hfIds);
  for (let i = 0; i < ids.length; i += BATCH) {
    const batch = ids.slice(i, i + BATCH);
    const rows = await Promise.all(batch.map(async (hfId) => {
      try {
        const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
        const text = await getText(url, { retries: 1 });
        if (!text.startsWith('---')) return null;
        
        const endYaml = text.indexOf('---', 3);
        if (endYaml === -1) return null;
        
        const yamlText = text.substring(3, endYaml);
        const meta = yaml.load(yamlText);
        if (!meta || !meta['model-index']) return null;
        
        let total = 0, count = 0, retTotal = 0, retCount = 0;
        const modelIndex = Array.isArray(meta['model-index']) ? meta['model-index'] : [meta['model-index']];
        modelIndex.forEach(mi => {
          (mi.results || []).forEach(res => {
            const isMTEB = (res.dataset?.type || '').toLowerCase().includes('mteb') || 
                          (res.dataset?.name || '').toLowerCase().includes('mteb') ||
                          (res.task?.type || '').toLowerCase().includes('retrieval');
            if (!isMTEB) return;
            
            const mainMetric = (res.metrics || []).find(m => m.type === 'main_score' || m.type === 'ndcg_at_10' || m.type === 'accuracy');
            if (mainMetric && typeof mainMetric.value === 'number') {
              const val = mainMetric.value;
              let norm = val <= 1.0 ? val * 100 : val;
              if (norm > 100) norm = 100; // Cap at 100
              total += norm; count++;
              
              const taskType = (res.task?.type || '').toLowerCase();
              if (taskType.includes('retrieval') || taskType.includes('search')) {
                retTotal += norm; retCount++;
              }
            }
          });
        });
        
        if (count > 0) {
          return {
            hf_id: hfId,
            name: hfId.split('/').pop(),
            mteb_avg: Math.round(total / count * 100) / 100,
            mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
            sources: { mteb_avg: 'hf-readme', mteb_retrieval: retCount > 0 ? 'hf-readme' : undefined }
          };
        }
      } catch (e) {
        return null;
      }
      return null;
    }));
    rows.forEach(r => { if (r) results.push(r); });
    process.stdout.write(`  HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
  }
  
  console.log(`\n  HF README: ${results.length} models enriched from metadata`);
  return results;
}

// ─── Main ────────────────────────────────────────────────────────────────────

async function main() {
  const source = process.argv[2]?.toLowerCase();
  if (source) { await refreshSource(source); return; }

  const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries, readmeEntries] = await Promise.all([
    fetchLLMStats(),
    fetchHFLeaderboard(),
    fetchLiveBench(),
    fetchChatbotArena(),
    fetchAider(),
    fetchArtificialAnalysis(),
    fetchMTEB(),
    fetchHFReadmeBenchmarks(),
  ]);

  const merged  = mergeEntries(llmstats, hfEntries);
  const withLB  = mergeLiveBench(merged, lbEntries);
  const withAr  = mergeArena(withLB, arenaEntries);
  const withAi  = mergeAider(withAr, aiderEntries);
  const withAA  = mergeArtificialAnalysis(withAi, aaEntries);
  const withMTEB = mergeMTEB(withAA, mtebEntries);
  const withReadme = mergeMTEB(withMTEB, readmeEntries);
  const all     = mergeOCR(withReadme);

  console.log(`\nTotal entries: ${all.length}`);
  console.log(`  With LiveBench: ${all.filter(e => e.lb_name).length} | Arena: ${all.filter(e => e.arena_elo).length} | Aider: ${all.filter(e => e.aider_pass_rate !== undefined).length} | AA: ${all.filter(e => e.aa_intelligence !== undefined).length} | MTEB: ${all.filter(e => e.mteb_avg !== undefined).length} | OCR: ${all.filter(e => e.ocr_avg !== undefined).length}`);

  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
  console.log(`Saved to data/benchmarks.json (${(fs.statSync(OUT_FILE).size / 1024).toFixed(0)} KB)`);
}

main().catch((err) => { console.error('Fatal:', err); process.exit(1); });