shashankc28 commited on
Commit
ebd69f4
·
verified ·
1 Parent(s): 4641828

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +912 -18
index.html CHANGED
@@ -1,19 +1,913 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>vLLM deployment advisor</title>
7
+ <link rel="preconnect" href="https://huggingface.co" />
8
+ <style>
9
+ :root {
10
+ --bg: #0f1419;
11
+ --surface: #1a2332;
12
+ --surface2: #243044;
13
+ --border: #334155;
14
+ --text: #e2e8f0;
15
+ --muted: #94a3b8;
16
+ --accent: #38bdf8;
17
+ --accent2: #a78bfa;
18
+ --good: #34d399;
19
+ --warn: #fbbf24;
20
+ }
21
+ * { box-sizing: border-box; }
22
+ body {
23
+ margin: 0;
24
+ font-family: "Segoe UI", system-ui, sans-serif;
25
+ background: var(--bg);
26
+ color: var(--text);
27
+ line-height: 1.5;
28
+ min-height: 100vh;
29
+ }
30
+ .wrap {
31
+ max-width: 1100px;
32
+ margin: 0 auto;
33
+ padding: 1.5rem 1.25rem 3rem;
34
+ }
35
+ h1 {
36
+ font-size: 1.35rem;
37
+ font-weight: 600;
38
+ margin: 0 0 0.25rem;
39
+ letter-spacing: -0.02em;
40
+ }
41
+ .sub {
42
+ color: var(--muted);
43
+ font-size: 0.9rem;
44
+ margin-bottom: 1.5rem;
45
+ }
46
+ label {
47
+ display: block;
48
+ font-size: 0.8rem;
49
+ color: var(--muted);
50
+ margin-bottom: 0.35rem;
51
+ }
52
+ input[type="text"], input[type="number"], select {
53
+ width: 100%;
54
+ padding: 0.6rem 0.75rem;
55
+ border: 1px solid var(--border);
56
+ border-radius: 8px;
57
+ background: var(--surface);
58
+ color: var(--text);
59
+ font-size: 0.95rem;
60
+ }
61
+ input:focus, select:focus {
62
+ outline: 2px solid var(--accent);
63
+ outline-offset: 1px;
64
+ }
65
+ .row {
66
+ display: grid;
67
+ gap: 1rem;
68
+ margin-bottom: 1rem;
69
+ }
70
+ @media (min-width: 640px) {
71
+ .row.cols-2 { grid-template-columns: 1fr 1fr; }
72
+ .row.cols-3 { grid-template-columns: repeat(3, 1fr); }
73
+ }
74
+ button.primary {
75
+ padding: 0.65rem 1.25rem;
76
+ background: linear-gradient(135deg, #0ea5e9, #6366f1);
77
+ color: #fff;
78
+ border: none;
79
+ border-radius: 8px;
80
+ font-weight: 600;
81
+ cursor: pointer;
82
+ font-size: 0.95rem;
83
+ }
84
+ button.primary:hover { filter: brightness(1.08); }
85
+ button.primary:disabled { opacity: 0.5; cursor: not-allowed; }
86
+ .card {
87
+ background: var(--surface);
88
+ border: 1px solid var(--border);
89
+ border-radius: 12px;
90
+ padding: 1.1rem 1.25rem;
91
+ margin-top: 1rem;
92
+ }
93
+ .card h2 {
94
+ font-size: 1rem;
95
+ margin: 0 0 0.75rem;
96
+ color: var(--accent);
97
+ }
98
+ .err { color: #f87171; font-size: 0.9rem; margin-top: 0.5rem; }
99
+ .ok { color: var(--good); font-size: 0.9rem; }
100
+ table {
101
+ width: 100%;
102
+ border-collapse: collapse;
103
+ font-size: 0.85rem;
104
+ }
105
+ th, td {
106
+ text-align: left;
107
+ padding: 0.45rem 0.5rem;
108
+ border-bottom: 1px solid var(--border);
109
+ }
110
+ th { color: var(--muted); font-weight: 500; }
111
+ .gpu-grid {
112
+ display: grid;
113
+ gap: 0.65rem;
114
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
115
+ }
116
+ .gpu-card {
117
+ background: var(--surface2);
118
+ border: 1px solid var(--border);
119
+ border-radius: 10px;
120
+ padding: 0.75rem 0.9rem;
121
+ cursor: pointer;
122
+ transition: border-color 0.15s, box-shadow 0.15s;
123
+ }
124
+ .gpu-card:hover, .gpu-card.selected {
125
+ border-color: var(--accent);
126
+ box-shadow: 0 0 0 1px var(--accent);
127
+ }
128
+ .gpu-card .name { font-weight: 600; font-size: 0.9rem; }
129
+ .gpu-card .vram { color: var(--muted); font-size: 0.8rem; }
130
+ .gpu-detail {
131
+ margin-top: 1rem;
132
+ padding: 1rem;
133
+ background: var(--bg);
134
+ border-radius: 8px;
135
+ border: 1px solid var(--border);
136
+ font-size: 0.88rem;
137
+ }
138
+ .gpu-detail dl {
139
+ display: grid;
140
+ grid-template-columns: auto 1fr;
141
+ gap: 0.35rem 1rem;
142
+ margin: 0;
143
+ }
144
+ .gpu-detail dt { color: var(--muted); }
145
+ .gpu-detail dd { margin: 0; }
146
+ pre.cmd {
147
+ background: #0c1220;
148
+ border: 1px solid var(--border);
149
+ border-radius: 8px;
150
+ padding: 1rem;
151
+ overflow-x: auto;
152
+ font-size: 0.78rem;
153
+ line-height: 1.45;
154
+ white-space: pre-wrap;
155
+ word-break: break-all;
156
+ }
157
+ .badge {
158
+ display: inline-block;
159
+ padding: 0.15rem 0.45rem;
160
+ border-radius: 4px;
161
+ font-size: 0.75rem;
162
+ background: var(--surface2);
163
+ color: var(--muted);
164
+ }
165
+ .hint { font-size: 0.8rem; color: var(--muted); margin-top: 0.75rem; }
166
+ .spinner { display: inline-block; width: 1rem; height: 1rem; border: 2px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; vertical-align: middle; margin-right: 0.35rem; }
167
+ @keyframes spin { to { transform: rotate(360deg); } }
168
+ .model-row {
169
+ display: grid;
170
+ grid-template-columns: 1fr auto;
171
+ gap: 0.5rem;
172
+ align-items: end;
173
+ margin-bottom: 0.65rem;
174
+ }
175
+ .model-row .model-id-input { margin: 0; }
176
+ button.btn-ghost {
177
+ padding: 0.55rem 0.85rem;
178
+ background: var(--surface2);
179
+ color: var(--text);
180
+ border: 1px solid var(--border);
181
+ border-radius: 8px;
182
+ cursor: pointer;
183
+ font-size: 0.85rem;
184
+ }
185
+ button.btn-ghost:hover { border-color: var(--accent); }
186
+ button.btn-ghost.danger:hover { border-color: #f87171; color: #f87171; }
187
+ .gpu-card.preferred { border-color: var(--accent2); box-shadow: 0 0 0 1px var(--accent2); }
188
+ .gpu-card.commands-target { outline: 1px dashed var(--good); outline-offset: 2px; }
189
+ details.model-block { margin-bottom: 1rem; border: 1px solid var(--border); border-radius: 8px; padding: 0.5rem 0.75rem; background: var(--bg); }
190
+ details.model-block summary { cursor: pointer; font-weight: 600; color: var(--accent); }
191
+ </style>
192
+ </head>
193
+ <body>
194
+ <div class="wrap">
195
+ <h1>vLLM deployment advisor</h1>
196
+ <p class="sub">Pulls weight sizes from Hugging Face, estimates KV memory, and suggests tensor parallelism and <code style="color:var(--accent2)">vllm serve</code> commands. Add several models to estimate total GPUs on your preferred GPU type (separate vLLM instances). Estimates are heuristic — validate on your hardware.</p>
197
+
198
+ <div class="card" style="margin-top:0">
199
+ <label>Hugging Face models (one per serving endpoint)</label>
200
+ <p class="hint" style="margin-top:0">Each model is a separate <code>vllm serve</code> process. Planning assumes tensor-parallel groups do not share GPUs with another model unless you colocate manually.</p>
201
+ <div id="modelListContainer"></div>
202
+ <button type="button" class="btn-ghost" id="btnAddModel" style="margin-bottom:1rem">+ Add model</button>
203
+ <div class="row cols-2">
204
+ <div>
205
+ <label for="hfToken">HF token (optional, for gated/private)</label>
206
+ <input type="text" id="hfToken" placeholder="hf_..." autocomplete="off" />
207
+ </div>
208
+ <div>
209
+ <label for="preferredGpu">Preferred GPU (for TP &amp; totals)</label>
210
+ <select id="preferredGpu"></select>
211
+ </div>
212
+ </div>
213
+ <div class="row cols-3">
214
+ <div>
215
+ <label for="weightDtype">Weight memory (dtype)</label>
216
+ <select id="weightDtype">
217
+ <option value="bf16" selected>BF16 / FP16 (2 bytes/param)</option>
218
+ <option value="fp8">FP8 weights (~1 byte/param, if supported)</option>
219
+ </select>
220
+ </div>
221
+ <div>
222
+ <label for="kvDtype">KV cache dtype</label>
223
+ <select id="kvDtype">
224
+ <option value="auto">auto</option>
225
+ <option value="fp8" selected>fp8 (half KV vs fp16)</option>
226
+ <option value="fp16">fp16</option>
227
+ </select>
228
+ </div>
229
+ <div>
230
+ <label for="maxModelLen">Max model length (tokens)</label>
231
+ <input type="number" id="maxModelLen" value="8192" min="256" step="256" />
232
+ </div>
233
+ </div>
234
+ <div class="row cols-3">
235
+ <div>
236
+ <label for="gpuUtil">Target GPU memory utilization</label>
237
+ <input type="number" id="gpuUtil" value="0.90" min="0.5" max="0.98" step="0.01" />
238
+ </div>
239
+ <div>
240
+ <label for="batchHint">Concurrent sequences per model (KV hint)</label>
241
+ <input type="number" id="batchHint" value="8" min="1" max="512" step="1" />
242
+ </div>
243
+ <div style="display:flex;align-items:flex-end">
244
+ <button type="button" class="primary" id="btnFetch" style="width:100%">Fetch all &amp; compute</button>
245
+ </div>
246
+ </div>
247
+ <div id="fetchError" class="err" hidden></div>
248
+ </div>
249
+
250
+ <div id="results" hidden>
251
+ <div class="card">
252
+ <h2>Multi-model deployment (preferred GPU)</h2>
253
+ <div id="multiDeployment"></div>
254
+ </div>
255
+
256
+ <div class="card">
257
+ <h2>Models &amp; shards (from Hub)</h2>
258
+ <div id="modelSummary"></div>
259
+ </div>
260
+
261
+ <div class="card">
262
+ <h2>Memory breakdown</h2>
263
+ <div id="memBreakdown"></div>
264
+ </div>
265
+
266
+ <div class="card">
267
+ <h2>GPU catalog</h2>
268
+ <p class="sub" style="margin:0 0 0.75rem">Click a GPU for full specs. Your <strong>preferred</strong> choice is highlighted for multi-model totals above.</p>
269
+ <div id="gpuGrid" class="gpu-grid"></div>
270
+ <div id="gpuDetailPanel" hidden></div>
271
+ </div>
272
+
273
+ <div class="card">
274
+ <h2>vLLM commands</h2>
275
+ <p id="commandGpuHint" class="hint" style="margin-top:0"></p>
276
+ <pre class="cmd" id="vllmCmd"></pre>
277
+ <p class="hint">Use a different <code>--port</code> per model when running on the same host. Adjust <code>--tensor-parallel-size</code> if your cluster differs. See <a href="https://docs.vllm.ai" style="color:var(--accent)" target="_blank" rel="noopener">vLLM docs</a>.</p>
278
+ </div>
279
+ </div>
280
+ </div>
281
+
282
+ <script>
283
// Base URL of the Hugging Face Hub REST API (model metadata + file trees).
const HF_API = "https://huggingface.co/api";
284
+
285
/**
 * Build the URL path for a Hub repo id (`org/name`).
 * Hugging Face repo ids contain a `/`; encoding the whole id would turn it
 * into `%2F` and break `/api/models/...` (400), so each non-empty path
 * segment is percent-encoded individually and rejoined with `/`.
 */
function hfRepoPath(repoId) {
  const segments = repoId.trim().split("/");
  const encoded = [];
  for (const segment of segments) {
    if (segment) encoded.push(encodeURIComponent(segment));
  }
  return encoded.join("/");
}
294
+
295
/**
 * Normalize user input to a bare `org/name` model id.
 * Accepts pasted browser URLs, e.g.
 * https://huggingface.co/Qwen/Qwen3-30B-A3B → Qwen/Qwen3-30B-A3B.
 * Dataset/Space URLs, non-Hub URLs, and unparseable text pass through unchanged.
 */
function normalizeHfModelInput(raw) {
  const value = String(raw).trim();
  if (!value) return value;
  if (!/^https?:\/\//i.test(value)) return value;
  try {
    const url = new URL(value);
    const host = url.hostname.replace(/^www\./i, "").toLowerCase();
    const isHubHost = host === "huggingface.co" || host === "hf.co";
    if (isHubHost) {
      const segs = url.pathname.split("/").filter(Boolean);
      const first = segs[0];
      // Only model repos: dataset/space URLs keep their original form.
      if (first !== "datasets" && first !== "spaces" && segs.length >= 2) {
        return `${decodeURIComponent(segs[0])}/${decodeURIComponent(segs[1])}`;
      }
    }
  } catch {
    /* unparseable URL — fall through and return the raw text */
  }
  return value;
}
314
+
315
// Static reference table of GPUs used for TP sizing, totals, and the catalog UI.
// `id` is the stable key referenced by the Preferred-GPU dropdown and card clicks;
// vramGb drives all memory math. Bandwidth/TFLOPS/TDP are display-only reference
// values (several entries are explicitly approximate — see their `notes`).
const GPU_CATALOG = [
  { id: "h100-sxm", name: "NVIDIA H100 SXM", vramGb: 80, memBandwidthGbps: 3350, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "Datacenter flagship; best for large TP." },
  { id: "h100-pcie", name: "NVIDIA H100 PCIe", vramGb: 80, memBandwidthGbps: 2000, tdpW: 350, fp16Tflops: 756, pcie: "PCIe 5.0 x16", notes: "Slightly lower BW than SXM." },
  { id: "h200", name: "NVIDIA H200", vramGb: 141, memBandwidthGbps: 4800, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "More HBM than H100." },
  { id: "b200", name: "NVIDIA B200", vramGb: 192, memBandwidthGbps: 8000, tdpW: 1000, fp16Tflops: 2250, pcie: "NVLink / rack", notes: "Blackwell; approximate specs." },
  { id: "a100-80", name: "NVIDIA A100 80GB", vramGb: 80, memBandwidthGbps: 2039, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "Common in clouds." },
  { id: "a100-40", name: "NVIDIA A100 40GB", vramGb: 40, memBandwidthGbps: 1555, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "" },
  { id: "l40s", name: "NVIDIA L40S", vramGb: 48, memBandwidthGbps: 864, tdpW: 350, fp16Tflops: 362, pcie: "PCIe 4.0 x16", notes: "Inference-oriented Ada." },
  { id: "l40", name: "NVIDIA L40", vramGb: 48, memBandwidthGbps: 864, tdpW: 300, fp16Tflops: 181, pcie: "PCIe 4.0 x16", notes: "Legacy Ada datacenter; predecessor to L40S." },
  { id: "a30", name: "NVIDIA A30", vramGb: 24, memBandwidthGbps: 933, tdpW: 165, fp16Tflops: 165, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere; compact inference." },
  { id: "a10", name: "NVIDIA A10", vramGb: 24, memBandwidthGbps: 600, tdpW: 150, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere single-slot cloud GPU." },
  { id: "a10g", name: "NVIDIA A10G", vramGb: 24, memBandwidthGbps: 600, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "A10-class (e.g. AWS G5); ref. specs." },
  { id: "l4", name: "NVIDIA L4", vramGb: 24, memBandwidthGbps: 300, tdpW: 72, fp16Tflops: 120, pcie: "PCIe 4.0 x16", notes: "Legacy Ada low-power inference." },
  { id: "t4", name: "NVIDIA T4", vramGb: 16, memBandwidthGbps: 320, tdpW: 70, fp16Tflops: 65, pcie: "PCIe 3.0 x16", notes: "Legacy Turing inference." },
  { id: "v100-32", name: "NVIDIA V100 32GB", vramGb: 32, memBandwidthGbps: 1134, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta; still common in older clusters." },
  { id: "v100-16", name: "NVIDIA V100 16GB", vramGb: 16, memBandwidthGbps: 900, tdpW: 250, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta 16 GB SKU." },
  { id: "p100-16", name: "NVIDIA P100 16GB", vramGb: 16, memBandwidthGbps: 732, tdpW: 250, fp16Tflops: 19, pcie: "PCIe 3.0", notes: "Legacy Pascal; very dated for LLMs." },
  { id: "a6000", name: "NVIDIA RTX A6000", vramGb: 48, memBandwidthGbps: 768, tdpW: 300, fp16Tflops: 155, pcie: "PCIe 4.0 x16", notes: "Workstation." },
  { id: "3090", name: "NVIDIA GeForce RTX 3090", vramGb: 24, memBandwidthGbps: 936, tdpW: 350, fp16Tflops: 160, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere consumer; 24 GB." },
  { id: "4090", name: "NVIDIA GeForce RTX 4090", vramGb: 24, memBandwidthGbps: 1008, tdpW: 450, fp16Tflops: 330, pcie: "PCIe 4.0 x16", notes: "High BW consumer card." },
  { id: "4080", name: "NVIDIA GeForce RTX 4080", vramGb: 16, memBandwidthGbps: 717, tdpW: 320, fp16Tflops: 195, pcie: "PCIe 4.0 x16", notes: "" },
  { id: "5090", name: "NVIDIA GeForce RTX 5090", vramGb: 32, memBandwidthGbps: 1792, tdpW: 575, fp16Tflops: 420, pcie: "PCIe 5.0 x16", notes: "Approximate consumer flagship." },
  { id: "mi300x", name: "AMD MI300X", vramGb: 192, memBandwidthGbps: 5300, tdpW: 750, fp16Tflops: 1300, pcie: "OAM", notes: "Approximate; check ROCm/vLLM support." },
];
339
+
340
// Build the Authorization header object from the optional HF-token input.
// Returns an empty object when the field is blank, so it can be spread
// unconditionally into fetch() options.
function authHeaders() {
  const t = document.getElementById("hfToken").value.trim();
  return t ? { Authorization: `Bearer ${t}` } : {};
}
344
+
345
/**
 * GET `url` with the user's HF auth header attached.
 * Resolves with the Response on 2xx; throws on any non-OK status.
 */
async function hfFetch(url) {
  const response = await fetch(url, { headers: { ...authHeaders() } });
  if (response.ok) return response;
  throw new Error(`${response.status} ${response.statusText} — ${url}`);
}
350
+
351
/** GET `url` via hfFetch and parse the response body as JSON. */
async function hfJson(url) {
  return (await hfFetch(url)).json();
}
355
+
356
/** GET `url` via hfFetch and return the response body as text. */
async function hfText(url) {
  return (await hfFetch(url)).text();
}
360
+
361
/**
 * Sum the sizes of weight files from the Hub tree API listing.
 * Keeps `.safetensors` shards and pytorch `.bin` shards, skips training
 * artifacts (optimizer/scheduler/tf/flax/rust checkpoints), and returns the
 * shards sorted largest-first plus total and max shard sizes in bytes.
 */
function analyzeTreeFiles(tree) {
  const skipMarkers = ["training_args", "optimizer", "scheduler", "tf_model", "flax_model", "rust_model"];

  // Predicate: does this tree entry look like a model weight file?
  const isWeightFile = (entry) => {
    if (entry.type !== "blob" && entry.type !== "file") return false;
    if (typeof entry.size !== "number") return false;
    const lower = entry.path.toLowerCase();
    for (const marker of skipMarkers) {
      if (lower.includes(marker)) return false;
    }
    if (lower.endsWith(".safetensors")) return true;
    if (!lower.endsWith(".bin")) return false;
    return (
      lower.endsWith("pytorch_model.bin") ||
      /model-\d+-of-\d+\.bin$/.test(lower) ||
      lower.includes("pytorch_model-")
    );
  };

  let totalBytes = 0;
  const shards = [];
  for (const entry of tree) {
    if (!isWeightFile(entry)) continue;
    totalBytes += entry.size;
    shards.push({ path: entry.path, sizeBytes: entry.size, sizeGb: entry.size / 1e9 });
  }
  shards.sort((a, b) => b.sizeBytes - a.sizeBytes);
  const maxShardBytes = shards.length > 0 ? shards[0].sizeBytes : 0;
  return { files: shards, totalBytes, maxShardBytes };
}
384
+
385
/** Parse config.json text; returns null instead of throwing on invalid JSON. */
function parseConfigJson(text) {
  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
392
+
393
/**
 * Rough parameter count for a Llama-like transformer config (heuristic).
 * Counts token embeddings, attention (Q/O full-width, K/V sized by KV heads),
 * a gated 3-matrix MLP, layer norms, and an untied LM head. Honors an explicit
 * `num_parameters` field when present; returns null when required fields
 * are missing.
 */
function estimateParamsFromConfig(cfg) {
  if (!cfg) return null;
  if (typeof cfg.num_parameters === "number") return cfg.num_parameters;
  const hidden = cfg.hidden_size;
  const layers = cfg.num_hidden_layers;
  const vocab = cfg.vocab_size;
  const inter = cfg.intermediate_size;
  const heads = cfg.num_attention_heads;
  const kvHeads = cfg.num_key_value_heads ?? heads;
  if (!hidden || !layers || !vocab || !inter || !heads) return null;
  const headDim = hidden / heads;
  const embedParams = vocab * hidden;
  // Q and O projections are hidden×hidden; K and V shrink with GQA/MQA.
  const attnPerLayer = 2 * hidden * hidden + 2 * kvHeads * headDim * hidden;
  const mlpPerLayer = 3 * hidden * inter;
  const normParams = 2 * hidden * layers * 2;
  const lmHead = hidden * vocab;
  return embedParams + layers * (attnPerLayer + mlpPerLayer) + normParams + lmHead;
}
411
+ }
412
+
413
/**
 * KV-cache bytes needed per token, summed over all layers:
 * 2 tensors (K and V) × num_kv_heads × head_dim × bytes-per-element × layers.
 * Returns 0 when the config is missing or lacks the attention fields.
 */
function kvBytesPerToken(cfg, kvBytesPerEl) {
  if (!cfg) return 0;
  const hidden = cfg.hidden_size;
  const layers = cfg.num_hidden_layers;
  const heads = cfg.num_attention_heads;
  if (!hidden || !layers || !heads) return 0;
  const kvHeads = cfg.num_key_value_heads ?? heads;
  const headDim = hidden / heads;
  return layers * 2 * kvHeads * headDim * kvBytesPerEl;
}
427
+
428
/** Bytes of VRAM per weight parameter: 1 for fp8, otherwise 2 (bf16/fp16). */
function bytesPerParamWeight(dtype) {
  if (dtype === "fp8") return 1;
  return 2;
}
431
+
432
/** VRAM the scheduler may actually use: raw capacity scaled by the target utilization fraction. */
function usableVramGb(vramGb, util) {
  const usable = vramGb * util;
  return usable;
}
435
+
436
/**
 * Minimum tensor-parallel size so weights + KV cache fit in VRAM.
 * With TP, both are sharded across ranks: per-GPU ≈ (weight + kv) / tp,
 * hence tp ≥ ceil((weight + kv) / usable). Returns Infinity when a GPU has
 * no usable VRAM; never returns less than 1.
 */
function minTpForWeightsAndKv(totalWeightGb, kvTotalGb, usablePerGpuGb) {
  if (usablePerGpuGb <= 0) return Infinity;
  const demandGb = totalWeightGb + kvTotalGb;
  const ranksNeeded = Math.ceil(demandGb / usablePerGpuGb);
  return Math.max(1, ranksNeeded);
}
446
+
447
/**
 * Minimum TP so the largest on-disk shard fits on one GPU.
 * Returns 1 when no shard size is known (falsy/zero), Infinity when the GPU
 * has no usable VRAM. Load-time peak can differ by loader; this is a floor.
 */
function minTpForLargestShard(maxShardGb, usablePerGpuGb) {
  if (!maxShardGb || maxShardGb <= 0) return 1;
  if (usablePerGpuGb > 0) {
    return Math.max(1, Math.ceil(maxShardGb / usablePerGpuGb));
  }
  return Infinity;
}
452
+
453
/**
 * Fill and reveal the #gpuDetailPanel card with the full spec sheet for one
 * catalog GPU. `gpu` is an entry from the static GPU_CATALOG (trusted data,
 * not user input, so the innerHTML interpolation here is safe).
 */
function renderGpuDetail(gpu) {
  const el = document.getElementById("gpuDetailPanel");
  el.hidden = false;
  el.innerHTML = `
    <div class="gpu-detail">
      <strong style="color:var(--accent)">${gpu.name}</strong>
      <dl style="margin-top:0.75rem">
        <dt>VRAM</dt><dd>${gpu.vramGb} GB</dd>
        <dt>Memory bandwidth (ref.)</dt><dd>~${gpu.memBandwidthGbps} GB/s</dd>
        <dt>FP16 TFLOPS (ref.)</dt><dd>~${gpu.fp16Tflops}</dd>
        <dt>TDP (ref.)</dt><dd>${gpu.tdpW} W</dd>
        <dt>PCIe</dt><dd>${gpu.pcie}</dd>
        <dt>Notes</dt><dd>${gpu.notes || "—"}</dd>
      </dl>
      <p class="hint" style="margin-bottom:0">Published specs vary by SKU and firmware; use vendor datasheets for procurement.</p>
    </div>
  `;
}
471
+
472
// Id of the GPU card the user last clicked in the catalog; overrides the
// Preferred dropdown when generating commands. null until a card is clicked.
let selectedGpuId = null;
// Result of the last successful fetch, reused to re-render when a setting
// changes without re-hitting the Hub.
/** @type {{ models: object[] } | null} */
let lastFetchCtx = null;
// Monotonic counter used to mint unique model-row ids (`mr-<n>`).
let rowIdSeq = 0;
476
+
477
/**
 * Populate the Preferred-GPU <select> from GPU_CATALOG and default it to the
 * H100 SXM entry. Idempotent: bails out if the select is missing or already
 * has options, so it can be called on every render.
 */
function populatePreferredGpuSelect() {
  const sel = document.getElementById("preferredGpu");
  if (!sel || sel.options.length) return; // already populated — do nothing
  GPU_CATALOG.forEach((g) => {
    const o = document.createElement("option");
    o.value = g.id;
    o.textContent = `${g.name} (${g.vramGb} GB)`;
    sel.appendChild(o);
  });
  sel.value = "h100-sxm";
}
488
+
489
/**
 * Collect all model-row input values, normalize pasted URLs to `org/name`
 * ids, and drop empty rows. Returns an array of model ids ready to fetch.
 */
function getModelIdsFromInputs() {
  return Array.from(document.querySelectorAll(".model-id-input"))
    .map((el) => normalizeHfModelInput(el.value.trim()))
    .filter(Boolean);
}
494
+
495
/**
 * Rewrite each model input in place with its normalized `org/name` form so
 * the user sees exactly what will be fetched. Leaves inputs untouched when
 * normalization produced nothing or changed nothing.
 */
function syncInputValuesFromNormalized() {
  const inputs = document.querySelectorAll(".model-id-input");
  inputs.forEach((el) => {
    const n = normalizeHfModelInput(el.value.trim());
    if (n && n !== el.value.trim()) el.value = n;
  });
}
502
+
503
/**
 * Append one model-input row (text input + Remove button) to
 * #modelListContainer and wire up its Remove handler.
 * @param {string} initial - optional model id to prefill the input with.
 */
function addModelRow(initial = "") {
  const container = document.getElementById("modelListContainer");
  const id = `mr-${++rowIdSeq}`;
  const wrap = document.createElement("div");
  wrap.className = "model-row";
  wrap.dataset.rowId = id;
  wrap.innerHTML = `
    <div>
      <label class="model-row-label" style="font-size:0.8rem;color:var(--muted)">Model id or URL</label>
      <input type="text" class="model-id-input" placeholder="org/model or https://huggingface.co/…" autocomplete="off" />
    </div>
    <button type="button" class="btn-ghost danger btn-remove-model" title="Remove">Remove</button>`;
  // Prefill via .value (not interpolated into the HTML) so arbitrary text
  // cannot inject markup.
  wrap.querySelector(".model-id-input").value = initial;
  container.appendChild(wrap);
  wrap.querySelector(".btn-remove-model").addEventListener("click", () => {
    if (document.querySelectorAll(".model-row").length <= 1) return; // always keep at least one row
    wrap.remove();
  });
}
522
+
523
/**
 * Fetch everything needed to size one model from the Hub:
 * repo metadata, the recursive file tree (weight-shard sizes), config.json
 * (for KV/param estimates), and — as a fallback when the tree lists no
 * weight files — the weight index file's `metadata.total_size`.
 * Optional parts degrade to null on failure; only the metadata/tree requests
 * can throw (e.g. 401 on gated repos without a token).
 * @param {string} modelId - `org/name` repo id (already normalized).
 * @returns {Promise<object>} ctx object consumed by metricsForCtx and the renderers.
 */
async function fetchOneModel(modelId) {
  const meta = await hfJson(`${HF_API}/models/${hfRepoPath(modelId)}`);
  // Pin all follow-up requests to the same revision the metadata reported.
  const ref = meta.sha || "main";
  const treeUrl = `${HF_API}/models/${hfRepoPath(modelId)}/tree/${encodeURIComponent(ref)}?recursive=true`;
  const tree = await hfJson(treeUrl);
  const analysis = analyzeTreeFiles(Array.isArray(tree) ? tree : []);

  let config = null;
  try {
    const cfgUrl = `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/config.json`;
    const cfgText = await hfText(cfgUrl);
    config = parseConfigJson(cfgText);
  } catch {
    config = null; // config.json missing or unreadable — estimates degrade gracefully
  }

  // Optional: total weight size from the shard index file (safetensors first,
  // then the legacy pytorch index). First candidate with metadata wins.
  let indexMeta = null;
  try {
    const idxCandidates = [
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/model.safetensors.index.json`,
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/pytorch_model.bin.index.json`,
    ];
    for (const u of idxCandidates) {
      try {
        const j = await hfJson(u);
        if (j.metadata && j.metadata.total_size != null) {
          indexMeta = j.metadata;
          break;
        }
      } catch { /* try next */ }
    }
  } catch { /* optional */ }
  const totalBytesFromIndex = indexMeta && indexMeta.total_size ? Number(indexMeta.total_size) : null;

  // Prefer sizes from the tree listing; use the index total only when the
  // tree produced nothing.
  const totalBytes = analysis.totalBytes > 0 ? analysis.totalBytes : totalBytesFromIndex;
  const totalGbDisk = totalBytes != null ? totalBytes / 1e9 : null;
  // When no per-shard sizes are known, fall back to the full size (worst case).
  const maxShardGb = analysis.maxShardBytes > 0 ? analysis.maxShardBytes / 1e9 : (totalGbDisk || 0);
  const estParams = estimateParamsFromConfig(config);

  return { modelId, meta, analysis, config, totalGbDisk, maxShardGb, estParams };
}
564
+
565
/**
 * Derive per-model memory metrics from the fetched ctx plus the current form
 * settings (weight dtype, KV dtype, max length, concurrency hint).
 * Returns { weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint }.
 */
function metricsForCtx(ctx) {
  const weightDtype = document.getElementById("weightDtype").value;
  const bPerParam = bytesPerParamWeight(weightDtype);
  const weightGbFromParams = ctx.estParams != null ? (ctx.estParams * bPerParam) / 1e9 : null;
  // Scales the on-disk size by bPerParam/2 — assumes checkpoints on disk are
  // 16-bit weights. NOTE(review): this over/under-estimates for repos stored
  // in fp8/int4 — confirm for quantized checkpoints.
  const weightGb = ctx.totalGbDisk != null ? ctx.totalGbDisk * (bPerParam / 2) : weightGbFromParams;

  const kvSel = document.getElementById("kvDtype").value;
  // "auto" is costed like fp16 (2 bytes/element); only "fp8" halves it.
  const kvBytesPerEl = kvSel === "fp8" ? 1 : 2;
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);
  const batchHint = Math.max(1, parseInt(document.getElementById("batchHint").value, 10) || 1);

  const kvPerToken = kvBytesPerToken(ctx.config, kvBytesPerEl);
  // KV budget = per-token bytes × context length × concurrent sequences.
  const kvTotalGb = (kvPerToken * maxLen * batchHint) / 1e9;
  return { weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint };
}
580
+
581
/**
 * Tensor-parallel size for one model on one GPU type: the larger of the
 * memory-driven TP (weights + KV must fit across ranks) and the shard-driven
 * TP (largest on-disk shard must fit on one rank).
 * Returns null when the weight size is unknown.
 */
function tpForModelOnGpu(ctx, weightGb, kvTotalGb, gpu, util) {
  if (weightGb == null) return null;
  const usable = usableVramGb(gpu.vramGb, util);
  const byMemory = minTpForWeightsAndKv(weightGb, kvTotalGb, usable);
  const byShard = minTpForLargestShard(ctx.maxShardGb, usable);
  return Math.max(byMemory, byShard);
}
588
+
589
/**
 * GPU used for the generated vLLM commands: a clicked catalog card
 * (selectedGpuId) overrides the Preferred dropdown; falls back to the
 * dropdown choice, then to the first catalog entry.
 */
function gpuForCommands() {
  const prefId = document.getElementById("preferredGpu").value;
  if (selectedGpuId) {
    const g = GPU_CATALOG.find((x) => x.id === selectedGpuId);
    if (g) return g;
  }
  return GPU_CATALOG.find((x) => x.id === prefId) || GPU_CATALOG[0];
}
598
+
599
/**
 * Render one `vllm serve` command block per model into #vllmCmd, sized for
 * the command-target GPU (clicked card or Preferred dropdown), plus a summary
 * hint of which GPU drove the numbers. No-op when there are no models or the
 * output element is missing.
 * @param {object[]} models - array of fetched model ctx objects.
 */
function renderVllmCommands(models) {
  const hintEl = document.getElementById("commandGpuHint");
  const cmdEl = document.getElementById("vllmCmd");
  if (!models || !models.length || !cmdEl) return;

  // Clamp utilization to the same [0.5, 0.98] range the input enforces.
  const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9));
  const cmdGpu = gpuForCommands();
  const usableCmd = usableVramGb(cmdGpu.vramGb, util);

  // NOTE(review): vLLM's --kv-cache-dtype typically accepts auto / fp8 /
  // fp8_e5m2 / fp8_e4m3; plain "fp16" may be rejected by some versions —
  // confirm against the installed vLLM release.
  const kvFlag =
    document.getElementById("kvDtype").value === "fp8"
      ? "fp8_e5m2"
      : document.getElementById("kvDtype").value === "fp16"
        ? "fp16"
        : "auto";
  const dtypeFlag = document.getElementById("weightDtype").value === "fp8" ? "float8_e4m3fn" : "bfloat16";
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);

  // One block per model; total GPU count is the sum of each model's TP size
  // (separate server processes, disjoint GPU sets).
  const blocks = [];
  let totalCmd = 0;
  models.forEach((ctx) => {
    const m = metricsForCtx(ctx);
    const tp = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, cmdGpu, util);
    const tpUse = typeof tp === "number" && !Number.isNaN(tp) ? tp : 1;
    totalCmd += tpUse;
    blocks.push({ ctx, tpUse });
  });

  const lines = [
    `# Total GPUs (separate vLLM servers, ${cmdGpu.name}): ${totalCmd}`,
    `# ~${usableCmd.toFixed(1)} GB usable per GPU @ ${(util * 100).toFixed(0)}% of ${cmdGpu.vramGb} GB VRAM`,
    `# Assign disjoint CUDA_VISIBLE_DEVICES per server on the same host.`,
    "",
  ];
  blocks.forEach((b, i) => {
    const port = 8000 + i; // unique port per server on the same host
    lines.push(
      `# --- ${b.ctx.modelId} ---`,
      `vllm serve "${b.ctx.modelId}" \\`,
      ` --dtype ${dtypeFlag} \\`,
      ` --tensor-parallel-size ${b.tpUse} \\`,
      ` --max-model-len ${maxLen} \\`,
      ` --gpu-memory-utilization ${util} \\`,
      ` --kv-cache-dtype ${kvFlag} \\`,
      ` --port ${port}`,
      ""
    );
  });
  cmdEl.textContent = lines.join("\n").trimEnd();

  if (hintEl) {
    const src = selectedGpuId ? "GPU catalog (clicked card)" : "Preferred GPU dropdown";
    hintEl.textContent = `Tensor parallelism in the commands below uses ${cmdGpu.name} (~${usableCmd.toFixed(1)} GB usable per GPU). Source: ${src}. Click a GPU card to override the dropdown; change the dropdown to clear the override.`;
  }
}
654
+
655
/**
 * Rebuild the clickable GPU-catalog grid. Each card shows usable VRAM, how
 * many of the largest shard fit, and (when available) the max TP any model
 * needs on that GPU. Clicking a card selects it as the command-target GPU,
 * shows its detail panel, and re-runs the multi-model computation.
 * @param {object} state - precomputed per-GPU figures:
 *   usablePerGpuByGpu/shardsFit/tp keyed by gpu id, plus util and preferredGpuId.
 */
function buildGpuGrid(state) {
  const grid = document.getElementById("gpuGrid");
  grid.innerHTML = ""; // full rebuild on every render
  const { usablePerGpuByGpu, shardsFit, tp, util, preferredGpuId } = state;
  const cmdGpuId = gpuForCommands().id;

  GPU_CATALOG.forEach((gpu) => {
    const usable = usablePerGpuByGpu[gpu.id];
    const fit = shardsFit[gpu.id];
    const isPref = preferredGpuId && gpu.id === preferredGpuId;
    const isCmdTarget = gpu.id === cmdGpuId;
    const card = document.createElement("div");
    card.className =
      "gpu-card" +
      (selectedGpuId === gpu.id ? " selected" : "") +
      (isPref ? " preferred" : "") +
      (isCmdTarget ? " commands-target" : "");
    // Static catalog data only — not user input — so innerHTML is safe here.
    card.innerHTML = `
      <div class="name">${isPref ? '<span style="float:right;font-size:0.65rem;color:var(--accent2);text-transform:uppercase">preferred</span>' : ""}${gpu.name}</div>
      <div class="vram">${gpu.vramGb} GB VRAM · ~${usable.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</div>
      <div style="margin-top:0.4rem;font-size:0.78rem;color:var(--muted)">
        Shards fit (largest shard across models) / GPU: <strong style="color:var(--text)">${fit}</strong>
        ${tp[gpu.id] != null ? ` · max TP any model: <strong style="color:var(--good)">${tp[gpu.id]}</strong>` : ""}
      </div>
    `;
    card.addEventListener("click", () => {
      selectedGpuId = gpu.id; // override the Preferred dropdown for commands
      document.querySelectorAll(".gpu-card").forEach((c) => c.classList.remove("selected"));
      card.classList.add("selected");
      renderGpuDetail(gpu);
      // Re-render totals/commands with the newly selected command-target GPU.
      if (lastFetchCtx && lastFetchCtx.models) computeAndRenderMulti(lastFetchCtx.models);
    });
    grid.appendChild(card);
  });
}
690
+
691
/**
 * Recompute sizing estimates for every fetched model and re-render all
 * results panels: per-model summary tables, the memory breakdown, the GPU
 * catalog grid, the multi-model deployment table, and the vLLM commands.
 * @param {object[]} models — array of Hub fetch ctx (one per model, as
 *   produced by fetchOneModel)
 */
function computeAndRenderMulti(models) {
  // GPU memory-utilization fraction, clamped to [0.5, 0.98]; falls back to
  // 0.9 when the input field does not parse as a number.
  const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9));
  const preferredGpuId = document.getElementById("preferredGpu").value;
  // Fall back to the first catalog entry when the dropdown value is unknown.
  const prefGpu = GPU_CATALOG.find((g) => g.id === preferredGpuId) || GPU_CATALOG[0];

  // For each model: memory metrics plus the minimum tensor-parallel degree
  // on the preferred GPU and the resulting per-GPU memory at that degree.
  const perModel = models.map((ctx) => {
    const m = metricsForCtx(ctx);
    const tpPref = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, prefGpu, util);
    const perGpuPref =
      m.weightGb != null && tpPref != null ? (m.weightGb + m.kvTotalGb) / tpPref : null;
    return { ctx, ...m, tpOnPreferred: tpPref, perGpuOnPreferred: perGpuPref };
  });

  // Largest single weight shard across all models (0 when unknown) —
  // used below to compute how many shards fit on each catalog GPU.
  const maxShardAll = Math.max(0, ...models.map((c) => c.maxShardGb || 0));

  // Build per-model summary (<details>) and memory-breakdown HTML.
  let summaryHtml = "";
  let memHtml = "";
  perModel.forEach((row, idx) => {
    const { ctx, weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint } = row;
    // Show at most 12 shard rows; collapse the remainder into one "… N more" row.
    const shardRows = ctx.analysis.files.slice(0, 12).map((f) =>
      `<tr><td>${escapeHtml(f.path)}</td><td>${f.sizeGb.toFixed(2)}</td></tr>`
    ).join("");
    const moreShards =
      ctx.analysis.files.length > 12
        ? `<tr><td colspan="2">… ${ctx.analysis.files.length - 12} more</td></tr>`
        : "";

    // Only the first model's <details> starts expanded.
    summaryHtml += `
      <details class="model-block" ${idx === 0 ? "open" : ""}>
        <summary>${escapeHtml(ctx.modelId)}</summary>
        <p style="margin:0.5rem 0;font-size:0.85rem;color:var(--muted)">${escapeHtml(ctx.meta.pipeline_tag || ctx.meta.library_name || "model")}</p>
        <table>
          <tr><th>Metric</th><th>Value</th></tr>
          <tr><td>Weight files total</td><td>${ctx.totalGbDisk != null ? ctx.totalGbDisk.toFixed(2) + " GB" : "unknown"}</td></tr>
          <tr><td>Largest shard</td><td>${ctx.maxShardGb > 0 ? ctx.maxShardGb.toFixed(2) + " GB" : "—"}</td></tr>
          <tr><td>Est. weight (${weightDtype})</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr>
        </table>
        ${ctx.analysis.files.length ? `<table style="margin-top:0.5rem"><tr><th>File</th><th>GB</th></tr>${shardRows}${moreShards}</table>` : ""}
      </details>`;

    memHtml += `
      <h3 style="font-size:0.9rem;margin:0.75rem 0 0.4rem;color:var(--accent)">${escapeHtml(ctx.modelId)}</h3>
      <table>
        <tr><th>Component</th><th>Estimate</th></tr>
        <tr><td>Weights</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr>
        <tr><td>KV (${kvSel}, ${maxLen} × ${batchHint} seqs)</td><td>${kvTotalGb.toFixed(3)} GB</td></tr>
        <tr><td>KV / token</td><td>${(kvPerToken / 1024).toFixed(2)} KiB</td></tr>
      </table>`;
  });

  document.getElementById("modelSummary").innerHTML = summaryHtml || "<p class='hint'>No models.</p>";
  document.getElementById("memBreakdown").innerHTML =
    memHtml + `<p class="hint">KV is a planning upper bound; vLLM paging changes real usage.</p>`;

  // Per-catalog-GPU stats fed into the GPU grid: usable VRAM, how many of
  // the largest shard fit, and the highest per-model minimum TP.
  const usablePerGpuByGpu = {};
  const shardsFit = {};
  const minTp = {};
  for (const gpu of GPU_CATALOG) {
    const usable = usableVramGb(gpu.vramGb, util);
    usablePerGpuByGpu[gpu.id] = usable;
    shardsFit[gpu.id] =
      maxShardAll > 0 ? Math.floor(usable / maxShardAll) : 0;
    // Maximum over models of each model's required TP on this GPU
    // (the grid renders it as "max TP any model").
    let maxTp = 0;
    for (const row of perModel) {
      if (row.weightGb == null) continue;
      const t = tpForModelOnGpu(row.ctx, row.weightGb, row.kvTotalGb, gpu, util);
      if (t != null && t > maxTp) maxTp = t;
    }
    // `maxTp || null` maps 0 ("no model produced a TP") to null so the
    // grid can omit the stat entirely.
    minTp[gpu.id] = maxTp || null;
  }

  buildGpuGrid({ util, usablePerGpuByGpu, shardsFit, tp: minTp, preferredGpuId });

  // Total accelerators if every model runs its own dedicated TP group on
  // the preferred GPU (non-numeric/NaN TP values contribute 0).
  const totalGpusSeparate = perModel.reduce(
    (s, r) => s + (typeof r.tpOnPreferred === "number" && !Number.isNaN(r.tpOnPreferred) ? r.tpOnPreferred : 0),
    0
  );
  const sumMemOneGpu = perModel.reduce((s, r) => s + (r.weightGb || 0) + r.kvTotalGb, 0);
  const usablePref = usableVramGb(prefGpu.vramGb, util);
  // Co-locating all models on one GPU requires: combined weights+KV fit in
  // usable VRAM AND every model's minimum TP is 1 on that GPU.
  const eachTpOne = perModel.every((r) => r.tpOnPreferred === 1);
  const fitsAllOnSingleGpu = sumMemOneGpu <= usablePref && eachTpOne;

  let multiHtml = `
    <p class="hint" style="margin-bottom:0.75rem">This table uses the <strong>Preferred GPU</strong> dropdown only. The <strong>vLLM commands</strong> section uses that same GPU until you click a GPU in the catalog — then commands switch to the clicked GPU (dashed outline). Changing the dropdown clears the click override.</p>
    <p><strong>Preferred GPU:</strong> ${escapeHtml(prefGpu.name)} — ~${usablePref.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</p>
    <table>
      <tr><th>Model</th><th>Weights+KV (est.)</th><th>Min TP on preferred</th><th>GPUs (dedicated group)</th></tr>
      ${perModel
        .map((r) => {
          const sum = r.weightGb != null ? r.weightGb + r.kvTotalGb : r.kvTotalGb;
          const tp = r.tpOnPreferred ?? "—";
          const gpus = r.tpOnPreferred ?? "—";
          return `<tr>
            <td>${escapeHtml(r.ctx.modelId)}</td>
            <td>${sum.toFixed(2)} GB</td>
            <td>${tp}</td>
            <td>${gpus}</td>
          </tr>`;
        })
        .join("")}
      <tr style="font-weight:600;border-top:2px solid var(--border)">
        <td>Total (separate instances)</td>
        <td>—</td>
        <td>—</td>
        <td>${totalGpusSeparate || "—"} GPUs</td>
      </tr>
    </table>
    <p class="hint" style="margin-top:0.75rem">
      <strong>Separate instances:</strong> each model uses its own tensor-parallel group; total accelerator count ≈ <strong>${totalGpusSeparate}</strong> × ${escapeHtml(prefGpu.name)} (no GPU sharing between models).
    </p>
    <p class="hint">
      <strong>Single GPU, multiple models:</strong> needs sum(weights+KV) ≤ usable VRAM on one GPU <em>and</em> each model’s min TP = 1 on that GPU.
      Here sum ≈ <strong>${sumMemOneGpu.toFixed(2)} GB</strong> vs <strong>${usablePref.toFixed(2)} GB</strong> usable —
      ${fitsAllOnSingleGpu ? '<span style="color:var(--good)">may fit in theory (still not recommended for large models — VRAM fragmentation &amp; two processes).</span>' : '<span style="color:#f87171">does not fit on one GPU of this type at current settings.</span>'}
    </p>
    <p class="hint"><strong>Max configuration on preferred GPU:</strong> at these dtype / max-model-len / batch settings, the table above is the minimum TP per model; you cannot lower TP without reducing context, batch, quantization, or choosing a larger GPU.</p>
  `;
  document.getElementById("multiDeployment").innerHTML = multiHtml;

  // Commands panel depends on the (possibly click-overridden) GPU choice.
  renderVllmCommands(models);
}
816
/**
 * Recompute all panels from the cached fetch results, but only when the
 * results are on screen and the model-id inputs still match the cache
 * exactly (same ids, same order). Otherwise do nothing — a fresh fetch
 * is required.
 */
function tryRecomputeFromCache() {
  if (!lastFetchCtx || !lastFetchCtx.models) return;
  if (document.getElementById("results").hidden) return;
  const currentIds = getModelIdsFromInputs();
  const cachedIds = lastFetchCtx.models.map((m) => m.modelId);
  const unchanged =
    currentIds.length === cachedIds.length &&
    cachedIds.every((id, i) => id === currentIds[i]);
  if (unchanged) computeAndRenderMulti(lastFetchCtx.models);
}
824
// "Add model" button: append another model-id input row to the form.
document
  .getElementById("btnAddModel")
  .addEventListener("click", () => addModelRow());
826
// "Fetch all & compute": load Hub metadata for every entered model id,
// then compute and render all results panels. Models are fetched
// sequentially; per-model failures are collected so one bad id does not
// abort the whole batch.
document.getElementById("btnFetch").addEventListener("click", async () => {
  syncInputValuesFromNormalized();
  const ids = getModelIdsFromInputs();
  const errEl = document.getElementById("fetchError");
  const results = document.getElementById("results");
  // Hide stale error/result panels while the new fetch is in flight.
  errEl.hidden = true;
  results.hidden = true;

  if (ids.length === 0) {
    errEl.textContent = "Add at least one Hugging Face model id or URL.";
    errEl.hidden = false;
    return;
  }

  // Disable the button and show a spinner until the fetch settles.
  const btn = document.getElementById("btnFetch");
  btn.disabled = true;
  btn.innerHTML = '<span class="spinner"></span>Loading…';

  try {
    const models = [];
    const errors = [];
    // Sequential fetch, one model at a time; record each failure keyed
    // by its model id and keep going.
    for (let i = 0; i < ids.length; i++) {
      try {
        models.push(await fetchOneModel(ids[i]));
      } catch (e) {
        errors.push(`${ids[i]}: ${e.message || e}`);
      }
    }
    // Every model failed: show the errors, drop the cache, stop.
    if (errors.length && models.length === 0) {
      errEl.textContent = errors.join("\n");
      errEl.hidden = false;
      lastFetchCtx = null;
      return;
    }
    // Partial failure: warn, but continue with the models that loaded.
    if (errors.length) {
      errEl.textContent = "Some models failed:\n" + errors.join("\n");
      errEl.hidden = false;
    }

    // Cache successful fetches so settings changes can recompute without
    // re-hitting the Hub; reset any previously click-selected GPU.
    lastFetchCtx = { models };
    document.getElementById("gpuDetailPanel").hidden = true;
    selectedGpuId = null;
    computeAndRenderMulti(models);

    results.hidden = false;
  } catch (e) {
    // Unexpected failure outside the per-model loop.
    errEl.textContent = e.message || String(e);
    errEl.hidden = false;
    lastFetchCtx = null;
  } finally {
    // Always restore the button, success or failure.
    btn.disabled = false;
    btn.textContent = "Fetch all & compute";
  }
});
881
/**
 * Escape a value for safe interpolation into HTML.
 *
 * Replaces the DOM textContent/innerHTML trick, which did not escape
 * quote characters (unsafe if the result is ever placed inside an
 * attribute value), created a throwaway DOM element per call, and turned
 * `undefined` into the literal string "undefined".
 *
 * @param {unknown} s - value to escape; null/undefined yield "".
 * @returns {string} text with &, <, >, " and ' replaced by entities
 */
function escapeHtml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // `& ` must be in the class so already-escaped text stays escaped once.
  return String(s ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
887
/**
 * Build a debounced wrapper around `fn`: each call resets a timer, and
 * `fn` fires once with the most recent arguments after `ms` milliseconds
 * of silence.
 * @param {Function} fn - function to debounce
 * @param {number} ms - quiet period in milliseconds
 * @returns {Function} debounced wrapper
 */
function debounce(fn, ms) {
  let pendingTimer = null;
  return function debounced(...callArgs) {
    if (pendingTimer !== null) clearTimeout(pendingTimer);
    pendingTimer = setTimeout(function fire() {
      pendingTimer = null;
      fn(...callArgs);
    }, ms);
  };
}
895
// Debounced recompute (350 ms quiet period) so rapid keystrokes in the
// numeric inputs trigger at most one recompute per pause.
const debouncedRecompute = debounce(tryRecomputeFromCache, 350);
897
// Wire the tuning controls: every control recomputes on "change"; numeric
// fields additionally recompute (debounced) on "input" while typing.
for (const controlId of ["maxModelLen", "batchHint", "gpuUtil", "weightDtype", "kvDtype"]) {
  const control = document.getElementById(controlId);
  if (!control) continue;
  control.addEventListener("change", tryRecomputeFromCache);
  if (control.type === "number") control.addEventListener("input", debouncedRecompute);
}
904
// Changing the preferred-GPU dropdown discards any click-selected GPU
// override in the catalog, then recomputes from the cached fetch results.
const preferredGpuSelect = document.getElementById("preferredGpu");
preferredGpuSelect.addEventListener("change", () => {
  selectedGpuId = null;
  tryRecomputeFromCache();
});
909
+ populatePreferredGpuSelect();
910
+ addModelRow();
911
  </script>
  </body>
</html>