shashankc28 commited on
Commit
ebd69f4
·
verified ·
1 Parent(s): 4641828

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +912 -18
index.html CHANGED
@@ -1,19 +1,913 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>vLLM deployment advisor</title>
7
+ <link rel="preconnect" href="https://huggingface.co" />
8
+ <style>
9
+ :root {
10
+ --bg: #0f1419;
11
+ --surface: #1a2332;
12
+ --surface2: #243044;
13
+ --border: #334155;
14
+ --text: #e2e8f0;
15
+ --muted: #94a3b8;
16
+ --accent: #38bdf8;
17
+ --accent2: #a78bfa;
18
+ --good: #34d399;
19
+ --warn: #fbbf24;
20
+ }
21
+ * { box-sizing: border-box; }
22
+ body {
23
+ margin: 0;
24
+ font-family: "Segoe UI", system-ui, sans-serif;
25
+ background: var(--bg);
26
+ color: var(--text);
27
+ line-height: 1.5;
28
+ min-height: 100vh;
29
+ }
30
+ .wrap {
31
+ max-width: 1100px;
32
+ margin: 0 auto;
33
+ padding: 1.5rem 1.25rem 3rem;
34
+ }
35
+ h1 {
36
+ font-size: 1.35rem;
37
+ font-weight: 600;
38
+ margin: 0 0 0.25rem;
39
+ letter-spacing: -0.02em;
40
+ }
41
+ .sub {
42
+ color: var(--muted);
43
+ font-size: 0.9rem;
44
+ margin-bottom: 1.5rem;
45
+ }
46
+ label {
47
+ display: block;
48
+ font-size: 0.8rem;
49
+ color: var(--muted);
50
+ margin-bottom: 0.35rem;
51
+ }
52
+ input[type="text"], input[type="number"], select {
53
+ width: 100%;
54
+ padding: 0.6rem 0.75rem;
55
+ border: 1px solid var(--border);
56
+ border-radius: 8px;
57
+ background: var(--surface);
58
+ color: var(--text);
59
+ font-size: 0.95rem;
60
+ }
61
+ input:focus, select:focus {
62
+ outline: 2px solid var(--accent);
63
+ outline-offset: 1px;
64
+ }
65
+ .row {
66
+ display: grid;
67
+ gap: 1rem;
68
+ margin-bottom: 1rem;
69
+ }
70
+ @media (min-width: 640px) {
71
+ .row.cols-2 { grid-template-columns: 1fr 1fr; }
72
+ .row.cols-3 { grid-template-columns: repeat(3, 1fr); }
73
+ }
74
+ button.primary {
75
+ padding: 0.65rem 1.25rem;
76
+ background: linear-gradient(135deg, #0ea5e9, #6366f1);
77
+ color: #fff;
78
+ border: none;
79
+ border-radius: 8px;
80
+ font-weight: 600;
81
+ cursor: pointer;
82
+ font-size: 0.95rem;
83
+ }
84
+ button.primary:hover { filter: brightness(1.08); }
85
+ button.primary:disabled { opacity: 0.5; cursor: not-allowed; }
86
+ .card {
87
+ background: var(--surface);
88
+ border: 1px solid var(--border);
89
+ border-radius: 12px;
90
+ padding: 1.1rem 1.25rem;
91
+ margin-top: 1rem;
92
+ }
93
+ .card h2 {
94
+ font-size: 1rem;
95
+ margin: 0 0 0.75rem;
96
+ color: var(--accent);
97
+ }
98
+ .err { color: #f87171; font-size: 0.9rem; margin-top: 0.5rem; }
99
+ .ok { color: var(--good); font-size: 0.9rem; }
100
+ table {
101
+ width: 100%;
102
+ border-collapse: collapse;
103
+ font-size: 0.85rem;
104
+ }
105
+ th, td {
106
+ text-align: left;
107
+ padding: 0.45rem 0.5rem;
108
+ border-bottom: 1px solid var(--border);
109
+ }
110
+ th { color: var(--muted); font-weight: 500; }
111
+ .gpu-grid {
112
+ display: grid;
113
+ gap: 0.65rem;
114
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
115
+ }
116
+ .gpu-card {
117
+ background: var(--surface2);
118
+ border: 1px solid var(--border);
119
+ border-radius: 10px;
120
+ padding: 0.75rem 0.9rem;
121
+ cursor: pointer;
122
+ transition: border-color 0.15s, box-shadow 0.15s;
123
+ }
124
+ .gpu-card:hover, .gpu-card.selected {
125
+ border-color: var(--accent);
126
+ box-shadow: 0 0 0 1px var(--accent);
127
+ }
128
+ .gpu-card .name { font-weight: 600; font-size: 0.9rem; }
129
+ .gpu-card .vram { color: var(--muted); font-size: 0.8rem; }
130
+ .gpu-detail {
131
+ margin-top: 1rem;
132
+ padding: 1rem;
133
+ background: var(--bg);
134
+ border-radius: 8px;
135
+ border: 1px solid var(--border);
136
+ font-size: 0.88rem;
137
+ }
138
+ .gpu-detail dl {
139
+ display: grid;
140
+ grid-template-columns: auto 1fr;
141
+ gap: 0.35rem 1rem;
142
+ margin: 0;
143
+ }
144
+ .gpu-detail dt { color: var(--muted); }
145
+ .gpu-detail dd { margin: 0; }
146
+ pre.cmd {
147
+ background: #0c1220;
148
+ border: 1px solid var(--border);
149
+ border-radius: 8px;
150
+ padding: 1rem;
151
+ overflow-x: auto;
152
+ font-size: 0.78rem;
153
+ line-height: 1.45;
154
+ white-space: pre-wrap;
155
+ word-break: break-all;
156
+ }
157
+ .badge {
158
+ display: inline-block;
159
+ padding: 0.15rem 0.45rem;
160
+ border-radius: 4px;
161
+ font-size: 0.75rem;
162
+ background: var(--surface2);
163
+ color: var(--muted);
164
+ }
165
+ .hint { font-size: 0.8rem; color: var(--muted); margin-top: 0.75rem; }
166
+ .spinner { display: inline-block; width: 1rem; height: 1rem; border: 2px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; vertical-align: middle; margin-right: 0.35rem; }
167
+ @keyframes spin { to { transform: rotate(360deg); } }
168
+ .model-row {
169
+ display: grid;
170
+ grid-template-columns: 1fr auto;
171
+ gap: 0.5rem;
172
+ align-items: end;
173
+ margin-bottom: 0.65rem;
174
+ }
175
+ .model-row .model-id-input { margin: 0; }
176
+ button.btn-ghost {
177
+ padding: 0.55rem 0.85rem;
178
+ background: var(--surface2);
179
+ color: var(--text);
180
+ border: 1px solid var(--border);
181
+ border-radius: 8px;
182
+ cursor: pointer;
183
+ font-size: 0.85rem;
184
+ }
185
+ button.btn-ghost:hover { border-color: var(--accent); }
186
+ button.btn-ghost.danger:hover { border-color: #f87171; color: #f87171; }
187
+ .gpu-card.preferred { border-color: var(--accent2); box-shadow: 0 0 0 1px var(--accent2); }
188
+ .gpu-card.commands-target { outline: 1px dashed var(--good); outline-offset: 2px; }
189
+ details.model-block { margin-bottom: 1rem; border: 1px solid var(--border); border-radius: 8px; padding: 0.5rem 0.75rem; background: var(--bg); }
190
+ details.model-block summary { cursor: pointer; font-weight: 600; color: var(--accent); }
191
+ </style>
192
+ </head>
193
+ <body>
194
+ <div class="wrap">
195
+ <h1>vLLM deployment advisor</h1>
196
+ <p class="sub">Pulls weight sizes from Hugging Face, estimates KV memory, and suggests tensor parallelism and <code style="color:var(--accent2)">vllm serve</code> commands. Add several models to estimate total GPUs on your preferred GPU type (separate vLLM instances). Estimates are heuristic — validate on your hardware.</p>
197
+
198
+ <div class="card" style="margin-top:0">
199
+ <label>Hugging Face models (one per serving endpoint)</label>
200
+ <p class="hint" style="margin-top:0">Each model is a separate <code>vllm serve</code> process. Planning assumes tensor-parallel groups do not share GPUs with another model unless you colocate manually.</p>
201
+ <div id="modelListContainer"></div>
202
+ <button type="button" class="btn-ghost" id="btnAddModel" style="margin-bottom:1rem">+ Add model</button>
203
+ <div class="row cols-2">
204
+ <div>
205
+ <label for="hfToken">HF token (optional, for gated/private)</label>
206
+ <input type="text" id="hfToken" placeholder="hf_..." autocomplete="off" />
207
+ </div>
208
+ <div>
209
+ <label for="preferredGpu">Preferred GPU (for TP &amp; totals)</label>
210
+ <select id="preferredGpu"></select>
211
+ </div>
212
+ </div>
213
+ <div class="row cols-3">
214
+ <div>
215
+ <label for="weightDtype">Weight memory (dtype)</label>
216
+ <select id="weightDtype">
217
+ <option value="bf16" selected>BF16 / FP16 (2 bytes/param)</option>
218
+ <option value="fp8">FP8 weights (~1 byte/param, if supported)</option>
219
+ </select>
220
+ </div>
221
+ <div>
222
+ <label for="kvDtype">KV cache dtype</label>
223
+ <select id="kvDtype">
224
+ <option value="auto">auto</option>
225
+ <option value="fp8" selected>fp8 (half KV vs fp16)</option>
226
+ <option value="fp16">fp16</option>
227
+ </select>
228
+ </div>
229
+ <div>
230
+ <label for="maxModelLen">Max model length (tokens)</label>
231
+ <input type="number" id="maxModelLen" value="8192" min="256" step="256" />
232
+ </div>
233
+ </div>
234
+ <div class="row cols-3">
235
+ <div>
236
+ <label for="gpuUtil">Target GPU memory utilization</label>
237
+ <input type="number" id="gpuUtil" value="0.90" min="0.5" max="0.98" step="0.01" />
238
+ </div>
239
+ <div>
240
+ <label for="batchHint">Concurrent sequences per model (KV hint)</label>
241
+ <input type="number" id="batchHint" value="8" min="1" max="512" step="1" />
242
+ </div>
243
+ <div style="display:flex;align-items:flex-end">
244
+ <button type="button" class="primary" id="btnFetch" style="width:100%">Fetch all &amp; compute</button>
245
+ </div>
246
+ </div>
247
+ <div id="fetchError" class="err" hidden></div>
248
+ </div>
249
+
250
+ <div id="results" hidden>
251
+ <div class="card">
252
+ <h2>Multi-model deployment (preferred GPU)</h2>
253
+ <div id="multiDeployment"></div>
254
+ </div>
255
+
256
+ <div class="card">
257
+ <h2>Models &amp; shards (from Hub)</h2>
258
+ <div id="modelSummary"></div>
259
+ </div>
260
+
261
+ <div class="card">
262
+ <h2>Memory breakdown</h2>
263
+ <div id="memBreakdown"></div>
264
+ </div>
265
+
266
+ <div class="card">
267
+ <h2>GPU catalog</h2>
268
+ <p class="sub" style="margin:0 0 0.75rem">Click a GPU for full specs. Your <strong>preferred</strong> choice is highlighted for multi-model totals above.</p>
269
+ <div id="gpuGrid" class="gpu-grid"></div>
270
+ <div id="gpuDetailPanel" hidden></div>
271
+ </div>
272
+
273
+ <div class="card">
274
+ <h2>vLLM commands</h2>
275
+ <p id="commandGpuHint" class="hint" style="margin-top:0"></p>
276
+ <pre class="cmd" id="vllmCmd"></pre>
277
+ <p class="hint">Use a different <code>--port</code> per model when running on the same host. Adjust <code>--tensor-parallel-size</code> if your cluster differs. See <a href="https://docs.vllm.ai" style="color:var(--accent)" target="_blank" rel="noopener">vLLM docs</a>.</p>
278
+ </div>
279
+ </div>
280
+ </div>
281
+
282
+ <script>
283
// Base URL of the Hugging Face Hub REST API (model metadata + file trees).
const HF_API = "https://huggingface.co/api";
284
+
285
/**
 * Build the URL path for a Hub repo id (`org/name`).
 * Hugging Face repo ids contain a `/`; encoding the whole id would turn it
 * into `%2F` and break `/api/models/...` (400), so each non-empty path
 * segment is percent-encoded individually and rejoined with `/`.
 */
function hfRepoPath(repoId) {
  const segments = repoId.trim().split("/");
  const encoded = [];
  for (const segment of segments) {
    if (segment) encoded.push(encodeURIComponent(segment));
  }
  return encoded.join("/");
}
294
+
295
/**
 * Normalize user input to a bare `org/name` model id.
 * Accepts pasted browser URLs, e.g.
 * https://huggingface.co/Qwen/Qwen3-30B-A3B → Qwen/Qwen3-30B-A3B.
 * Dataset/Space URLs, non-Hub URLs, and unparseable text pass through unchanged.
 */
function normalizeHfModelInput(raw) {
  const value = String(raw).trim();
  if (!value) return value;
  if (!/^https?:\/\//i.test(value)) return value;
  try {
    const url = new URL(value);
    const host = url.hostname.replace(/^www\./i, "").toLowerCase();
    const isHubHost = host === "huggingface.co" || host === "hf.co";
    if (isHubHost) {
      const segs = url.pathname.split("/").filter(Boolean);
      const first = segs[0];
      // Only model repos: dataset/space URLs keep their original form.
      if (first !== "datasets" && first !== "spaces" && segs.length >= 2) {
        return `${decodeURIComponent(segs[0])}/${decodeURIComponent(segs[1])}`;
      }
    }
  } catch {
    /* unparseable URL — fall through and return the raw text */
  }
  return value;
}
314
+
315
// Static reference table of GPUs used for TP sizing, totals, and the catalog UI.
// `id` is the stable key referenced by the Preferred-GPU dropdown and card clicks;
// vramGb drives all memory math. Bandwidth/TFLOPS/TDP are display-only reference
// values (several entries are explicitly approximate — see their `notes`).
const GPU_CATALOG = [
  { id: "h100-sxm", name: "NVIDIA H100 SXM", vramGb: 80, memBandwidthGbps: 3350, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "Datacenter flagship; best for large TP." },
  { id: "h100-pcie", name: "NVIDIA H100 PCIe", vramGb: 80, memBandwidthGbps: 2000, tdpW: 350, fp16Tflops: 756, pcie: "PCIe 5.0 x16", notes: "Slightly lower BW than SXM." },
  { id: "h200", name: "NVIDIA H200", vramGb: 141, memBandwidthGbps: 4800, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "More HBM than H100." },
  { id: "b200", name: "NVIDIA B200", vramGb: 192, memBandwidthGbps: 8000, tdpW: 1000, fp16Tflops: 2250, pcie: "NVLink / rack", notes: "Blackwell; approximate specs." },
  { id: "a100-80", name: "NVIDIA A100 80GB", vramGb: 80, memBandwidthGbps: 2039, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "Common in clouds." },
  { id: "a100-40", name: "NVIDIA A100 40GB", vramGb: 40, memBandwidthGbps: 1555, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "" },
  { id: "l40s", name: "NVIDIA L40S", vramGb: 48, memBandwidthGbps: 864, tdpW: 350, fp16Tflops: 362, pcie: "PCIe 4.0 x16", notes: "Inference-oriented Ada." },
  { id: "l40", name: "NVIDIA L40", vramGb: 48, memBandwidthGbps: 864, tdpW: 300, fp16Tflops: 181, pcie: "PCIe 4.0 x16", notes: "Legacy Ada datacenter; predecessor to L40S." },
  { id: "a30", name: "NVIDIA A30", vramGb: 24, memBandwidthGbps: 933, tdpW: 165, fp16Tflops: 165, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere; compact inference." },
  { id: "a10", name: "NVIDIA A10", vramGb: 24, memBandwidthGbps: 600, tdpW: 150, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere single-slot cloud GPU." },
  { id: "a10g", name: "NVIDIA A10G", vramGb: 24, memBandwidthGbps: 600, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "A10-class (e.g. AWS G5); ref. specs." },
  { id: "l4", name: "NVIDIA L4", vramGb: 24, memBandwidthGbps: 300, tdpW: 72, fp16Tflops: 120, pcie: "PCIe 4.0 x16", notes: "Legacy Ada low-power inference." },
  { id: "t4", name: "NVIDIA T4", vramGb: 16, memBandwidthGbps: 320, tdpW: 70, fp16Tflops: 65, pcie: "PCIe 3.0 x16", notes: "Legacy Turing inference." },
  { id: "v100-32", name: "NVIDIA V100 32GB", vramGb: 32, memBandwidthGbps: 1134, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta; still common in older clusters." },
  { id: "v100-16", name: "NVIDIA V100 16GB", vramGb: 16, memBandwidthGbps: 900, tdpW: 250, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta 16 GB SKU." },
  { id: "p100-16", name: "NVIDIA P100 16GB", vramGb: 16, memBandwidthGbps: 732, tdpW: 250, fp16Tflops: 19, pcie: "PCIe 3.0", notes: "Legacy Pascal; very dated for LLMs." },
  { id: "a6000", name: "NVIDIA RTX A6000", vramGb: 48, memBandwidthGbps: 768, tdpW: 300, fp16Tflops: 155, pcie: "PCIe 4.0 x16", notes: "Workstation." },
  { id: "3090", name: "NVIDIA GeForce RTX 3090", vramGb: 24, memBandwidthGbps: 936, tdpW: 350, fp16Tflops: 160, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere consumer; 24 GB." },
  { id: "4090", name: "NVIDIA GeForce RTX 4090", vramGb: 24, memBandwidthGbps: 1008, tdpW: 450, fp16Tflops: 330, pcie: "PCIe 4.0 x16", notes: "High BW consumer card." },
  { id: "4080", name: "NVIDIA GeForce RTX 4080", vramGb: 16, memBandwidthGbps: 717, tdpW: 320, fp16Tflops: 195, pcie: "PCIe 4.0 x16", notes: "" },
  { id: "5090", name: "NVIDIA GeForce RTX 5090", vramGb: 32, memBandwidthGbps: 1792, tdpW: 575, fp16Tflops: 420, pcie: "PCIe 5.0 x16", notes: "Approximate consumer flagship." },
  { id: "mi300x", name: "AMD MI300X", vramGb: 192, memBandwidthGbps: 5300, tdpW: 750, fp16Tflops: 1300, pcie: "OAM", notes: "Approximate; check ROCm/vLLM support." },
];
339
+
340
// Build the Authorization header object from the optional HF-token input.
// Returns an empty object when the field is blank, so it can be spread
// unconditionally into fetch() options.
function authHeaders() {
  const t = document.getElementById("hfToken").value.trim();
  return t ? { Authorization: `Bearer ${t}` } : {};
}
344
+
345
/**
 * GET `url` with the user's HF auth header attached.
 * Resolves with the Response on 2xx; throws on any non-OK status.
 */
async function hfFetch(url) {
  const response = await fetch(url, { headers: { ...authHeaders() } });
  if (response.ok) return response;
  throw new Error(`${response.status} ${response.statusText} — ${url}`);
}
350
+
351
/** GET `url` via hfFetch and parse the response body as JSON. */
async function hfJson(url) {
  return (await hfFetch(url)).json();
}
355
+
356
/** GET `url` via hfFetch and return the response body as text. */
async function hfText(url) {
  return (await hfFetch(url)).text();
}
360
+
361
/**
 * Sum the sizes of weight files from the Hub tree API listing.
 * Keeps `.safetensors` shards and pytorch `.bin` shards, skips training
 * artifacts (optimizer/scheduler/tf/flax/rust checkpoints), and returns the
 * shards sorted largest-first plus total and max shard sizes in bytes.
 */
function analyzeTreeFiles(tree) {
  const skipMarkers = ["training_args", "optimizer", "scheduler", "tf_model", "flax_model", "rust_model"];

  // Predicate: does this tree entry look like a model weight file?
  const isWeightFile = (entry) => {
    if (entry.type !== "blob" && entry.type !== "file") return false;
    if (typeof entry.size !== "number") return false;
    const lower = entry.path.toLowerCase();
    for (const marker of skipMarkers) {
      if (lower.includes(marker)) return false;
    }
    if (lower.endsWith(".safetensors")) return true;
    if (!lower.endsWith(".bin")) return false;
    return (
      lower.endsWith("pytorch_model.bin") ||
      /model-\d+-of-\d+\.bin$/.test(lower) ||
      lower.includes("pytorch_model-")
    );
  };

  let totalBytes = 0;
  const shards = [];
  for (const entry of tree) {
    if (!isWeightFile(entry)) continue;
    totalBytes += entry.size;
    shards.push({ path: entry.path, sizeBytes: entry.size, sizeGb: entry.size / 1e9 });
  }
  shards.sort((a, b) => b.sizeBytes - a.sizeBytes);
  const maxShardBytes = shards.length > 0 ? shards[0].sizeBytes : 0;
  return { files: shards, totalBytes, maxShardBytes };
}
384
+
385
/** Parse config.json text; returns null instead of throwing on invalid JSON. */
function parseConfigJson(text) {
  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
392
+
393
/**
 * Rough parameter count for a Llama-like transformer config (heuristic).
 * Counts token embeddings, attention (Q/O full-width, K/V sized by KV heads),
 * a gated 3-matrix MLP, layer norms, and an untied LM head. Honors an explicit
 * `num_parameters` field when present; returns null when required fields
 * are missing.
 */
function estimateParamsFromConfig(cfg) {
  if (!cfg) return null;
  if (typeof cfg.num_parameters === "number") return cfg.num_parameters;
  const hidden = cfg.hidden_size;
  const layers = cfg.num_hidden_layers;
  const vocab = cfg.vocab_size;
  const inter = cfg.intermediate_size;
  const heads = cfg.num_attention_heads;
  const kvHeads = cfg.num_key_value_heads ?? heads;
  if (!hidden || !layers || !vocab || !inter || !heads) return null;
  const headDim = hidden / heads;
  const embedParams = vocab * hidden;
  // Q and O projections are hidden×hidden; K and V shrink with GQA/MQA.
  const attnPerLayer = 2 * hidden * hidden + 2 * kvHeads * headDim * hidden;
  const mlpPerLayer = 3 * hidden * inter;
  const normParams = 2 * hidden * layers * 2;
  const lmHead = hidden * vocab;
  return embedParams + layers * (attnPerLayer + mlpPerLayer) + normParams + lmHead;
}
411
+ }
412
+
413
/**
 * KV-cache bytes needed per token, summed over all layers:
 * 2 tensors (K and V) × num_kv_heads × head_dim × bytes-per-element × layers.
 * Returns 0 when the config is missing or lacks the attention fields.
 */
function kvBytesPerToken(cfg, kvBytesPerEl) {
  if (!cfg) return 0;
  const hidden = cfg.hidden_size;
  const layers = cfg.num_hidden_layers;
  const heads = cfg.num_attention_heads;
  if (!hidden || !layers || !heads) return 0;
  const kvHeads = cfg.num_key_value_heads ?? heads;
  const headDim = hidden / heads;
  return layers * 2 * kvHeads * headDim * kvBytesPerEl;
}
427
+
428
/** Bytes of VRAM per weight parameter: 1 for fp8, otherwise 2 (bf16/fp16). */
function bytesPerParamWeight(dtype) {
  if (dtype === "fp8") return 1;
  return 2;
}
431
+
432
/** VRAM the scheduler may actually use: raw capacity scaled by the target utilization fraction. */
function usableVramGb(vramGb, util) {
  const usable = vramGb * util;
  return usable;
}
435
+
436
/**
 * Minimum tensor-parallel size so weights + KV cache fit in VRAM.
 * With TP, both are sharded across ranks: per-GPU ≈ (weight + kv) / tp,
 * hence tp ≥ ceil((weight + kv) / usable). Returns Infinity when a GPU has
 * no usable VRAM; never returns less than 1.
 */
function minTpForWeightsAndKv(totalWeightGb, kvTotalGb, usablePerGpuGb) {
  if (usablePerGpuGb <= 0) return Infinity;
  const demandGb = totalWeightGb + kvTotalGb;
  const ranksNeeded = Math.ceil(demandGb / usablePerGpuGb);
  return Math.max(1, ranksNeeded);
}
446
+
447
/**
 * Minimum TP so the largest on-disk shard fits on one GPU.
 * Returns 1 when no shard size is known (falsy/zero), Infinity when the GPU
 * has no usable VRAM. Load-time peak can differ by loader; this is a floor.
 */
function minTpForLargestShard(maxShardGb, usablePerGpuGb) {
  if (!maxShardGb || maxShardGb <= 0) return 1;
  if (usablePerGpuGb > 0) {
    return Math.max(1, Math.ceil(maxShardGb / usablePerGpuGb));
  }
  return Infinity;
}
452
+
453
/**
 * Fill and reveal the #gpuDetailPanel card with the full spec sheet for one
 * catalog GPU. `gpu` is an entry from the static GPU_CATALOG (trusted data,
 * not user input, so the innerHTML interpolation here is safe).
 */
function renderGpuDetail(gpu) {
  const el = document.getElementById("gpuDetailPanel");
  el.hidden = false;
  el.innerHTML = `
    <div class="gpu-detail">
      <strong style="color:var(--accent)">${gpu.name}</strong>
      <dl style="margin-top:0.75rem">
        <dt>VRAM</dt><dd>${gpu.vramGb} GB</dd>
        <dt>Memory bandwidth (ref.)</dt><dd>~${gpu.memBandwidthGbps} GB/s</dd>
        <dt>FP16 TFLOPS (ref.)</dt><dd>~${gpu.fp16Tflops}</dd>
        <dt>TDP (ref.)</dt><dd>${gpu.tdpW} W</dd>
        <dt>PCIe</dt><dd>${gpu.pcie}</dd>
        <dt>Notes</dt><dd>${gpu.notes || "—"}</dd>
      </dl>
      <p class="hint" style="margin-bottom:0">Published specs vary by SKU and firmware; use vendor datasheets for procurement.</p>
    </div>
  `;
}
471
+
472
// Id of the GPU card the user last clicked in the catalog; overrides the
// Preferred dropdown when generating commands. null until a card is clicked.
let selectedGpuId = null;
// Result of the last successful fetch, reused to re-render when a setting
// changes without re-hitting the Hub.
/** @type {{ models: object[] } | null} */
let lastFetchCtx = null;
// Monotonic counter used to mint unique model-row ids (`mr-<n>`).
let rowIdSeq = 0;
476
+
477
/**
 * Populate the Preferred-GPU <select> from GPU_CATALOG and default it to the
 * H100 SXM entry. Idempotent: bails out if the select is missing or already
 * has options, so it can be called on every render.
 */
function populatePreferredGpuSelect() {
  const sel = document.getElementById("preferredGpu");
  if (!sel || sel.options.length) return; // already populated — do nothing
  GPU_CATALOG.forEach((g) => {
    const o = document.createElement("option");
    o.value = g.id;
    o.textContent = `${g.name} (${g.vramGb} GB)`;
    sel.appendChild(o);
  });
  sel.value = "h100-sxm";
}
488
+
489
/**
 * Collect all model-row input values, normalize pasted URLs to `org/name`
 * ids, and drop empty rows. Returns an array of model ids ready to fetch.
 */
function getModelIdsFromInputs() {
  return Array.from(document.querySelectorAll(".model-id-input"))
    .map((el) => normalizeHfModelInput(el.value.trim()))
    .filter(Boolean);
}
494
+
495
/**
 * Rewrite each model input in place with its normalized `org/name` form so
 * the user sees exactly what will be fetched. Leaves inputs untouched when
 * normalization produced nothing or changed nothing.
 */
function syncInputValuesFromNormalized() {
  const inputs = document.querySelectorAll(".model-id-input");
  inputs.forEach((el) => {
    const n = normalizeHfModelInput(el.value.trim());
    if (n && n !== el.value.trim()) el.value = n;
  });
}
502
+
503
/**
 * Append one model-input row (text input + Remove button) to
 * #modelListContainer and wire up its Remove handler.
 * @param {string} initial - optional model id to prefill the input with.
 */
function addModelRow(initial = "") {
  const container = document.getElementById("modelListContainer");
  const id = `mr-${++rowIdSeq}`;
  const wrap = document.createElement("div");
  wrap.className = "model-row";
  wrap.dataset.rowId = id;
  wrap.innerHTML = `
    <div>
      <label class="model-row-label" style="font-size:0.8rem;color:var(--muted)">Model id or URL</label>
      <input type="text" class="model-id-input" placeholder="org/model or https://huggingface.co/…" autocomplete="off" />
    </div>
    <button type="button" class="btn-ghost danger btn-remove-model" title="Remove">Remove</button>`;
  // Prefill via .value (not interpolated into the HTML) so arbitrary text
  // cannot inject markup.
  wrap.querySelector(".model-id-input").value = initial;
  container.appendChild(wrap);
  wrap.querySelector(".btn-remove-model").addEventListener("click", () => {
    if (document.querySelectorAll(".model-row").length <= 1) return; // always keep at least one row
    wrap.remove();
  });
}
522
+
523
/**
 * Fetch everything needed to size one model from the Hub:
 * repo metadata, the recursive file tree (weight-shard sizes), config.json
 * (for KV/param estimates), and — as a fallback when the tree lists no
 * weight files — the weight index file's `metadata.total_size`.
 * Optional parts degrade to null on failure; only the metadata/tree requests
 * can throw (e.g. 401 on gated repos without a token).
 * @param {string} modelId - `org/name` repo id (already normalized).
 * @returns {Promise<object>} ctx object consumed by metricsForCtx and the renderers.
 */
async function fetchOneModel(modelId) {
  const meta = await hfJson(`${HF_API}/models/${hfRepoPath(modelId)}`);
  // Pin all follow-up requests to the same revision the metadata reported.
  const ref = meta.sha || "main";
  const treeUrl = `${HF_API}/models/${hfRepoPath(modelId)}/tree/${encodeURIComponent(ref)}?recursive=true`;
  const tree = await hfJson(treeUrl);
  const analysis = analyzeTreeFiles(Array.isArray(tree) ? tree : []);

  let config = null;
  try {
    const cfgUrl = `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/config.json`;
    const cfgText = await hfText(cfgUrl);
    config = parseConfigJson(cfgText);
  } catch {
    config = null; // config.json missing or unreadable — estimates degrade gracefully
  }

  // Optional: total weight size from the shard index file (safetensors first,
  // then the legacy pytorch index). First candidate with metadata wins.
  let indexMeta = null;
  try {
    const idxCandidates = [
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/model.safetensors.index.json`,
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/pytorch_model.bin.index.json`,
    ];
    for (const u of idxCandidates) {
      try {
        const j = await hfJson(u);
        if (j.metadata && j.metadata.total_size != null) {
          indexMeta = j.metadata;
          break;
        }
      } catch { /* try next */ }
    }
  } catch { /* optional */ }
  const totalBytesFromIndex = indexMeta && indexMeta.total_size ? Number(indexMeta.total_size) : null;

  // Prefer sizes from the tree listing; use the index total only when the
  // tree produced nothing.
  const totalBytes = analysis.totalBytes > 0 ? analysis.totalBytes : totalBytesFromIndex;
  const totalGbDisk = totalBytes != null ? totalBytes / 1e9 : null;
  // When no per-shard sizes are known, fall back to the full size (worst case).
  const maxShardGb = analysis.maxShardBytes > 0 ? analysis.maxShardBytes / 1e9 : (totalGbDisk || 0);
  const estParams = estimateParamsFromConfig(config);

  return { modelId, meta, analysis, config, totalGbDisk, maxShardGb, estParams };
}
564
+
565
/**
 * Derive per-model memory metrics from the fetched ctx plus the current form
 * settings (weight dtype, KV dtype, max length, concurrency hint).
 * Returns { weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint }.
 */
function metricsForCtx(ctx) {
  const weightDtype = document.getElementById("weightDtype").value;
  const bPerParam = bytesPerParamWeight(weightDtype);
  const weightGbFromParams = ctx.estParams != null ? (ctx.estParams * bPerParam) / 1e9 : null;
  // Scales the on-disk size by bPerParam/2 — assumes checkpoints on disk are
  // 16-bit weights. NOTE(review): this over/under-estimates for repos stored
  // in fp8/int4 — confirm for quantized checkpoints.
  const weightGb = ctx.totalGbDisk != null ? ctx.totalGbDisk * (bPerParam / 2) : weightGbFromParams;

  const kvSel = document.getElementById("kvDtype").value;
  // "auto" is costed like fp16 (2 bytes/element); only "fp8" halves it.
  const kvBytesPerEl = kvSel === "fp8" ? 1 : 2;
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);
  const batchHint = Math.max(1, parseInt(document.getElementById("batchHint").value, 10) || 1);

  const kvPerToken = kvBytesPerToken(ctx.config, kvBytesPerEl);
  // KV budget = per-token bytes × context length × concurrent sequences.
  const kvTotalGb = (kvPerToken * maxLen * batchHint) / 1e9;
  return { weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint };
}
580
+
581
/**
 * Tensor-parallel size for one model on one GPU type: the larger of the
 * memory-driven TP (weights + KV must fit across ranks) and the shard-driven
 * TP (largest on-disk shard must fit on one rank).
 * Returns null when the weight size is unknown.
 */
function tpForModelOnGpu(ctx, weightGb, kvTotalGb, gpu, util) {
  if (weightGb == null) return null;
  const usable = usableVramGb(gpu.vramGb, util);
  const byMemory = minTpForWeightsAndKv(weightGb, kvTotalGb, usable);
  const byShard = minTpForLargestShard(ctx.maxShardGb, usable);
  return Math.max(byMemory, byShard);
}
588
+
589
/**
 * GPU used for the generated vLLM commands: a clicked catalog card
 * (selectedGpuId) overrides the Preferred dropdown; falls back to the
 * dropdown choice, then to the first catalog entry.
 */
function gpuForCommands() {
  const prefId = document.getElementById("preferredGpu").value;
  if (selectedGpuId) {
    const g = GPU_CATALOG.find((x) => x.id === selectedGpuId);
    if (g) return g;
  }
  return GPU_CATALOG.find((x) => x.id === prefId) || GPU_CATALOG[0];
}
598
+
599
/**
 * Render one `vllm serve` command block per model into #vllmCmd, sized for
 * the command-target GPU (clicked card or Preferred dropdown), plus a summary
 * hint of which GPU drove the numbers. No-op when there are no models or the
 * output element is missing.
 * @param {object[]} models - array of fetched model ctx objects.
 */
function renderVllmCommands(models) {
  const hintEl = document.getElementById("commandGpuHint");
  const cmdEl = document.getElementById("vllmCmd");
  if (!models || !models.length || !cmdEl) return;

  // Clamp utilization to the same [0.5, 0.98] range the input enforces.
  const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9));
  const cmdGpu = gpuForCommands();
  const usableCmd = usableVramGb(cmdGpu.vramGb, util);

  // NOTE(review): vLLM's --kv-cache-dtype typically accepts auto / fp8 /
  // fp8_e5m2 / fp8_e4m3; plain "fp16" may be rejected by some versions —
  // confirm against the installed vLLM release.
  const kvFlag =
    document.getElementById("kvDtype").value === "fp8"
      ? "fp8_e5m2"
      : document.getElementById("kvDtype").value === "fp16"
        ? "fp16"
        : "auto";
  const dtypeFlag = document.getElementById("weightDtype").value === "fp8" ? "float8_e4m3fn" : "bfloat16";
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);

  // One block per model; total GPU count is the sum of each model's TP size
  // (separate server processes, disjoint GPU sets).
  const blocks = [];
  let totalCmd = 0;
  models.forEach((ctx) => {
    const m = metricsForCtx(ctx);
    const tp = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, cmdGpu, util);
    const tpUse = typeof tp === "number" && !Number.isNaN(tp) ? tp : 1;
    totalCmd += tpUse;
    blocks.push({ ctx, tpUse });
  });

  const lines = [
    `# Total GPUs (separate vLLM servers, ${cmdGpu.name}): ${totalCmd}`,
    `# ~${usableCmd.toFixed(1)} GB usable per GPU @ ${(util * 100).toFixed(0)}% of ${cmdGpu.vramGb} GB VRAM`,
    `# Assign disjoint CUDA_VISIBLE_DEVICES per server on the same host.`,
    "",
  ];
  blocks.forEach((b, i) => {
    const port = 8000 + i; // unique port per server on the same host
    lines.push(
      `# --- ${b.ctx.modelId} ---`,
      `vllm serve "${b.ctx.modelId}" \\`,
      ` --dtype ${dtypeFlag} \\`,
      ` --tensor-parallel-size ${b.tpUse} \\`,
      ` --max-model-len ${maxLen} \\`,
      ` --gpu-memory-utilization ${util} \\`,
      ` --kv-cache-dtype ${kvFlag} \\`,
      ` --port ${port}`,
      ""
    );
  });
  cmdEl.textContent = lines.join("\n").trimEnd();

  if (hintEl) {
    const src = selectedGpuId ? "GPU catalog (clicked card)" : "Preferred GPU dropdown";
    hintEl.textContent = `Tensor parallelism in the commands below uses ${cmdGpu.name} (~${usableCmd.toFixed(1)} GB usable per GPU). Source: ${src}. Click a GPU card to override the dropdown; change the dropdown to clear the override.`;
  }
}
654
+
655
/**
 * Rebuild the clickable GPU-catalog grid. Each card shows usable VRAM, how
 * many of the largest shard fit, and (when available) the max TP any model
 * needs on that GPU. Clicking a card selects it as the command-target GPU,
 * shows its detail panel, and re-runs the multi-model computation.
 * @param {object} state - precomputed per-GPU figures:
 *   usablePerGpuByGpu/shardsFit/tp keyed by gpu id, plus util and preferredGpuId.
 */
function buildGpuGrid(state) {
  const grid = document.getElementById("gpuGrid");
  grid.innerHTML = ""; // full rebuild on every render
  const { usablePerGpuByGpu, shardsFit, tp, util, preferredGpuId } = state;
  const cmdGpuId = gpuForCommands().id;

  GPU_CATALOG.forEach((gpu) => {
    const usable = usablePerGpuByGpu[gpu.id];
    const fit = shardsFit[gpu.id];
    const isPref = preferredGpuId && gpu.id === preferredGpuId;
    const isCmdTarget = gpu.id === cmdGpuId;
    const card = document.createElement("div");
    card.className =
      "gpu-card" +
      (selectedGpuId === gpu.id ? " selected" : "") +
      (isPref ? " preferred" : "") +
      (isCmdTarget ? " commands-target" : "");
    // Static catalog data only — not user input — so innerHTML is safe here.
    card.innerHTML = `
      <div class="name">${isPref ? '<span style="float:right;font-size:0.65rem;color:var(--accent2);text-transform:uppercase">preferred</span>' : ""}${gpu.name}</div>
      <div class="vram">${gpu.vramGb} GB VRAM · ~${usable.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</div>
      <div style="margin-top:0.4rem;font-size:0.78rem;color:var(--muted)">
        Shards fit (largest shard across models) / GPU: <strong style="color:var(--text)">${fit}</strong>
        ${tp[gpu.id] != null ? ` · max TP any model: <strong style="color:var(--good)">${tp[gpu.id]}</strong>` : ""}
      </div>
    `;
    card.addEventListener("click", () => {
      selectedGpuId = gpu.id; // override the Preferred dropdown for commands
      document.querySelectorAll(".gpu-card").forEach((c) => c.classList.remove("selected"));
      card.classList.add("selected");
      renderGpuDetail(gpu);
      // Re-render totals/commands with the newly selected command-target GPU.
      if (lastFetchCtx && lastFetchCtx.models) computeAndRenderMulti(lastFetchCtx.models);
    });
    grid.appendChild(card);
  });
}
690
+
691
/**
 * Recompute sizing estimates for every fetched model and re-render all
 * results panels: per-model summary tables, the memory breakdown, the GPU
 * catalog grid, the multi-model deployment table, and the vLLM commands.
 * @param {object[]} models — array of Hub fetch ctx (one per model, as
 *   produced by fetchOneModel)
 */
function computeAndRenderMulti(models) {
  // GPU memory-utilization fraction, clamped to [0.5, 0.98]; falls back to
  // 0.9 when the input field does not parse as a number.
  const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9));
  const preferredGpuId = document.getElementById("preferredGpu").value;
  // Fall back to the first catalog entry when the dropdown value is unknown.
  const prefGpu = GPU_CATALOG.find((g) => g.id === preferredGpuId) || GPU_CATALOG[0];

  // For each model: memory metrics plus the minimum tensor-parallel degree
  // on the preferred GPU and the resulting per-GPU memory at that degree.
  const perModel = models.map((ctx) => {
    const m = metricsForCtx(ctx);
    const tpPref = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, prefGpu, util);
    const perGpuPref =
      m.weightGb != null && tpPref != null ? (m.weightGb + m.kvTotalGb) / tpPref : null;
    return { ctx, ...m, tpOnPreferred: tpPref, perGpuOnPreferred: perGpuPref };
  });

  // Largest single weight shard across all models (0 when unknown) —
  // used below to compute how many shards fit on each catalog GPU.
  const maxShardAll = Math.max(0, ...models.map((c) => c.maxShardGb || 0));

  // Build per-model summary (<details>) and memory-breakdown HTML.
  let summaryHtml = "";
  let memHtml = "";
  perModel.forEach((row, idx) => {
    const { ctx, weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint } = row;
    // Show at most 12 shard rows; collapse the remainder into one "… N more" row.
    const shardRows = ctx.analysis.files.slice(0, 12).map((f) =>
      `<tr><td>${escapeHtml(f.path)}</td><td>${f.sizeGb.toFixed(2)}</td></tr>`
    ).join("");
    const moreShards =
      ctx.analysis.files.length > 12
        ? `<tr><td colspan="2">… ${ctx.analysis.files.length - 12} more</td></tr>`
        : "";

    // Only the first model's <details> starts expanded.
    summaryHtml += `
      <details class="model-block" ${idx === 0 ? "open" : ""}>
        <summary>${escapeHtml(ctx.modelId)}</summary>
        <p style="margin:0.5rem 0;font-size:0.85rem;color:var(--muted)">${escapeHtml(ctx.meta.pipeline_tag || ctx.meta.library_name || "model")}</p>
        <table>
          <tr><th>Metric</th><th>Value</th></tr>
          <tr><td>Weight files total</td><td>${ctx.totalGbDisk != null ? ctx.totalGbDisk.toFixed(2) + " GB" : "unknown"}</td></tr>
          <tr><td>Largest shard</td><td>${ctx.maxShardGb > 0 ? ctx.maxShardGb.toFixed(2) + " GB" : "—"}</td></tr>
          <tr><td>Est. weight (${weightDtype})</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr>
        </table>
        ${ctx.analysis.files.length ? `<table style="margin-top:0.5rem"><tr><th>File</th><th>GB</th></tr>${shardRows}${moreShards}</table>` : ""}
      </details>`;

    memHtml += `
      <h3 style="font-size:0.9rem;margin:0.75rem 0 0.4rem;color:var(--accent)">${escapeHtml(ctx.modelId)}</h3>
      <table>
        <tr><th>Component</th><th>Estimate</th></tr>
        <tr><td>Weights</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr>
        <tr><td>KV (${kvSel}, ${maxLen} × ${batchHint} seqs)</td><td>${kvTotalGb.toFixed(3)} GB</td></tr>
        <tr><td>KV / token</td><td>${(kvPerToken / 1024).toFixed(2)} KiB</td></tr>
      </table>`;
  });

  document.getElementById("modelSummary").innerHTML = summaryHtml || "<p class='hint'>No models.</p>";
  document.getElementById("memBreakdown").innerHTML =
    memHtml + `<p class="hint">KV is a planning upper bound; vLLM paging changes real usage.</p>`;

  // Per-catalog-GPU stats fed into the GPU grid: usable VRAM, how many of
  // the largest shard fit, and the highest per-model minimum TP.
  const usablePerGpuByGpu = {};
  const shardsFit = {};
  const minTp = {};
  for (const gpu of GPU_CATALOG) {
    const usable = usableVramGb(gpu.vramGb, util);
    usablePerGpuByGpu[gpu.id] = usable;
    shardsFit[gpu.id] =
      maxShardAll > 0 ? Math.floor(usable / maxShardAll) : 0;
    // Maximum over models of each model's required TP on this GPU
    // (the grid renders it as "max TP any model").
    let maxTp = 0;
    for (const row of perModel) {
      if (row.weightGb == null) continue;
      const t = tpForModelOnGpu(row.ctx, row.weightGb, row.kvTotalGb, gpu, util);
      if (t != null && t > maxTp) maxTp = t;
    }
    // `maxTp || null` maps 0 ("no model produced a TP") to null so the
    // grid can omit the stat entirely.
    minTp[gpu.id] = maxTp || null;
  }

  buildGpuGrid({ util, usablePerGpuByGpu, shardsFit, tp: minTp, preferredGpuId });

  // Total accelerators if every model runs its own dedicated TP group on
  // the preferred GPU (non-numeric/NaN TP values contribute 0).
  const totalGpusSeparate = perModel.reduce(
    (s, r) => s + (typeof r.tpOnPreferred === "number" && !Number.isNaN(r.tpOnPreferred) ? r.tpOnPreferred : 0),
    0
  );
  const sumMemOneGpu = perModel.reduce((s, r) => s + (r.weightGb || 0) + r.kvTotalGb, 0);
  const usablePref = usableVramGb(prefGpu.vramGb, util);
  // Co-locating all models on one GPU requires: combined weights+KV fit in
  // usable VRAM AND every model's minimum TP is 1 on that GPU.
  const eachTpOne = perModel.every((r) => r.tpOnPreferred === 1);
  const fitsAllOnSingleGpu = sumMemOneGpu <= usablePref && eachTpOne;

  let multiHtml = `
    <p class="hint" style="margin-bottom:0.75rem">This table uses the <strong>Preferred GPU</strong> dropdown only. The <strong>vLLM commands</strong> section uses that same GPU until you click a GPU in the catalog — then commands switch to the clicked GPU (dashed outline). Changing the dropdown clears the click override.</p>
    <p><strong>Preferred GPU:</strong> ${escapeHtml(prefGpu.name)} — ~${usablePref.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</p>
    <table>
      <tr><th>Model</th><th>Weights+KV (est.)</th><th>Min TP on preferred</th><th>GPUs (dedicated group)</th></tr>
      ${perModel
        .map((r) => {
          const sum = r.weightGb != null ? r.weightGb + r.kvTotalGb : r.kvTotalGb;
          const tp = r.tpOnPreferred ?? "—";
          const gpus = r.tpOnPreferred ?? "—";
          return `<tr>
            <td>${escapeHtml(r.ctx.modelId)}</td>
            <td>${sum.toFixed(2)} GB</td>
            <td>${tp}</td>
            <td>${gpus}</td>
          </tr>`;
        })
        .join("")}
      <tr style="font-weight:600;border-top:2px solid var(--border)">
        <td>Total (separate instances)</td>
        <td>—</td>
        <td>—</td>
        <td>${totalGpusSeparate || "—"} GPUs</td>
      </tr>
    </table>
    <p class="hint" style="margin-top:0.75rem">
      <strong>Separate instances:</strong> each model uses its own tensor-parallel group; total accelerator count ≈ <strong>${totalGpusSeparate}</strong> × ${escapeHtml(prefGpu.name)} (no GPU sharing between models).
    </p>
    <p class="hint">
      <strong>Single GPU, multiple models:</strong> needs sum(weights+KV) ≤ usable VRAM on one GPU <em>and</em> each model’s min TP = 1 on that GPU.
      Here sum ≈ <strong>${sumMemOneGpu.toFixed(2)} GB</strong> vs <strong>${usablePref.toFixed(2)} GB</strong> usable —
      ${fitsAllOnSingleGpu ? '<span style="color:var(--good)">may fit in theory (still not recommended for large models — VRAM fragmentation &amp; two processes).</span>' : '<span style="color:#f87171">does not fit on one GPU of this type at current settings.</span>'}
    </p>
    <p class="hint"><strong>Max configuration on preferred GPU:</strong> at these dtype / max-model-len / batch settings, the table above is the minimum TP per model; you cannot lower TP without reducing context, batch, quantization, or choosing a larger GPU.</p>
  `;
  document.getElementById("multiDeployment").innerHTML = multiHtml;

  // Commands panel depends on the (possibly click-overridden) GPU choice.
  renderVllmCommands(models);
}
816
/**
 * Recompute all panels from the cached fetch results, but only when the
 * results are on screen and the model-id inputs still match the cache
 * exactly (same ids, same order). Otherwise do nothing — a fresh fetch
 * is required.
 */
function tryRecomputeFromCache() {
  if (!lastFetchCtx || !lastFetchCtx.models) return;
  if (document.getElementById("results").hidden) return;
  const currentIds = getModelIdsFromInputs();
  const cachedIds = lastFetchCtx.models.map((m) => m.modelId);
  const unchanged =
    currentIds.length === cachedIds.length &&
    cachedIds.every((id, i) => id === currentIds[i]);
  if (unchanged) computeAndRenderMulti(lastFetchCtx.models);
}
824
// "Add model" button: append another model-id input row to the form.
document
  .getElementById("btnAddModel")
  .addEventListener("click", () => addModelRow());
826
// "Fetch all & compute": load Hub metadata for every entered model id,
// then compute and render all results panels. Models are fetched
// sequentially; per-model failures are collected so one bad id does not
// abort the whole batch.
document.getElementById("btnFetch").addEventListener("click", async () => {
  syncInputValuesFromNormalized();
  const ids = getModelIdsFromInputs();
  const errEl = document.getElementById("fetchError");
  const results = document.getElementById("results");
  // Hide stale error/result panels while the new fetch is in flight.
  errEl.hidden = true;
  results.hidden = true;

  if (ids.length === 0) {
    errEl.textContent = "Add at least one Hugging Face model id or URL.";
    errEl.hidden = false;
    return;
  }

  // Disable the button and show a spinner until the fetch settles.
  const btn = document.getElementById("btnFetch");
  btn.disabled = true;
  btn.innerHTML = '<span class="spinner"></span>Loading…';

  try {
    const models = [];
    const errors = [];
    // Sequential fetch, one model at a time; record each failure keyed
    // by its model id and keep going.
    for (let i = 0; i < ids.length; i++) {
      try {
        models.push(await fetchOneModel(ids[i]));
      } catch (e) {
        errors.push(`${ids[i]}: ${e.message || e}`);
      }
    }
    // Every model failed: show the errors, drop the cache, stop.
    if (errors.length && models.length === 0) {
      errEl.textContent = errors.join("\n");
      errEl.hidden = false;
      lastFetchCtx = null;
      return;
    }
    // Partial failure: warn, but continue with the models that loaded.
    if (errors.length) {
      errEl.textContent = "Some models failed:\n" + errors.join("\n");
      errEl.hidden = false;
    }

    // Cache successful fetches so settings changes can recompute without
    // re-hitting the Hub; reset any previously click-selected GPU.
    lastFetchCtx = { models };
    document.getElementById("gpuDetailPanel").hidden = true;
    selectedGpuId = null;
    computeAndRenderMulti(models);

    results.hidden = false;
  } catch (e) {
    // Unexpected failure outside the per-model loop.
    errEl.textContent = e.message || String(e);
    errEl.hidden = false;
    lastFetchCtx = null;
  } finally {
    // Always restore the button, success or failure.
    btn.disabled = false;
    btn.textContent = "Fetch all & compute";
  }
});
881
/**
 * Escape a value for safe interpolation into HTML.
 *
 * Replaces the DOM textContent/innerHTML trick, which did not escape
 * quote characters (unsafe if the result is ever placed inside an
 * attribute value), created a throwaway DOM element per call, and turned
 * `undefined` into the literal string "undefined".
 *
 * @param {unknown} s - value to escape; null/undefined yield "".
 * @returns {string} text with &, <, >, " and ' replaced by entities
 */
function escapeHtml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // `& ` must be in the class so already-escaped text stays escaped once.
  return String(s ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
887
/**
 * Build a debounced wrapper around `fn`: each call resets a timer, and
 * `fn` fires once with the most recent arguments after `ms` milliseconds
 * of silence.
 * @param {Function} fn - function to debounce
 * @param {number} ms - quiet period in milliseconds
 * @returns {Function} debounced wrapper
 */
function debounce(fn, ms) {
  let pendingTimer = null;
  return function debounced(...callArgs) {
    if (pendingTimer !== null) clearTimeout(pendingTimer);
    pendingTimer = setTimeout(function fire() {
      pendingTimer = null;
      fn(...callArgs);
    }, ms);
  };
}
895
// Debounced recompute (350 ms quiet period) so rapid keystrokes in the
// numeric inputs trigger at most one recompute per pause.
const debouncedRecompute = debounce(tryRecomputeFromCache, 350);
897
// Wire the tuning controls: every control recomputes on "change"; numeric
// fields additionally recompute (debounced) on "input" while typing.
for (const controlId of ["maxModelLen", "batchHint", "gpuUtil", "weightDtype", "kvDtype"]) {
  const control = document.getElementById(controlId);
  if (!control) continue;
  control.addEventListener("change", tryRecomputeFromCache);
  if (control.type === "number") control.addEventListener("input", debouncedRecompute);
}
904
// Changing the preferred-GPU dropdown discards any click-selected GPU
// override in the catalog, then recomputes from the cached fetch results.
const preferredGpuSelect = document.getElementById("preferredGpu");
preferredGpuSelect.addEventListener("change", () => {
  selectedGpuId = null;
  tryRecomputeFromCache();
});
909
+ populatePreferredGpuSelect();
910
+ addModelRow();
911
  </script>
  </body>
</html>