"""CanIRunThisLLM — app.py

Check whether a given GPU (or Apple Silicon machine) can run, full fine-tune,
or LoRA/QLoRA-train a HuggingFace model.
"""
import re
import math
import requests
import gradio as gr
from functools import lru_cache
import json
import time
# ──────────────────────────────────────────────────────────────
# 0. LLMCHECK BENCHMARKS (Apple Silicon performance data)
# ──────────────────────────────────────────────────────────────
LLMCHECK_URL = "https://llmcheck.net/data/benchmarks.json"
LLMCHECK_CACHE_TTL = 3600 # 1 hour cache
# Chip performance scaling factors (derived from llmcheck 8B Q4_K_M benchmarks)
CHIP_SCALE_FACTORS = {
"M1": 1.00,
"M2": 1.34,
"M3": 1.72,
"M3 Pro": 1.43,
"M3 Max": 2.50, # extrapolated
"M4": 1.72,
"M4 Pro": 2.18,
"M4 Max": 2.87,
"M4 Ultra": 3.50, # extrapolated from bandwidth ratio
"M5": 2.00, # extrapolated
"M5 Pro": 2.19,
"M5 Max": 2.63,
"M5 Ultra": 3.80, # extrapolated
}
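# Worked example of the cross-chip scaling used below (illustrative numbers):
# a model measured at 60 tok/s on an M5 Max (factor 2.63) is estimated on an
# M4 Pro (2.18) at 60 × 2.18 / 2.63 ≈ 50 tok/s; estimate_apple_silicon_tps()
# additionally scales by the params ratio between benchmark and target model.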
_llmcheck_cache = {"data": None, "timestamp": 0}
def fetch_llmcheck_benchmarks():
"""Fetch llmcheck benchmark data with caching."""
now = time.time()
if _llmcheck_cache["data"] and (now - _llmcheck_cache["timestamp"]) < LLMCHECK_CACHE_TTL:
return _llmcheck_cache["data"]
try:
r = requests.get(LLMCHECK_URL, timeout=15)
if r.status_code == 200:
data = r.json()
_llmcheck_cache["data"] = data
_llmcheck_cache["timestamp"] = now
return data
except Exception as e:
print(f"[WARN] llmcheck fetch failed: {e}")
return None
def parse_apple_silicon_chip(gpu_name: str) -> str | None:
    """Extract the chip name from a GPU dropdown label; None for non-Apple GPUs."""
    # Example: "Apple M5 Max (128 GB)" -> "M5 Max"
if not gpu_name.startswith("Apple"):
return None
match = re.match(r"Apple (M\d(?:\s+\w+)?).*", gpu_name)
if match:
return match.group(1)
return None
def estimate_apple_silicon_tps(params_b: float, chip: str, quant: str = "Q4_K_M") -> dict:
"""
Estimate inference speed (tok/s) for Apple Silicon.
Returns dict with tps, ttft, source, and confidence.
"""
result = {"tps": None, "ttft": None, "source": None, "confidence": "unknown", "model_used": None}
llmcheck = fetch_llmcheck_benchmarks()
if not llmcheck:
return result
benchmarks = llmcheck.get("benchmarks", [])
# Normalize quant key
quant_map = {
"Q4_K_M (best)": "Q4_K_M",
"Q4_K_S": "Q4_K_M",
"Q4_0 / NF4": "Q4_K_M",
"Q5_K_M": "Q4_K_M", # approximate
"Q6_K": "Q4_K_M", # approximate
"INT8 / Q8_0": "Q8_0",
"Q8_0": "Q8_0",
}
llmcheck_quant = quant_map.get(quant, "Q4_K_M")
# 1. Direct match: find exact model params on exact chip
for b in benchmarks:
b_params = float(re.sub(r'[^\d.]', '', b["params"]))
b_chip = b["chip"]
b_quant = b["quant"]
if abs(b_params - params_b) < 0.5 and b_chip == chip and b_quant == llmcheck_quant:
result["tps"] = b["tps"]
result["ttft"] = b["ttft"]
result["source"] = "llmcheck measured"
result["confidence"] = "high"
result["model_used"] = b["model"]
return result
# 2. Same chip, similar params: scale by params ratio
same_chip = [b for b in benchmarks if b["chip"] == chip and b["quant"] == llmcheck_quant]
if same_chip:
# Find closest params
closest = min(same_chip, key=lambda b: abs(float(re.sub(r'[^\d.]', '', b["params"])) - params_b))
base_params = float(re.sub(r'[^\d.]', '', closest["params"]))
        scale = base_params / params_b if params_b > 0 else 1
        result["tps"] = round(closest["tps"] * scale)
        # TTFT grows with model size, so it scales inversely to tok/s
        result["ttft"] = round(closest["ttft"] / scale, 1) if closest.get("ttft") else None
result["source"] = "llmcheck estimated (similar model on same chip)"
result["confidence"] = "medium"
result["model_used"] = closest["model"]
return result
# 3. Cross-chip: use M5 Max as reference and apply scaling
m5max_refs = [b for b in benchmarks if b["chip"] == "M5 Max" and b["quant"] == llmcheck_quant]
if m5max_refs:
closest = min(m5max_refs, key=lambda b: abs(float(re.sub(r'[^\d.]', '', b["params"])) - params_b))
base_params = float(re.sub(r'[^\d.]', '', closest["params"]))
params_scale = base_params / params_b if params_b > 0 else 1
        chip_scale = CHIP_SCALE_FACTORS.get(chip, 1.0)
        rel_speed = chip_scale / CHIP_SCALE_FACTORS["M5 Max"]  # speed relative to the reference chip
        result["tps"] = round(closest["tps"] * params_scale * rel_speed)
        # TTFT grows with model size and shrinks on faster chips
        result["ttft"] = round(closest["ttft"] / (params_scale * rel_speed), 1) if closest.get("ttft") else None
result["source"] = "llmcheck estimated (scaled from M5 Max)"
result["confidence"] = "low"
result["model_used"] = closest["model"]
return result
return result
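# Usage sketch (illustrative; requires llmcheck.net to be reachable):
#   est = estimate_apple_silicon_tps(8.03, "M4 Pro", "Q4_K_M (best)")
#   est["tps"]        -> estimated tok/s, or None if no benchmark data
#   est["confidence"] -> "high" (direct match), "medium" (same chip), "low" (cross-chip)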
# ──────────────────────────────────────────────────────────────
# 1. GPU DATABASE (dbgpu → TechPowerUp, auto-updated + Apple Silicon from llmcheck)
# ──────────────────────────────────────────────────────────────
APPLE_SILICON = {
# M5 series (newest, based on llmcheck benchmarks)
"Apple M5 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M5 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M5 Pro (48 GB)": {"vram_gb": 48, "bw_gbs": 273, "tier": "Apple Silicon"},
"Apple M5 Pro (24 GB)": {"vram_gb": 24, "bw_gbs": 273, "tier": "Apple Silicon"},
"Apple M5 (16 GB)": {"vram_gb": 16, "bw_gbs": 120, "tier": "Apple Silicon"},
# M4 series
"Apple M4 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 819, "tier": "Apple Silicon"},
"Apple M4 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 819, "tier": "Apple Silicon"},
"Apple M4 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M4 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M4 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M4 Max (48 GB)": {"vram_gb": 48, "bw_gbs": 546, "tier": "Apple Silicon"},
"Apple M4 Pro (48 GB)": {"vram_gb": 48, "bw_gbs": 273, "tier": "Apple Silicon"},
"Apple M4 Pro (24 GB)": {"vram_gb": 24, "bw_gbs": 273, "tier": "Apple Silicon"},
"Apple M4 (16 GB)": {"vram_gb": 16, "bw_gbs": 120, "tier": "Apple Silicon"},
# M3 series
"Apple M3 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 819, "tier": "Apple Silicon"},
"Apple M3 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M3 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M3 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M3 Max (36 GB)": {"vram_gb": 36, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M3 Pro (36 GB)": {"vram_gb": 36, "bw_gbs": 150, "tier": "Apple Silicon"},
"Apple M3 Pro (18 GB)": {"vram_gb": 18, "bw_gbs": 150, "tier": "Apple Silicon"},
"Apple M3 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
"Apple M3 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
# M2 series
"Apple M2 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 800, "tier": "Apple Silicon"},
"Apple M2 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 800, "tier": "Apple Silicon"},
"Apple M2 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M2 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M2 Pro (32 GB)": {"vram_gb": 32, "bw_gbs": 200, "tier": "Apple Silicon"},
"Apple M2 Pro (16 GB)": {"vram_gb": 16, "bw_gbs": 200, "tier": "Apple Silicon"},
"Apple M2 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
"Apple M2 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
# M1 series
"Apple M1 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 800, "tier": "Apple Silicon"},
"Apple M1 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
"Apple M1 Pro (32 GB)": {"vram_gb": 32, "bw_gbs": 200, "tier": "Apple Silicon"},
"Apple M1 Pro (16 GB)": {"vram_gb": 16, "bw_gbs": 200, "tier": "Apple Silicon"},
"Apple M1 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
"Apple M1 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
}
TIER_KEYWORDS = {
"Data Center": ["H200", "H100", "H800", "B200", "B100", "B300", "A100", "A800",
"A40", "L40", "L20", "V100", "P100", "MI3", "MI2", "MI1",
"MI325", "MI350", "MI355", "RTX PRO 6000", "RTX PRO 5000",
"Instinct", "GB10", "Jetson T5000", "Jetson T4000"],
"Workstation": ["RTX 6000", "RTX 5000", "RTX 4000", "RTX 3000", "RTX A6000",
"RTX A5000", "RTX A4000", "Quadro", "W7900", "W7800", "W6800",
"Pro W", "PRO V"],
"Laptop": ["Laptop", "Mobile", "Max-Q", "MXM", "Ti Laptop"],
}
@lru_cache(maxsize=1)
def build_gpu_database():
gpu_db = {}
try:
from dbgpu import GPUDatabase
db = GPUDatabase.default()
for spec in db.specs:
try:
vram = spec.memory_size_gb
bw = spec.memory_bandwidth_gb_s or 0
mfr = spec.manufacturer or ""
name = spec.name or ""
rd = spec.release_date
if not vram or vram < 4:
continue
if mfr not in ("NVIDIA", "AMD", "Intel"):
continue
if rd and rd.year < 2017:
continue
name_l = name.lower()
if any(k.lower() in name_l for k in ["Laptop", "Mobile", "Max-Q", "MXM"]):
if vram < 16:
continue
tier = "Consumer"
for t, kws in TIER_KEYWORDS.items():
if any(kw.lower() in name_l for kw in kws):
tier = t
break
v_str = int(vram) if vram == int(vram) else vram
label = f"{mfr} {name} ({v_str} GB)"
gpu_db[label] = {"vram_gb": vram, "bw_gbs": bw, "tier": tier}
except Exception:
continue
except Exception as e:
print(f"[WARN] dbgpu failed: {e}")
gpu_db.update(APPLE_SILICON)
return gpu_db
def get_gpu_choices():
db = build_gpu_database()
tiers = {"Data Center": [], "Workstation": [], "Consumer": [], "Apple Silicon": [], "Other": []}
for name, info in db.items():
tiers.get(info["tier"], tiers["Other"]).append((name, info["vram_gb"]))
choices = []
for tier in ["Data Center", "Workstation", "Consumer", "Apple Silicon"]:
for name, _ in sorted(tiers[tier], key=lambda x: -x[1]):
choices.append(name)
return choices
# ──────────────────────────────────────────────────────────────
# 2. QUANTIZATION TABLE
# ──────────────────────────────────────────────────────────────
# "bpw" here is bytes per weight (FP32 = 4.0 B/param), matching the "B/param" shown in the UI.
QUANT_BPW = {
"FP32 (32-bit)": {"bpw": 4.000, "color": "#ef4444", "desc": "Full precision. Training baseline. Rarely used for inference."},
"BF16 / FP16": {"bpw": 2.000, "color": "#f97316", "desc": "Standard half-precision. Most HF checkpoints. Training standard."},
"FP8 (H100/B200)": {"bpw": 1.000, "color": "#eab308", "desc": "Native on Hopper/Blackwell. Near-FP16 quality with 2x savings."},
"INT8 / Q8_0": {"bpw": 1.000, "color": "#eab308", "desc": "8-bit. 50% smaller vs FP16, negligible quality loss."},
"Q6_K": {"bpw": 0.781, "color": "#84cc16", "desc": "6-bit GGUF. Near-original quality. Good for quality-sensitive tasks."},
"Q5_K_M": {"bpw": 0.688, "color": "#22c55e", "desc": "5-bit GGUF. Better quality than Q4 with minimal extra VRAM."},
"Q4_K_M (best)": {"bpw": 0.567, "color": "#10b981", "desc": "MOST POPULAR. Best balance size vs quality. Recommended starting point."},
"Q4_K_S": {"bpw": 0.534, "color": "#14b8a6", "desc": "4-bit smaller variant. Slightly lower quality than Q4_K_M."},
"Q4_0 / NF4": {"bpw": 0.500, "color": "#06b6d4", "desc": "Basic 4-bit. NF4 variant used for QLoRA fine-tuning."},
"IQ4_XS": {"bpw": 0.478, "color": "#3b82f6", "desc": "Importance-matrix 4-bit. Better quality than Q4_K_S at same size."},
"Q3_K_M": {"bpw": 0.375, "color": "#8b5cf6", "desc": "3-bit GGUF. Noticeable quality drop. Only when severely VRAM-limited."},
"Q2_K": {"bpw": 0.250, "color": "#a855f7", "desc": "2-bit. Maximum compression, significant quality loss."},
"1.58-bit (BitNet)": {"bpw": 0.188, "color": "#ec4899", "desc": "Experimental ternary. Requires BitNet-native trained models."},
}
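# Worked example (illustrative; the constant below is not used by the app):
# weights-only footprint of a 7B model at Q4_K_M.
_EXAMPLE_7B_Q4_GB = 7e9 * QUANT_BPW["Q4_K_M (best)"]["bpw"] / 1e9  # ≈ 3.97 GB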
# ──────────────────────────────────────────────────────────────
# 3. MODEL METADATA FETCHER
# ──────────────────────────────────────────────────────────────
KNOWN_MODELS = {
"meta-llama/llama-3.1-8b": (8.03e9, 131072, "LLaMA", "BF16 / FP16"),
"meta-llama/llama-3.1-70b": (70.6e9, 131072, "LLaMA", "BF16 / FP16"),
"meta-llama/llama-3.1-405b": (405e9, 131072, "LLaMA", "BF16 / FP16"),
"meta-llama/llama-3.2-3b": (3.21e9, 131072, "LLaMA", "BF16 / FP16"),
"meta-llama/llama-3.2-1b": (1.24e9, 131072, "LLaMA", "BF16 / FP16"),
"meta-llama/llama-4-scout": (109e9, 10000000, "LLaMA-4 MoE", "BF16 / FP16"),
"meta-llama/llama-4-maverick": (400e9, 1000000, "LLaMA-4 MoE", "BF16 / FP16"),
"microsoft/phi-4": (14.7e9, 16384, "Phi", "BF16 / FP16"),
"microsoft/phi-3.5-mini": (3.82e9, 128000, "Phi", "BF16 / FP16"),
"microsoft/phi-3-mini": (3.82e9, 4096, "Phi", "BF16 / FP16"),
"microsoft/phi-2": (2.78e9, 2048, "Phi", "BF16 / FP16"),
"mistralai/mistral-7b": (7.24e9, 32768, "Mistral", "BF16 / FP16"),
"mistralai/mistral-nemo": (12.2e9, 128000, "Mistral", "BF16 / FP16"),
"mistralai/mixtral-8x7b": (46.7e9, 32768, "Mixtral MoE", "BF16 / FP16"),
"mistralai/mixtral-8x22b": (141e9, 65536, "Mixtral MoE", "BF16 / FP16"),
"qwen/qwen2.5-7b": (7.62e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen2.5-14b": (14.8e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen2.5-32b": (32.5e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen2.5-72b": (72.7e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen3-0.6b": (0.6e9, 32768, "Qwen", "BF16 / FP16"),
"qwen/qwen3-1.7b": (1.7e9, 32768, "Qwen", "BF16 / FP16"),
"qwen/qwen3-4b": (4.0e9, 32768, "Qwen", "BF16 / FP16"),
"qwen/qwen3-8b": (8.19e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen3-14b": (14.8e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen3-32b": (32.8e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen3-72b": (72.7e9, 131072, "Qwen", "BF16 / FP16"),
"qwen/qwen3-235b-a22b": (235e9, 131072, "Qwen MoE", "BF16 / FP16"),
"deepseek-ai/deepseek-v3": (671e9, 163840, "DeepSeek MoE", "BF16 / FP16"),
"deepseek-ai/deepseek-r1": (671e9, 163840, "DeepSeek MoE", "BF16 / FP16"),
"deepseek-ai/deepseek-v2": (236e9, 131072, "DeepSeek MoE", "BF16 / FP16"),
"google/gemma-2-2b": (2.61e9, 8192, "Gemma", "BF16 / FP16"),
"google/gemma-2-9b": (9.24e9, 8192, "Gemma", "BF16 / FP16"),
"google/gemma-2-27b": (27.2e9, 8192, "Gemma", "BF16 / FP16"),
"google/gemma-3-27b": (27e9, 131072, "Gemma", "BF16 / FP16"),
"openai-community/gpt2": (124e6, 1024, "GPT-2", "FP32 (32-bit)"),
"tiiuae/falcon-7b": (7.0e9, 2048, "Falcon", "BF16 / FP16"),
"tiiuae/falcon-40b": (40.0e9, 2048, "Falcon", "BF16 / FP16"),
}
def fetch_model_info(model_slug: str, hf_token: str = "") -> dict:
result = {"params": None, "params_b": None, "max_context": 4096,
"arch": "Unknown", "dtype": "BF16 / FP16", "source": "",
"error": None, "is_moe": False}
model_slug = model_slug.strip().strip("/")
if not model_slug or "/" not in model_slug:
result["error"] = "Enter a valid HuggingFace slug — e.g. `meta-llama/Llama-3.1-8B-Instruct`"
return result
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
# (a) HF API
try:
r = requests.get(f"https://huggingface.co/api/models/{model_slug}",
headers=headers, timeout=12)
if r.status_code == 200:
data = r.json()
st = data.get("safetensors", {})
if st and st.get("total", 0) > 0:
result["params"] = int(st["total"])
result["source"] = "safetensors metadata"
tags = [t.lower() for t in (data.get("tags") or [])]
for t in tags:
if "llama" in t: result["arch"] = "LLaMA"; break
if "mistral" in t: result["arch"] = "Mistral"; break
if "mixtral" in t: result["arch"] = "Mixtral MoE"; break
if "qwen" in t: result["arch"] = "Qwen"; break
if "gemma" in t: result["arch"] = "Gemma"; break
if "phi" in t: result["arch"] = "Phi"; break
if "falcon" in t: result["arch"] = "Falcon"; break
if "gpt" in t: result["arch"] = "GPT"; break
if any("moe" in t or "mixture" in t for t in tags):
result["is_moe"] = True
except Exception:
pass
# (b) config.json
if not result["params"]:
try:
r = requests.get(
f"https://huggingface.co/{model_slug}/resolve/main/config.json",
headers=headers, timeout=12)
if r.status_code == 200:
cfg = r.json()
result["arch"] = cfg.get("model_type", result["arch"]).replace("_", " ").title()
ctx = (cfg.get("max_position_embeddings") or cfg.get("max_sequence_length")
or cfg.get("n_positions") or cfg.get("seq_length"))
if ctx:
result["max_context"] = int(ctx)
if "float32" in str(cfg.get("torch_dtype", "")):
result["dtype"] = "FP32 (32-bit)"
if cfg.get("num_experts") or cfg.get("num_local_experts"):
result["is_moe"] = True
h = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd")
L = cfg.get("num_hidden_layers") or cfg.get("n_layer")
ffn = cfg.get("intermediate_size")
vocab = cfg.get("vocab_size")
                if h and L and vocab:
                    # Rough dense-transformer count: ~4h² attention params per layer,
                    # plus 2·h·ffn MLP params (≈8h² when ffn is unknown), plus embeddings
                    p = L * (4*h*h + (2*h*ffn if ffn else 8*h*h)) + vocab*h
if p > 1_000_000:
result["params"] = int(p)
result["source"] = "config.json arch inference"
except Exception:
pass
# (c) safetensors index
if not result["params"]:
try:
r = requests.get(
f"https://huggingface.co/{model_slug}/resolve/main/model.safetensors.index.json",
headers=headers, timeout=12)
if r.status_code == 200:
idx = r.json()
sz = idx.get("metadata", {}).get("total_size", 0)
if sz > 0:
result["params"] = sz // 2
result["source"] = "safetensors index (BF16 assumed)"
except Exception:
pass
# (d) Name heuristic
if not result["params"]:
for pat in [r'[\-\_\/](\d+(?:\.\d+)?)[Bb][\-\_\s\.]',
r'[\-\_\/](\d+(?:\.\d+)?)[Bb]$',
r'^(\d+(?:\.\d+)?)[Bb][\-\_]']:
m = re.search(pat, model_slug)
if m:
b = float(m.group(1))
if 0.05 <= b <= 10000:
result["params"] = int(b * 1e9)
result["source"] = f"name heuristic ({b}B)"
break
# (e) Known model table
key = model_slug.lower()
for known, (p, ctx, arch, dtype) in KNOWN_MODELS.items():
if key == known or key.startswith(known + "-") or key.startswith(known + "_"):
if not result["params"]:
result["params"] = int(p)
result["source"] = "known model table"
if result["max_context"] == 4096:
result["max_context"] = ctx
if result["arch"] == "Unknown":
result["arch"] = arch
if "MoE" in arch:
result["is_moe"] = True
break
if result["params"]:
result["params_b"] = result["params"] / 1e9
else:
result["error"] = ("Could not determine parameter count. "
"Try a HF token for gated models, or use the manual override.")
return result
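# Usage sketch (illustrative; makes network calls):
#   info = fetch_model_info("meta-llama/Llama-3.1-8B-Instruct")
#   info["params_b"] -> ~8.03, info["max_context"] -> 131072,
#   info["source"]   -> e.g. "safetensors metadata"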
# ──────────────────────────────────────────────────────────────
# 4. VRAM CALCULATION ENGINE
# ──────────────────────────────────────────────────────────────
def calc_inference(params, quant_key, context_len, batch_size):
    bpw = QUANT_BPW[quant_key]["bpw"]
    weights_gb = params * bpw / 1e9
    # Heuristic layer count, anchored at 28 layers for 7B, growing sub-linearly
    est_layers = max(16, int(28 * (params / 7e9) ** 0.45))
    # Typical GQA shape: 8 KV heads × 128-dim heads, FP16 cache (2 bytes/elem)
    est_kv_heads = 8
    est_head_dim = 128
    # K and V (×2) × heads × head_dim × 2 bytes × layers × tokens × batch
    kv_bytes = 2 * est_kv_heads * est_head_dim * 2 * est_layers * context_len * batch_size
    kv_gb = kv_bytes / 1e9
    acts_gb = weights_gb * 0.05                 # Flash-Attention-era activations, ~5%
    overhead_gb = max(0.5, weights_gb * 0.05)   # CUDA context / framework buffers
return {"total": weights_gb + kv_gb + acts_gb + overhead_gb,
"weights": weights_gb, "kv": kv_gb, "acts": acts_gb, "overhead": overhead_gb}
def calc_full_ft(params, context_len, batch_size):
    weights_gb = params * 2 / 1e9      # BF16 weights, 2 bytes/param
    grads_gb = params * 4 / 1e9        # FP32 gradients, 4 bytes/param
    optimizer_gb = params * 8 / 1e9    # FP32 AdamW momentum + variance, 8 bytes/param
    seq_scale = max(1.0, context_len / 2048)
    acts_gb = weights_gb * 1.5 * seq_scale * max(1.0, batch_size)  # with grad checkpointing
    overhead_gb = max(1.0, weights_gb * 0.1)
return {"total": weights_gb + grads_gb + optimizer_gb + acts_gb + overhead_gb,
"weights": weights_gb, "grads": grads_gb,
"optimizer": optimizer_gb, "acts": acts_gb, "overhead": overhead_gb}
def calc_lora(params, quant_key, context_len, batch_size, lora_rank):
    bpw = QUANT_BPW[quant_key]["bpw"]
    weights_gb = params * bpw / 1e9
    # Heuristic: 2·rank low-rank factors per ~4096-wide matrix, on ~30% of weights
    trainable_ratio = (2 * lora_rank) / 4096 * 0.30
    tp = int(params * trainable_ratio)
    adapter_gb = tp * 2 / 1e9      # BF16 adapter weights
    grads_gb = tp * 4 / 1e9        # FP32 gradients, adapters only
    optimizer_gb = tp * 8 / 1e9    # AdamW states, adapters only
    acts_gb = weights_gb * 0.8 * max(1.0, context_len / 2048) * max(1.0, batch_size)
    overhead_gb = max(0.5, weights_gb * 0.05)
return {"total": weights_gb + adapter_gb + grads_gb + optimizer_gb + acts_gb + overhead_gb,
"weights": weights_gb, "adapter": adapter_gb,
"grads": grads_gb, "optimizer": optimizer_gb, "acts": acts_gb, "overhead": overhead_gb,
"tp": tp, "tpct": trainable_ratio * 100}
def calc_qlora(params, context_len, batch_size, lora_rank):
    weights_gb = params * 0.5 / 1e9    # NF4 base weights, 0.5 bytes/param
    trainable_ratio = (2 * lora_rank) / 4096 * 0.30
    tp = int(params * trainable_ratio)
    adapter_gb = tp * 2 / 1e9
    grads_gb = tp * 4 / 1e9
    optimizer_gb = tp * 8 / 1e9
    dequant_gb = weights_gb * 0.05     # temporary FP16 dequantization buffers
    acts_gb = weights_gb * 0.5 * max(1.0, context_len / 2048) * max(1.0, batch_size)
    overhead_gb = max(0.5, weights_gb * 0.08)
return {"total": weights_gb + adapter_gb + grads_gb + optimizer_gb + dequant_gb + acts_gb + overhead_gb,
"weights": weights_gb, "adapter": adapter_gb, "grads": grads_gb,
"optimizer": optimizer_gb, "dequant": dequant_gb, "acts": acts_gb, "overhead": overhead_gb,
"tp": tp, "tpct": trainable_ratio * 100}
def gpu_compat(required_gb, gpu_name, n_gpus):
db = build_gpu_database()
gpu = db.get(gpu_name)
if not gpu:
return "❓", "GPU not in database", 999
available = gpu["vram_gb"] * n_gpus
pct = required_gb * 1.05 / available * 100
gpus_needed = math.ceil(required_gb * 1.05 / gpu["vram_gb"])
if pct <= 75:
return "✅", f"{required_gb:.1f} GB needed / {available:.0f} GB available — fits comfortably", pct
elif pct <= 100:
return "⚠️", f"{required_gb:.1f} GB needed / {available:.0f} GB available — tight fit", pct
else:
return "❌", f"{required_gb:.1f} GB needed / {available:.0f} GB available — need ≥{gpus_needed}× GPUs", pct
# ──────────────────────────────────────────────────────────────
# 5. HTML RENDERING HELPERS
# ──────────────────────────────────────────────────────────────
def bar_html(val, total_val, color="#10b981"):
pct = min(100, val / max(total_val, 0.001) * 100)
w = int(pct * 240 / 100)
return (f'<div style="display:flex;align-items:center;gap:10px;margin:2px 0">'
f'<div style="width:240px;background:#1e293b;border-radius:3px;height:8px;flex-shrink:0">'
f'<div style="width:{w}px;background:{color};height:8px;border-radius:3px"></div></div>'
f'<code style="color:#94a3b8;font-size:12px;white-space:nowrap">{val:.2f} GB</code></div>')
def compat_badge(emoji, msg, pct):
col = "#10b981" if emoji == "✅" else "#f59e0b" if emoji == "⚠️" else "#ef4444"
return (f'<div style="background:{col}18;border:1px solid {col}55;border-radius:8px;'
f'padding:10px 14px;margin-top:12px;color:{col};font-size:13px;font-family:monospace">'
f'{emoji} {msg}</div>')
def result_card(title, rows, total, gpu_name, n_gpus, accent="#a78bfa"):
emoji, msg, pct = gpu_compat(total, gpu_name, n_gpus)
rows_html = "".join(
f'<tr><td style="padding:5px 12px 5px 0;color:#64748b;font-size:12px;'
f'white-space:nowrap;vertical-align:middle">{lbl}</td>'
f'<td style="padding:5px 0;vertical-align:middle">{bar_html(val, total, col)}</td></tr>'
for lbl, val, col in rows
)
return (
f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
f'padding:22px 24px;margin:0 0 12px">'
f'<div style="font-size:14px;font-weight:700;color:{accent};'
f'font-family:monospace;margin-bottom:10px">{title}</div>'
f'<div style="font-size:28px;font-weight:800;color:#f8fafc;'
f'font-family:monospace;margin-bottom:16px">'
f'{total:.2f} <span style="font-size:13px;color:#334155;font-weight:400">GB total</span></div>'
f'<table style="width:100%;border-collapse:collapse">{rows_html}</table>'
f'<div style="border-top:1px solid #1e293b;margin-top:14px;padding-top:12px;'
f'font-size:11px;color:#334155;font-family:monospace">'
f'GPU: <span style="color:#475569">{gpu_name} × {n_gpus}</span>'
f'{compat_badge(emoji, msg, pct)}</div></div>'
)
# ──────────────────────────────────────────────────────────────
# 6. MAIN CALCULATE
# ──────────────────────────────────────────────────────────────
def calculate(model_slug, hf_token, quant_key, context_len, batch_size,
lora_rank, gpu_name, n_gpus, manual_params_b, use_manual):
if use_manual and manual_params_b > 0:
params = int(manual_params_b * 1e9)
info = {"params": params, "params_b": manual_params_b, "max_context": 4096,
"arch": "Manual", "dtype": quant_key, "source": "manual override",
"error": None, "is_moe": False}
else:
info = fetch_model_info(model_slug.strip(), hf_token.strip())
if info["error"] and not use_manual:
return (f'<div style="background:#7f1d1d22;border:1px solid #ef4444;border-radius:10px;'
f'padding:14px;color:#fca5a5;font-family:monospace;font-size:13px">'
f'⚠️ {info["error"]}</div>'), "", "", "", "", ""
params = info["params"] or 0
if params == 0:
return '<div style="color:#ef4444;font-family:monospace">❌ No parameter count available.</div>', "", "", "", "", ""
params_b = params / 1e9
infer = calc_inference(params, quant_key, context_len, batch_size)
full = calc_full_ft(params, context_len, batch_size)
lora = calc_lora(params, quant_key, context_len, batch_size, lora_rank)
ql = calc_qlora(params, context_len, batch_size, lora_rank)
moe_badge = (' <span style="background:#7c3aed33;color:#a78bfa;font-size:10px;'
'border-radius:4px;padding:2px 6px;margin-left:6px">MoE</span>'
if info.get("is_moe") else "")
# Model info panel
bpw = QUANT_BPW[quant_key]["bpw"]
model_html = (
f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
f'padding:22px 24px;margin-bottom:12px;font-family:monospace">'
f'<div style="font-size:10px;letter-spacing:3px;text-transform:uppercase;'
f'color:#334155;margin-bottom:6px">Model</div>'
f'<div style="font-size:18px;font-weight:800;color:#a78bfa;margin-bottom:16px;word-break:break-all">'
f'{model_slug.strip() if not use_manual else "Manual Entry"}{moe_badge}</div>'
f'<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:10px">'
f'<div style="background:#111827;border-radius:8px;padding:12px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Parameters</div>'
f'<div style="color:#f8fafc;font-size:20px;font-weight:800">{params_b:.2f}B</div>'
f'<div style="color:#334155;font-size:11px">{params:,}</div></div>'
f'<div style="background:#111827;border-radius:8px;padding:12px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Architecture</div>'
f'<div style="color:#f8fafc;font-size:14px;font-weight:700">{info.get("arch","?")}</div>'
f'<div style="color:#334155;font-size:11px">{info.get("dtype","?")}</div></div>'
f'<div style="background:#111827;border-radius:8px;padding:12px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Max Context</div>'
f'<div style="color:#f8fafc;font-size:14px;font-weight:700">{info.get("max_context",4096):,}</div>'
f'<div style="color:#334155;font-size:11px">tokens</div></div>'
f'<div style="background:#111827;border-radius:8px;padding:12px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Quantization</div>'
f'<div style="color:#f8fafc;font-size:12px;font-weight:700">{quant_key}</div>'
f'<div style="color:#334155;font-size:11px">{bpw} B/param</div></div>'
f'</div>'
f'<div style="margin-top:10px;font-size:11px;color:#1e293b">source: {info.get("source","?")}</div>'
f'</div>'
)
# Check if Apple Silicon GPU selected - add performance estimate
apple_chip = parse_apple_silicon_chip(gpu_name)
perf_html = ""
if apple_chip and infer["total"] <= build_gpu_database().get(gpu_name, {}).get("vram_gb", 0) * n_gpus:
perf = estimate_apple_silicon_tps(params_b, apple_chip, quant_key)
if perf["tps"]:
confidence_colors = {"high": "#10b981", "medium": "#f59e0b", "low": "#64748b"}
conf_color = confidence_colors.get(perf["confidence"], "#64748b")
conf_label = {"high": "Measured", "medium": "Estimated", "low": "Rough estimate"}
conf_text = conf_label.get(perf["confidence"], "Unknown")
perf_html = (
f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
f'padding:16px 20px;margin-top:12px;font-family:monospace">'
f'<div style="font-size:10px;letter-spacing:2px;text-transform:uppercase;color:#334155;margin-bottom:8px">'
f'Apple Silicon Inference Speed</div>'
f'<div style="display:flex;align-items:center;gap:16px">'
f'<div style="background:#111827;border-radius:8px;padding:12px 16px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;margin-bottom:2px">Speed</div>'
f'<div style="color:#f8fafc;font-size:24px;font-weight:800">~{perf["tps"]} <span style="font-size:14px;color:#475569">tok/s</span></div></div>'
f'<div style="background:#111827;border-radius:8px;padding:12px 16px">'
f'<div style="color:#334155;font-size:10px;text-transform:uppercase;margin-bottom:2px">TTFT</div>'
f'<div style="color:#f8fafc;font-size:24px;font-weight:800">{perf["ttft"] or "?"} <span style="font-size:14px;color:#475569">s</span></div></div>'
f'<div style="background:{conf_color}18;border:1px solid {conf_color}55;border-radius:8px;padding:8px 12px">'
f'<div style="color:{conf_color};font-size:11px;font-weight:600">{conf_text}</div>'
f'<div style="color:#334155;font-size:9px">based on {perf["model_used"] or "?"}</div></div></div>'
f'<div style="margin-top:10px;font-size:10px;color:#475569">'
f'Source: <a href="https://llmcheck.net" style="color:#64748b" target="_blank">llmcheck.net</a> '
f'(CC BY 4.0) · {perf["source"] or ""}</div></div>'
)
infer_html = result_card(
"🚀 Inference",
[("Model Weights", infer["weights"], "#a78bfa"),
(f"KV Cache {context_len:,} ctx × batch {batch_size}", infer["kv"], "#60a5fa"),
("Activations (Flash Attn)", infer["acts"], "#34d399"),
("Framework Overhead", infer["overhead"], "#475569")],
infer["total"], gpu_name, n_gpus, "#a78bfa"
) + perf_html
ft_html = result_card(
"🎓 Full Fine-Tune <span style='font-weight:400;font-size:12px;color:#334155'>(BF16 weights + FP32 Adam)</span>",
[("Weights BF16 2 B/param", full["weights"], "#a78bfa"),
("Gradients FP32 4 B/param", full["grads"], "#f97316"),
("AdamW States 8 B/param", full["optimizer"], "#ef4444"),
(f"Activations (grad ckpt) ctx={context_len:,}", full["acts"], "#60a5fa"),
("Overhead", full["overhead"], "#475569")],
full["total"], gpu_name, n_gpus, "#f97316"
)
lora_html = result_card(
f"🔧 LoRA <span style='font-weight:400;font-size:12px;color:#334155'>rank={lora_rank} {lora['tpct']:.2f}% trainable</span>",
[(f"Base Weights {quant_key}", lora["weights"], "#a78bfa"),
(f"LoRA Adapters BF16 {lora['tp']:,} params", lora["adapter"], "#34d399"),
("Adapter Grads FP32", lora["grads"], "#f97316"),
("Adapter AdamW FP32", lora["optimizer"], "#ef4444"),
("Activations", lora["acts"], "#60a5fa"),
("Overhead", lora["overhead"], "#475569")],
lora["total"], gpu_name, n_gpus, "#34d399"
)
ql_html = result_card(
f"⚡ QLoRA <span style='font-weight:400;font-size:12px;color:#334155'>NF4 base + BF16 adapters rank={lora_rank}</span>",
[("Base NF4 0.5 B/param", ql["weights"], "#a78bfa"),
(f"LoRA Adapters BF16 {ql['tp']:,} params", ql["adapter"], "#34d399"),
("Adapter Grads FP32", ql["grads"], "#f97316"),
("Paged AdamW FP32", ql["optimizer"], "#ef4444"),
("Dequant Buffers temp FP16", ql["dequant"], "#fb923c"),
("Activations", ql["acts"], "#60a5fa"),
("Overhead", ql["overhead"], "#475569")],
ql["total"], gpu_name, n_gpus, "#facc15"
)
return model_html, infer_html, ft_html, lora_html, ql_html, ""
def auto_fetch(model_slug, hf_token):
info = fetch_model_info(model_slug.strip(), hf_token.strip())
if info["error"]:
return gr.update(), gr.update(), f"⚠️ {info['error']}", gr.update()
ctx_opts = [512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 1000000]
nearest = min(ctx_opts, key=lambda x: abs(x - info.get("max_context", 4096)))
status = (f"✅ **{info['params_b']:.2f}B params** · {info['arch']} · "
f"ctx {info.get('max_context',4096):,} · source: {info['source']}")
return gr.update(value=info["params_b"]), gr.update(value=nearest), status, gr.update(value=True)
# ──────────────────────────────────────────────────────────────
# 7. UI - Clean, intuitive workflow
# ──────────────────────────────────────────────────────────────
CSS = """
/* Warm, soft palette - easy on eyes */
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;600;700&family=DM+Mono:wght@400;500&display=swap');
:root {
--bg-primary: #f5f3f0;
--bg-secondary: #ebe8e4;
--bg-card: #fdfcfb;
--bg-warm: #f9f7f5;
--text-primary: #3d3d3d;
--text-secondary: #5c5c5c;
--text-muted: #8a8a8a;
--accent: #d97706;
--accent-light: #fef7ed;
--accent-dark: #b45309;
--success: #16a34a;
--success-bg: #f0fdf4;
--success-border: #86efac;
--warning: #ca8a04;
--warning-bg: #fefce8;
--warning-border: #fde047;
--error: #dc2626;
--error-bg: #fef2f2;
--error-border: #fca5a5;
--border: #e5e2de;
--border-strong: #d1ccc6;
}
body, .gradio-container {
font-family: 'DM Sans', sans-serif !important;
background: var(--bg-primary) !important;
color: var(--text-primary) !important;
}
.gradio-container {
max-width: 1400px !important;
margin: 0 auto !important;
padding: 16px 32px !important;
}
/* Clean card styling */
.gr-box, .gr-panel {
background: var(--bg-card) !important;
border: 1px solid var(--border) !important;
border-radius: 12px !important;
}
/* Better labels */
label, .gr-label {
color: var(--text-secondary) !important;
font-size: 13px !important;
font-weight: 500 !important;
letter-spacing: 0 !important;
text-transform: none !important;
}
/* Inputs */
input, textarea, select {
background: var(--bg-card) !important;
color: var(--text-primary) !important;
border: 1px solid var(--border-strong) !important;
font-family: 'DM Sans', sans-serif !important;
font-size: 14px !important;
padding: 12px !important;
}
input:focus, select:focus {
border-color: var(--accent) !important;
outline: none !important;
box-shadow: 0 0 0 3px var(--accent-light) !important;
}
/* Buttons */
button.primary {
background: var(--accent) !important;
color: white !important;
border: none !important;
font-family: 'DM Sans', sans-serif !important;
font-weight: 700 !important;
font-size: 16px !important;
padding: 16px 32px !important;
border-radius: 8px !important;
box-shadow: 0 2px 8px rgba(217, 119, 6, 0.25) !important;
}
button.primary:hover {
background: var(--accent-dark) !important;
transform: translateY(-1px) !important;
}
button.secondary {
background: var(--bg-card) !important;
color: var(--text-primary) !important;
border: 2px solid var(--border-strong) !important;
font-family: 'DM Sans', sans-serif !important;
}
/* Radio/Mode selector - very visible selected state */
.gr-radio {
background: var(--bg-card) !important;
border: 2px solid var(--border-strong) !important;
border-radius: 12px !important;
padding: 16px !important;
}
.gr-radio label {
font-size: 15px !important;
font-weight: 600 !important;
padding: 12px 20px !important;
border-radius: 8px !important;
margin: 4px !important;
background: var(--bg-warm) !important;
border: 2px solid transparent !important;
transition: all 0.2s !important;
}
.gr-radio label.selected {
background: var(--accent) !important;
color: white !important;
border: 2px solid var(--accent-dark) !important;
box-shadow: 0 2px 8px rgba(217, 119, 6, 0.4) !important;
}
.gr-radio input[type="radio"] {
display: none !important;
}
/* Tabs */
.tab-nav {
background: var(--bg-secondary) !important;
border-radius: 10px !important;
padding: 6px !important;
margin-bottom: 16px !important;
border: 2px solid var(--border) !important;
}
.tab-nav button {
color: var(--text-secondary) !important;
font-family: 'DM Sans', sans-serif !important;
font-size: 15px !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 6px !important;
}
.tab-nav button.selected {
color: white !important;
background: var(--accent) !important;
box-shadow: 0 2px 6px rgba(217, 119, 6, 0.3) !important;
}
/* Sliders */
input[type=range] {
accent-color: var(--accent) !important;
}
/* Hide unnecessary elements */
footer, .built-with, #component-0 > .svelte-1gf513q { display: none !important; }
/* Accordion styling */
.accordion {
background: var(--bg-card) !important;
border: 1px solid var(--border) !important;
}
details summary {
color: var(--text-secondary) !important;
font-weight: 500 !important;
}
/* Number input styling */
input[type=number] {
font-family: 'DM Mono', monospace !important;
}
/* Dropdown styling */
.gr-dropdown {
font-size: 14px !important;
}
/* Markdown styling */
.gr-markdown {
font-family: 'DM Sans', sans-serif !important;
color: var(--text-primary) !important;
}
.gr-markdown p {
font-size: 14px !important;
line-height: 1.6 !important;
}
.gr-markdown code {
font-family: 'DM Mono', monospace !important;
background: var(--bg-secondary) !important;
padding: 2px 6px !important;
border-radius: 4px !important;
}
/* Hide Gradio's default header styling */
.contain .top-container { display: none !important; }
"""
GPU_CHOICES = get_gpu_choices()
GPU_COUNT = len(GPU_CHOICES)
DEFAULT_GPU = next((g for g in GPU_CHOICES if "RTX 4090" in g), GPU_CHOICES[0])
def build_result_html(result_type, fits, required_gb, available_gb, details, gpu_name, n_gpus, formulas=""):
"""Build result HTML with YES/NO, breakdown, and formulas."""
status = "YES" if fits == "✅" else ("MAYBE" if fits == "⚠️" else "NO")
status_color = "var(--success)" if fits == "✅" else ("var(--warning)" if fits == "⚠️" else "var(--error)")
status_bg = "var(--success-bg)" if fits == "✅" else ("var(--warning-bg)" if fits == "⚠️" else "var(--error-bg)")
status_border = "var(--success-border)" if fits == "✅" else ("var(--warning-border)" if fits == "⚠️" else "var(--error-border)")
    # Count cards against per-GPU VRAM (available_gb already covers all n_gpus cards)
    gpus_needed = math.ceil(required_gb * 1.05 / (available_gb / n_gpus)) if fits == "❌" else n_gpus
return f'''
<div style="border:2px solid var(--border);border-radius:12px;overflow:hidden;background:var(--bg-card)">
<!-- YES/NO Header -->
<div style="background:{status_bg};padding:24px;border-bottom:2px solid {status_border}">
<div style="display:flex;align-items:center;gap:24px">
<div style="font-size:48px;font-weight:700;color:{status_color};font-family:'DM Sans',sans-serif">{status}</div>
<div style="flex:1">
<div style="font-size:18px;color:var(--text-primary);font-weight:600">{result_type} on {gpu_name} × {gpus_needed if fits == "❌" else n_gpus}</div>
<div style="font-family:'DM Mono',monospace;font-size:22px;color:var(--text-secondary);margin-top:8px">{required_gb:.1f} GB required / {available_gb:.0f} GB available</div>
</div>
</div>
</div>
<!-- VRAM Breakdown -->
<div style="padding:24px">
<div style="font-size:14px;font-weight:600;color:var(--text-primary);margin-bottom:16px">VRAM Breakdown</div>
{details}
</div>
<!-- Formula -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:16px;border:1px solid var(--border)">
<div style="font-size:12px;font-weight:600;color:var(--text-muted);margin-bottom:8px">How it's calculated</div>
<div style="font-family:'DM Mono',monospace;font-size:13px;color:var(--text-secondary);line-height:1.6">
{formulas}
</div>
</div>
</div>
</div>
'''
def calculate_clean(mode, model_slug, gpu_name, n_gpus, quant_key, context_len,
                    manual_params_b, use_manual, hf_token="", lora_rank=16, batch_size=1):
    """Clean calculation function with simplified output."""
    # Map the clean UI's simplified quant names onto QUANT_BPW keys
    quant_map = {"Q4_K_M": "Q4_K_M (best)", "Q5_K_M": "Q5_K_M", "INT8": "INT8 / Q8_0", "NF4": "Q4_0 / NF4", "FP16": "BF16 / FP16"}
    quant_key = quant_map.get(quant_key, quant_key)
# Get model info
if use_manual and manual_params_b > 0:
params = int(manual_params_b * 1e9)
params_b = manual_params_b
model_name = "Manual Entry"
else:
model_slug = model_slug.strip().strip("/")
if not model_slug or "/" not in model_slug:
return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">Enter a valid HuggingFace slug like meta-llama/Llama-3.1-8B-Instruct</div>'
        info = fetch_model_info(model_slug, hf_token.strip() if hf_token else "")
if info["error"]:
return f'<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">{info["error"]}</div>'
params = info["params"]
if not params:
return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">Could not determine model size. Use manual override.</div>'
params_b = info["params_b"]
model_name = model_slug
# Get GPU info - handle both int and float n_gpus
n_gpus = int(n_gpus) if n_gpus else 1
db = build_gpu_database()
gpu = db.get(gpu_name)
if not gpu:
return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">GPU not found in database</div>'
available_gb = gpu["vram_gb"] * n_gpus
# Calculate based on mode
    batch_size = int(batch_size) if batch_size else 1  # slider hidden in Run mode; defaults to 1
    context_len = int(context_len) if context_len else 4096
    formulas = ""  # filled per mode below
if mode == "Run (Inference)":
calc = calc_inference(params, quant_key, context_len, batch_size)
result_type = "Inference"
breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--accent)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Model weights</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["weights"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">{quant_key} @ {QUANT_BPW[quant_key]["bpw"]} B/param</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--accent)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">KV Cache</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["kv"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">context={context_len:,} tokens</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--success)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Activations</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["acts"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">Flash Attention ~5%</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--text-muted)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Overhead</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["overhead"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">Framework buffers ~5%</div>
</div>
</div>
"""
formulas = f"""
Total = weights + KV_cache + activations + overhead<br><br>
weights = params × bpw / 1e9<br><br>
KV_cache = 2 × kv_heads × head_dim × 2B × layers × context × batch<br><br>
activations ≈ weights × 5% (Flash Attention)<br><br>
overhead ≈ weights × 5% (runtime buffers)
"""
elif mode == "Train (Fine-tune)":
calc = calc_full_ft(params, context_len, batch_size)
result_type = "Full Fine-tune"
breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Weights BF16: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">2 bytes/param</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Gradients FP32: <strong>{calc["grads"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">4 bytes/param</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> Optimizer AdamW: <strong>{calc["optimizer"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">8 bytes/param (momentum + variance)</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Activations: <strong>{calc["acts"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">gradient checkpointing enabled</span>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Total = weights(2B) + grads(4B) + optimizer(8B) + activations + overhead<br><br>
≈ 14-16× model size in bytes<br><br>
activations ≈ weights × 1.5 × context_scale × batch<br><br>
context_scale = max(1, context/2048)
</div>
</div>
</div>
"""
    else:  # LoRA/QLoRA
        lora_rank = int(lora_rank) if lora_rank else 16
if quant_key == "Q4_0 / NF4":
calc = calc_qlora(params, context_len, batch_size, lora_rank)
result_type = "QLoRA"
breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Base weights NF4: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">0.5 bytes/param (4-bit quantized)</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--success)">●</span> LoRA adapters: <strong>{calc["adapter"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">rank={lora_rank}, ~{calc["tpct"]:.1f}% trainable</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Adapter gradients: <strong>{calc["grads"]:.2f} GB</strong>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> Paged optimizer: <strong>{calc["optimizer"]:.2f} GB</strong>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Base frozen (NF4) + trainable LoRA adapters<br><br>
trainable_params ≈ base_params × (2×rank/4096) × 0.3<br><br>
adapters = trainable × 2B<br><br>
grads = trainable × 4B<br><br>
optimizer = trainable × 8B (paged AdamW)
</div>
</div>
</div>
"""
else:
calc = calc_lora(params, quant_key, context_len, batch_size, lora_rank)
result_type = "LoRA"
breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Base weights: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">{quant_key} frozen</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--success)">●</span> LoRA adapters: <strong>{calc["adapter"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">rank={lora_rank}, ~{calc["tpct"]:.1f}% trainable</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Adapter gradients: <strong>{calc["grads"]:.2f} GB</strong>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> AdamW optimizer: <strong>{calc["optimizer"]:.2f} GB</strong>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Frozen base (any quant) + trainable LoRA<br><br>
trainable_params ≈ base_params × (2×rank/4096) × 0.3<br><br>
Only adapter weights need gradients + optimizer<br><br>
Much less VRAM than full fine-tune!
</div>
</div>
</div>
"""
required_gb = calc["total"]
fits, msg, _ = gpu_compat(required_gb, gpu_name, n_gpus)
# Build result
result_html = build_result_html(result_type, fits, required_gb, available_gb, breakdown, gpu_name, n_gpus, formulas)
# Model info summary - compact horizontal
model_summary = f'''
<div style="background:var(--bg-secondary);border-radius:10px;padding:16px 20px;margin-bottom:20px;display:flex;align-items:center;gap:24px">
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Model</div>
<div style="font-size:16px;font-weight:600;color:var(--text-primary)">{model_name}</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Parameters</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:16px;color:var(--text-primary)">{params_b:.2f}B</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Quantization</div>
<div style="font-size:14px;color:var(--text-secondary)">{quant_key}</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Context</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:16px;color:var(--text-primary)">{context_len:,}</div>
</div>
</div>
'''
# Apple Silicon performance (if applicable)
apple_chip = parse_apple_silicon_chip(gpu_name)
perf_html = ""
if apple_chip and required_gb <= available_gb:
perf = estimate_apple_silicon_tps(params_b, apple_chip, quant_key)
if perf["tps"]:
conf_label = {"high": "measured", "medium": "estimated", "low": "rough estimate"}
conf_text = conf_label.get(perf["confidence"], "unknown")
perf_html = f'''
<div style="background:var(--bg-card);border:1px solid var(--border);border-radius:8px;padding:12px;margin-top:16px">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:8px">Apple Silicon speed ({conf_text})</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:18px;color:var(--text-primary)">
~{perf["tps"]} tok/s · TTFT: {perf["ttft"] or "?"}s
</div>
<div style="font-size:11px;color:var(--text-muted);margin-top:4px">
via <a href="https://llmcheck.net" style="color:var(--accent)" target="_blank">llmcheck.net</a>
</div>
</div>
'''
return model_summary + result_html + perf_html
with gr.Blocks(title="Can I Run This LLM?", css=CSS, theme=gr.themes.Default()) as demo:
# Minimal header
gr.HTML("""
<div style="padding:12px 0 16px 0;border-bottom:2px solid var(--border);margin-bottom:16px">
<div style="font-size:24px;font-weight:700;color:var(--text-primary);font-family:'DM Sans',sans-serif">
Can I Run This LLM?
</div>
<div style="font-size:13px;color:var(--text-muted);margin-top:4px">
Check if your GPU can run any HuggingFace model
</div>
</div>
""")
# TWO COLUMN LAYOUT - inputs left, results skeleton right
with gr.Row():
# LEFT: Inputs
with gr.Column(scale=1, min_width=320):
# Mode selector - Dropdown is more visible
mode = gr.Dropdown(
label="Mode",
choices=["Run (Inference)", "Train (Full Fine-tune)", "LoRA/QLoRA"],
value="Run (Inference)",
interactive=True
)
# Model input
model_slug = gr.Textbox(
label="Model",
placeholder="Qwen/Qwen3-VL-2B-Instruct",
value="Qwen/Qwen3-VL-2B-Instruct",
lines=1
)
with gr.Row():
gpu_name = gr.Dropdown(
label="GPU",
choices=GPU_CHOICES,
value=DEFAULT_GPU
)
n_gpus = gr.Number(
label="# GPUs",
value=1,
minimum=1,
maximum=8
)
with gr.Row():
quant_key = gr.Dropdown(
label="Quant",
choices=["Q4_K_M", "Q5_K_M", "INT8", "NF4", "FP16"],
value="Q4_K_M"
)
context_len = gr.Number(
label="Context",
value=4096,
minimum=512,
maximum=65536
)
# Mode-specific advanced options (visible, not collapsed)
gr.HTML('<div style="font-size:12px;color:var(--text-muted);margin-top:12px;margin-bottom:6px;font-weight:500">Advanced options</div>')
with gr.Row():
hf_token = gr.Textbox(
label="HF Token",
placeholder="hf_... (gated models)",
type="password",
visible=True
)
use_manual = gr.Checkbox(
label="Manual params",
value=False
)
manual_params_b = gr.Number(
label="Params (B)",
value=7.0,
visible=True
)
# LoRA-specific options (show conditionally)
lora_rank = gr.Slider(
label="LoRA Rank",
minimum=4, maximum=256, value=16, step=4,
visible=False,
info="Higher rank = more trainable params"
)
batch_size = gr.Slider(
label="Batch Size",
minimum=1, maximum=64, value=1, step=1,
visible=False,
info="Higher batch = more VRAM for activations"
)
# Check button
calc_btn = gr.Button("CHECK", variant="primary", size="lg")
# RIGHT: Results skeleton (always visible, fills in)
with gr.Column(scale=1.3, min_width=500):
result_html = gr.HTML("""
<div style="border:2px solid var(--border);border-radius:12px;overflow:hidden;background:var(--bg-card)">
<!-- Model info bar skeleton -->
<div style="background:var(--bg-secondary);border-radius:10px;padding:16px 20px;margin:20px;display:flex;align-items:center;gap:24px">
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Model</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:80%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Parameters</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:60%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Quantization</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:70%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Context</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:50%"></div></div>
</div>
<!-- YES/NO header skeleton -->
<div style="background:var(--bg-secondary);padding:24px;border-bottom:2px solid var(--border)">
<div style="display:flex;align-items:center;gap:24px">
<div style="width:80px;height:48px;background:var(--bg-card);border-radius:8px;display:flex;align-items:center;justify-content:center;color:var(--text-muted)">?</div>
<div style="flex:1"><div style="height:18px;background:var(--bg-card);border-radius:4px;width:60%;margin-bottom:8px"></div><div style="height:22px;background:var(--bg-card);border-radius:4px;width:45%"></div></div>
</div>
</div>
<!-- VRAM breakdown skeleton (4-card grid) -->
<div style="padding:24px">
<div style="font-size:14px;font-weight:600;color:var(--text-muted);margin-bottom:16px">VRAM Breakdown</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
</div>
</div>
<!-- Formula skeleton -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:16px;border:1px solid var(--border)">
<div style="font-size:12px;font-weight:600;color:var(--text-muted);margin-bottom:8px">How it's calculated</div>
<div style="height:60px;background:var(--bg-card);border-radius:4px"></div>
</div>
</div>
<!-- Apple Silicon skeleton -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:12px;border:1px solid var(--border)">
<div style="height:12px;background:var(--bg-card);border-radius:4px;width:30%;margin-bottom:8px"></div>
<div style="height:18px;background:var(--bg-card);border-radius:4px;width:25%"></div>
</div>
</div>
</div>
""")
# Footer
gr.HTML("""
<div style="margin-top:8px;padding-top:8px;border-top:1px solid var(--border);text-align:center;font-size:11px;color:var(--text-muted)">
GPU data: <a href="https://www.techpowerup.com/gpu-specs/" style="color:var(--accent)" target="_blank">TechPowerUp</a>/dbgpu ·
Apple Silicon: <a href="https://llmcheck.net" style="color:var(--accent)" target="_blank">llmcheck.net</a> (CC BY 4.0)
</div>
""")
# Dynamic visibility for mode-specific options
    def toggle_advanced_options(mode):
        """Show/hide the training-only sliders based on mode."""
        if mode == "Run (Inference)":
            return gr.update(visible=False), gr.update(visible=False)
        elif mode == "Train (Full Fine-tune)":
            return gr.update(visible=False), gr.update(visible=True)
        else:  # LoRA/QLoRA
            return gr.update(visible=True), gr.update(visible=True)
    mode.change(
        toggle_advanced_options,
        inputs=[mode],
        outputs=[lora_rank, batch_size]
    )
# Events
calc_btn.click(
calculate_clean,
        inputs=[mode, model_slug, gpu_name, n_gpus, quant_key, context_len,
                manual_params_b, use_manual, hf_token, lora_rank, batch_size],
outputs=result_html
)
if __name__ == "__main__":
demo.launch(share=False)