Spaces:
Running
Running
| import re | |
| import math | |
| import requests | |
| import gradio as gr | |
| from functools import lru_cache | |
| import json | |
| import time | |
# ──────────────────────────────────────────────────────────────
# 0. LLMCHECK BENCHMARKS (Apple Silicon performance data)
# ──────────────────────────────────────────────────────────────
LLMCHECK_URL = "https://llmcheck.net/data/benchmarks.json"
LLMCHECK_CACHE_TTL = 3600  # 1 hour cache

# Chip performance scaling factors (derived from llmcheck 8B Q4_K_M benchmarks).
# Relative token-generation speed, normalized to M1 = 1.00; entries marked
# "extrapolated" were not directly measured.
CHIP_SCALE_FACTORS = {
    "M1": 1.00,
    "M2": 1.34,
    "M3": 1.72,
    "M3 Pro": 1.43,
    "M3 Max": 2.50,  # extrapolated
    "M4": 1.72,
    "M4 Pro": 2.18,
    "M4 Max": 2.87,
    "M4 Ultra": 3.50,  # extrapolated from bandwidth ratio
    "M5": 2.00,  # extrapolated
    "M5 Pro": 2.19,
    "M5 Max": 2.63,
    "M5 Ultra": 3.80,  # extrapolated
}

# Module-level cache for fetch_llmcheck_benchmarks(): last JSON payload and
# the time.time() at which it was fetched (0 means "never fetched").
_llmcheck_cache = {"data": None, "timestamp": 0}
def fetch_llmcheck_benchmarks():
    """Return the llmcheck benchmark JSON, re-fetching at most once per TTL.

    A successful response is memoized in the module-level ``_llmcheck_cache``;
    on a non-200 status or any network/JSON failure the function logs a
    warning (failure case only) and returns ``None`` — callers must handle
    the missing-data case.
    """
    now = time.time()
    cached = _llmcheck_cache["data"]
    if cached and (now - _llmcheck_cache["timestamp"]) < LLMCHECK_CACHE_TTL:
        return cached
    try:
        resp = requests.get(LLMCHECK_URL, timeout=15)
        if resp.status_code == 200:
            payload = resp.json()
            _llmcheck_cache["data"] = payload
            _llmcheck_cache["timestamp"] = now
            return payload
    except Exception as e:
        print(f"[WARN] llmcheck fetch failed: {e}")
    return None
def parse_apple_silicon_chip(gpu_name: str) -> "str | None":
    """Extract the bare chip name (e.g. "M5 Max") from a GPU dropdown label.

    Labels look like "Apple M5 Max (128 GB)". Returns the chip name, or
    ``None`` for non-Apple labels or Apple labels that do not match the
    "M<digit> [suffix]" pattern. (The original annotation claimed ``str``,
    which was wrong for the None-returning paths.)
    """
    if not gpu_name.startswith("Apple"):
        return None
    # "M\d" captures the generation; the optional "\s+\w+" grabs one suffix
    # word such as "Pro"/"Max"/"Ultra". The "(NN GB)" tail can never match
    # "\w+" because of the "(", so "Apple M4 (16 GB)" yields just "M4".
    match = re.match(r"Apple (M\d(?:\s+\w+)?).*", gpu_name)
    return match.group(1) if match else None
def _bench_params_b(bench: dict) -> float:
    """Parse a llmcheck row's "params" string (e.g. "8B") into billions as float."""
    return float(re.sub(r'[^\d.]', '', bench["params"]))


def estimate_apple_silicon_tps(params_b: float, chip: str, quant: str = "Q4_K_M") -> dict:
    """
    Estimate inference speed (tok/s) for Apple Silicon.

    Tries, in order: an exact chip+size+quant match from llmcheck, a same-chip
    neighbour scaled by the parameter ratio, then a cross-chip extrapolation
    from M5 Max reference numbers. Returns a dict with tps, ttft, source,
    confidence ("high"/"medium"/"low"), and the benchmark model used; all
    fields stay None if no benchmark data is available.
    """
    result = {"tps": None, "ttft": None, "source": None, "confidence": "unknown", "model_used": None}
    llmcheck = fetch_llmcheck_benchmarks()
    if not llmcheck:
        return result
    benchmarks = llmcheck.get("benchmarks", [])
    # Collapse the UI quant labels onto the two quant levels llmcheck measures.
    quant_map = {
        "Q4_K_M (best)": "Q4_K_M",
        "Q4_K_S": "Q4_K_M",
        "Q4_0 / NF4": "Q4_K_M",
        "Q5_K_M": "Q4_K_M",  # approximate
        "Q6_K": "Q4_K_M",  # approximate
        "INT8 / Q8_0": "Q8_0",
        "Q8_0": "Q8_0",
    }
    llmcheck_quant = quant_map.get(quant, "Q4_K_M")
    # 1. Direct match: same chip, same quant, params within 0.5B.
    for b in benchmarks:
        if (abs(_bench_params_b(b) - params_b) < 0.5
                and b["chip"] == chip and b["quant"] == llmcheck_quant):
            result["tps"] = b["tps"]
            result["ttft"] = b["ttft"]
            result["source"] = "llmcheck measured"
            result["confidence"] = "high"
            result["model_used"] = b["model"]
            return result
    # 2. Same chip, closest params. tok/s scales ~inversely with model size;
    #    TTFT (prompt processing time) grows with it.
    same_chip = [b for b in benchmarks if b["chip"] == chip and b["quant"] == llmcheck_quant]
    if same_chip:
        closest = min(same_chip, key=lambda b: abs(_bench_params_b(b) - params_b))
        base_params = _bench_params_b(closest)
        scale = base_params / params_b if params_b > 0 else 1
        result["tps"] = round(closest["tps"] * scale)
        # BUGFIX: TTFT was previously multiplied by `scale`, which shrank the
        # estimate for larger models; a time must scale with the inverse of
        # the speed factor, so divide instead.
        result["ttft"] = round(closest["ttft"] / scale, 1) if closest.get("ttft") else None
        result["source"] = "llmcheck estimated (similar model on same chip)"
        result["confidence"] = "medium"
        result["model_used"] = closest["model"]
        return result
    # 3. Cross-chip: scale an M5 Max reference by the params ratio and the
    #    chip-speed ratio from CHIP_SCALE_FACTORS.
    m5max_refs = [b for b in benchmarks if b["chip"] == "M5 Max" and b["quant"] == llmcheck_quant]
    if m5max_refs:
        closest = min(m5max_refs, key=lambda b: abs(_bench_params_b(b) - params_b))
        base_params = _bench_params_b(closest)
        params_scale = base_params / params_b if params_b > 0 else 1
        chip_scale = CHIP_SCALE_FACTORS.get(chip, 1.0)
        ref_scale = CHIP_SCALE_FACTORS["M5 Max"]
        result["tps"] = round(closest["tps"] * params_scale * chip_scale / ref_scale)
        # BUGFIX: TTFT grows with model size and shrinks on faster chips; the
        # old code inverted the params direction and never normalized the
        # chip factor against the M5 Max reference the way the tps path does.
        result["ttft"] = (round(closest["ttft"] / params_scale * ref_scale / chip_scale, 1)
                          if closest.get("ttft") else None)
        result["source"] = "llmcheck estimated (scaled from M5 Max)"
        result["confidence"] = "low"
        result["model_used"] = closest["model"]
        return result
    return result
# ──────────────────────────────────────────────────────────────
# 1. GPU DATABASE (dbgpu → TechPowerUp, auto-updated + Apple Silicon from llmcheck)
# ──────────────────────────────────────────────────────────────
# Label -> {vram_gb: memory in GB, bw_gbs: memory bandwidth GB/s, tier}.
# NOTE(review): for Apple Silicon, vram_gb is the total unified-memory size;
# the GPU-usable share may be smaller in practice — confirm before treating
# it as fully available.
APPLE_SILICON = {
    # M5 series (newest, based on llmcheck benchmarks)
    "Apple M5 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M5 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M5 Pro (48 GB)": {"vram_gb": 48, "bw_gbs": 273, "tier": "Apple Silicon"},
    "Apple M5 Pro (24 GB)": {"vram_gb": 24, "bw_gbs": 273, "tier": "Apple Silicon"},
    "Apple M5 (16 GB)": {"vram_gb": 16, "bw_gbs": 120, "tier": "Apple Silicon"},
    # M4 series
    "Apple M4 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 819, "tier": "Apple Silicon"},
    "Apple M4 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 819, "tier": "Apple Silicon"},
    "Apple M4 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M4 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M4 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M4 Max (48 GB)": {"vram_gb": 48, "bw_gbs": 546, "tier": "Apple Silicon"},
    "Apple M4 Pro (48 GB)": {"vram_gb": 48, "bw_gbs": 273, "tier": "Apple Silicon"},
    "Apple M4 Pro (24 GB)": {"vram_gb": 24, "bw_gbs": 273, "tier": "Apple Silicon"},
    "Apple M4 (16 GB)": {"vram_gb": 16, "bw_gbs": 120, "tier": "Apple Silicon"},
    # M3 series
    "Apple M3 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 819, "tier": "Apple Silicon"},
    "Apple M3 Max (128 GB)": {"vram_gb": 128, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M3 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M3 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M3 Max (36 GB)": {"vram_gb": 36, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M3 Pro (36 GB)": {"vram_gb": 36, "bw_gbs": 150, "tier": "Apple Silicon"},
    "Apple M3 Pro (18 GB)": {"vram_gb": 18, "bw_gbs": 150, "tier": "Apple Silicon"},
    "Apple M3 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
    "Apple M3 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
    # M2 series
    "Apple M2 Ultra (192 GB)": {"vram_gb": 192, "bw_gbs": 800, "tier": "Apple Silicon"},
    "Apple M2 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 800, "tier": "Apple Silicon"},
    "Apple M2 Max (96 GB)": {"vram_gb": 96, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M2 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M2 Pro (32 GB)": {"vram_gb": 32, "bw_gbs": 200, "tier": "Apple Silicon"},
    "Apple M2 Pro (16 GB)": {"vram_gb": 16, "bw_gbs": 200, "tier": "Apple Silicon"},
    "Apple M2 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
    "Apple M2 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
    # M1 series
    "Apple M1 Ultra (128 GB)": {"vram_gb": 128, "bw_gbs": 800, "tier": "Apple Silicon"},
    "Apple M1 Max (64 GB)": {"vram_gb": 64, "bw_gbs": 400, "tier": "Apple Silicon"},
    "Apple M1 Pro (32 GB)": {"vram_gb": 32, "bw_gbs": 200, "tier": "Apple Silicon"},
    "Apple M1 Pro (16 GB)": {"vram_gb": 16, "bw_gbs": 200, "tier": "Apple Silicon"},
    "Apple M1 (16 GB)": {"vram_gb": 16, "bw_gbs": 100, "tier": "Apple Silicon"},
    "Apple M1 (8 GB)": {"vram_gb": 8, "bw_gbs": 100, "tier": "Apple Silicon"},
}
# Case-insensitive substrings used by build_gpu_database() to classify dbgpu
# card names into UI tiers; the first tier whose keyword list matches wins,
# and names matching nothing default to "Consumer".
TIER_KEYWORDS = {
    "Data Center": ["H200", "H100", "H800", "B200", "B100", "B300", "A100", "A800",
                    "A40", "L40", "L20", "V100", "P100", "MI3", "MI2", "MI1",
                    "MI325", "MI350", "MI355", "RTX PRO 6000", "RTX PRO 5000",
                    "Instinct", "GB10", "Jetson T5000", "Jetson T4000"],
    "Workstation": ["RTX 6000", "RTX 5000", "RTX 4000", "RTX 3000", "RTX A6000",
                    "RTX A5000", "RTX A4000", "Quadro", "W7900", "W7800", "W6800",
                    "Pro W", "PRO V"],
    "Laptop": ["Laptop", "Mobile", "Max-Q", "MXM", "Ti Laptop"],
}
def build_gpu_database():
    """Assemble the GPU lookup table: dbgpu (TechPowerUp) cards plus Apple Silicon.

    Returns {label: {"vram_gb", "bw_gbs", "tier"}}. Any dbgpu failure (missing
    package, bad spec row) is swallowed so the Apple Silicon entries are always
    present.
    """
    gpu_db = {}
    try:
        from dbgpu import GPUDatabase
        for spec in GPUDatabase.default().specs:
            try:
                vram = spec.memory_size_gb
                bandwidth = spec.memory_bandwidth_gb_s or 0
                maker = spec.manufacturer or ""
                card = spec.name or ""
                released = spec.release_date
                # Skip tiny-VRAM cards, non-big-three vendors, and pre-2017 parts.
                if not vram or vram < 4:
                    continue
                if maker not in ("NVIDIA", "AMD", "Intel"):
                    continue
                if released and released.year < 2017:
                    continue
                lowered = card.lower()
                # Mobile parts only make the cut at 16 GB or more.
                if any(kw.lower() in lowered for kw in ["Laptop", "Mobile", "Max-Q", "MXM"]) and vram < 16:
                    continue
                tier = next(
                    (t for t, kws in TIER_KEYWORDS.items()
                     if any(kw.lower() in lowered for kw in kws)),
                    "Consumer",
                )
                # Render whole-number VRAM without a trailing ".0".
                vram_label = int(vram) if vram == int(vram) else vram
                gpu_db[f"{maker} {card} ({vram_label} GB)"] = {
                    "vram_gb": vram, "bw_gbs": bandwidth, "tier": tier,
                }
            except Exception:
                continue
    except Exception as e:
        print(f"[WARN] dbgpu failed: {e}")
    gpu_db.update(APPLE_SILICON)
    return gpu_db
def get_gpu_choices():
    """Dropdown labels grouped by tier, each tier sorted by VRAM descending."""
    db = build_gpu_database()
    tiers = {"Data Center": [], "Workstation": [], "Consumer": [], "Apple Silicon": [], "Other": []}
    for label, info in db.items():
        tiers.get(info["tier"], tiers["Other"]).append((label, info["vram_gb"]))
    ordered = []
    # NOTE: the "Other" bucket is collected but never surfaced in the dropdown.
    for tier in ("Data Center", "Workstation", "Consumer", "Apple Silicon"):
        ordered.extend(label for label, _ in sorted(tiers[tier], key=lambda t: -t[1]))
    return ordered
# ──────────────────────────────────────────────────────────────
# 2. QUANTIZATION TABLE
# ──────────────────────────────────────────────────────────────
# Despite the "bpw" key name, values are BYTES per parameter (equivalently GB
# per billion params) — FP32 is 4.000 and BF16 is 2.000, and the calculators
# compute weights as `params * bpw / 1e9` GB. "color" is the UI bar colour.
QUANT_BPW = {
    "FP32 (32-bit)": {"bpw": 4.000, "color": "#ef4444", "desc": "Full precision. Training baseline. Rarely used for inference."},
    "BF16 / FP16": {"bpw": 2.000, "color": "#f97316", "desc": "Standard half-precision. Most HF checkpoints. Training standard."},
    "FP8 (H100/B200)": {"bpw": 1.000, "color": "#eab308", "desc": "Native on Hopper/Blackwell. Near-FP16 quality with 2x savings."},
    "INT8 / Q8_0": {"bpw": 1.000, "color": "#eab308", "desc": "8-bit. 50% smaller vs FP16, negligible quality loss."},
    "Q6_K": {"bpw": 0.781, "color": "#84cc16", "desc": "6-bit GGUF. Near-original quality. Good for quality-sensitive tasks."},
    "Q5_K_M": {"bpw": 0.688, "color": "#22c55e", "desc": "5-bit GGUF. Better quality than Q4 with minimal extra VRAM."},
    "Q4_K_M (best)": {"bpw": 0.567, "color": "#10b981", "desc": "MOST POPULAR. Best balance size vs quality. Recommended starting point."},
    "Q4_K_S": {"bpw": 0.534, "color": "#14b8a6", "desc": "4-bit smaller variant. Slightly lower quality than Q4_K_M."},
    "Q4_0 / NF4": {"bpw": 0.500, "color": "#06b6d4", "desc": "Basic 4-bit. NF4 variant used for QLoRA fine-tuning."},
    "IQ4_XS": {"bpw": 0.478, "color": "#3b82f6", "desc": "Importance-matrix 4-bit. Better quality than Q4_K_S at same size."},
    "Q3_K_M": {"bpw": 0.375, "color": "#8b5cf6", "desc": "3-bit GGUF. Noticeable quality drop. Only when severely VRAM-limited."},
    "Q2_K": {"bpw": 0.250, "color": "#a855f7", "desc": "2-bit. Maximum compression, significant quality loss."},
    "1.58-bit (BitNet)": {"bpw": 0.188, "color": "#ec4899", "desc": "Experimental ternary. Requires BitNet-native trained models."},
}
# ──────────────────────────────────────────────────────────────
# 3. MODEL METADATA FETCHER
# ──────────────────────────────────────────────────────────────
# Static fallback table: lowercase HF slug -> (param count, max context,
# architecture label, native dtype). Used by fetch_model_info() as the last
# resort and to fill gaps (context/arch) left by the live HF lookups.
KNOWN_MODELS = {
    "meta-llama/llama-3.1-8b": (8.03e9, 131072, "LLaMA", "BF16 / FP16"),
    "meta-llama/llama-3.1-70b": (70.6e9, 131072, "LLaMA", "BF16 / FP16"),
    "meta-llama/llama-3.1-405b": (405e9, 131072, "LLaMA", "BF16 / FP16"),
    "meta-llama/llama-3.2-3b": (3.21e9, 131072, "LLaMA", "BF16 / FP16"),
    "meta-llama/llama-3.2-1b": (1.24e9, 131072, "LLaMA", "BF16 / FP16"),
    "meta-llama/llama-4-scout": (109e9, 10000000, "LLaMA-4 MoE", "BF16 / FP16"),
    "meta-llama/llama-4-maverick": (400e9, 1000000, "LLaMA-4 MoE", "BF16 / FP16"),
    "microsoft/phi-4": (14.7e9, 16384, "Phi", "BF16 / FP16"),
    "microsoft/phi-3.5-mini": (3.82e9, 128000, "Phi", "BF16 / FP16"),
    "microsoft/phi-3-mini": (3.82e9, 4096, "Phi", "BF16 / FP16"),
    "microsoft/phi-2": (2.78e9, 2048, "Phi", "BF16 / FP16"),
    "mistralai/mistral-7b": (7.24e9, 32768, "Mistral", "BF16 / FP16"),
    "mistralai/mistral-nemo": (12.2e9, 128000, "Mistral", "BF16 / FP16"),
    "mistralai/mixtral-8x7b": (46.7e9, 32768, "Mixtral MoE", "BF16 / FP16"),
    "mistralai/mixtral-8x22b": (141e9, 65536, "Mixtral MoE", "BF16 / FP16"),
    "qwen/qwen2.5-7b": (7.62e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen2.5-14b": (14.8e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen2.5-32b": (32.5e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen2.5-72b": (72.7e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-0.6b": (0.6e9, 32768, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-1.7b": (1.7e9, 32768, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-4b": (4.0e9, 32768, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-8b": (8.19e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-14b": (14.8e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-32b": (32.8e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-72b": (72.7e9, 131072, "Qwen", "BF16 / FP16"),
    "qwen/qwen3-235b-a22b": (235e9, 131072, "Qwen MoE", "BF16 / FP16"),
    "deepseek-ai/deepseek-v3": (671e9, 163840, "DeepSeek MoE", "BF16 / FP16"),
    "deepseek-ai/deepseek-r1": (671e9, 163840, "DeepSeek MoE", "BF16 / FP16"),
    "deepseek-ai/deepseek-v2": (236e9, 131072, "DeepSeek MoE", "BF16 / FP16"),
    "google/gemma-2-2b": (2.61e9, 8192, "Gemma", "BF16 / FP16"),
    "google/gemma-2-9b": (9.24e9, 8192, "Gemma", "BF16 / FP16"),
    "google/gemma-2-27b": (27.2e9, 8192, "Gemma", "BF16 / FP16"),
    "google/gemma-3-27b": (27e9, 131072, "Gemma", "BF16 / FP16"),
    "openai-community/gpt2": (124e6, 1024, "GPT-2", "FP32 (32-bit)"),
    "tiiuae/falcon-7b": (7.0e9, 2048, "Falcon", "BF16 / FP16"),
    "tiiuae/falcon-40b": (40.0e9, 2048, "Falcon", "BF16 / FP16"),
}
def fetch_model_info(model_slug: str, hf_token: str = "") -> dict:
    """Resolve model metadata (param count, context, arch, dtype) for a HF slug.

    Tries, in order: (a) the HF models API (safetensors total + tags),
    (b) the repo's config.json (context length, dtype, MoE flag, and an
    architecture-based parameter estimate), (c) model.safetensors.index.json
    total_size assuming 2 bytes/param, (d) a "-8B-" style name heuristic, and
    (e) the static KNOWN_MODELS table. Every network stage is best-effort —
    failures are swallowed and the next stage runs. On success "params" and
    "params_b" are set; otherwise "error" carries a user-facing message.
    """
    result = {"params": None, "params_b": None, "max_context": 4096,
              "arch": "Unknown", "dtype": "BF16 / FP16", "source": "",
              "error": None, "is_moe": False}
    model_slug = model_slug.strip().strip("/")
    # A valid slug must be "org/model".
    if not model_slug or "/" not in model_slug:
        result["error"] = "Enter a valid HuggingFace slug — e.g. `meta-llama/Llama-3.1-8B-Instruct`"
        return result
    # Token is optional; it is needed for gated/private repos.
    headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
    # (a) HF API: authoritative parameter count from safetensors metadata.
    try:
        r = requests.get(f"https://huggingface.co/api/models/{model_slug}",
                         headers=headers, timeout=12)
        if r.status_code == 200:
            data = r.json()
            st = data.get("safetensors", {})
            if st and st.get("total", 0) > 0:
                result["params"] = int(st["total"])
                result["source"] = "safetensors metadata"
            # Map repo tags onto a coarse architecture label; the first
            # matching tag wins and stops the scan.
            tags = [t.lower() for t in (data.get("tags") or [])]
            for t in tags:
                if "llama" in t: result["arch"] = "LLaMA"; break
                if "mistral" in t: result["arch"] = "Mistral"; break
                if "mixtral" in t: result["arch"] = "Mixtral MoE"; break
                if "qwen" in t: result["arch"] = "Qwen"; break
                if "gemma" in t: result["arch"] = "Gemma"; break
                if "phi" in t: result["arch"] = "Phi"; break
                if "falcon" in t: result["arch"] = "Falcon"; break
                if "gpt" in t: result["arch"] = "GPT"; break
            if any("moe" in t or "mixture" in t for t in tags):
                result["is_moe"] = True
    except Exception:
        pass  # best-effort: fall through to the next source
    # (b) config.json: context length, dtype, MoE flag, and a parameter
    # estimate from hidden size / layers / vocab when (a) gave no count.
    if not result["params"]:
        try:
            r = requests.get(
                f"https://huggingface.co/{model_slug}/resolve/main/config.json",
                headers=headers, timeout=12)
            if r.status_code == 200:
                cfg = r.json()
                result["arch"] = cfg.get("model_type", result["arch"]).replace("_", " ").title()
                # Different architectures name the context-length field differently.
                ctx = (cfg.get("max_position_embeddings") or cfg.get("max_sequence_length")
                       or cfg.get("n_positions") or cfg.get("seq_length"))
                if ctx:
                    result["max_context"] = int(ctx)
                if "float32" in str(cfg.get("torch_dtype", "")):
                    result["dtype"] = "FP32 (32-bit)"
                if cfg.get("num_experts") or cfg.get("num_local_experts"):
                    result["is_moe"] = True
                h = cfg.get("hidden_size") or cfg.get("d_model") or cfg.get("n_embd")
                L = cfg.get("num_hidden_layers") or cfg.get("n_layer")
                ffn = cfg.get("intermediate_size")
                vocab = cfg.get("vocab_size")
                if h and L and vocab:
                    # Rough transformer count: per layer, attention ~4*h^2 plus
                    # MLP ~2*h*ffn (or 8*h^2 when ffn is unknown), plus the
                    # embedding matrix vocab*h.
                    p = L * (4*h*h + (2*h*ffn if ffn else 8*h*h)) + vocab*h
                    if p > 1_000_000:  # discard degenerate/toy configs
                        result["params"] = int(p)
                        result["source"] = "config.json arch inference"
        except Exception:
            pass
    # (c) safetensors index: total byte size, converted assuming 2 bytes/param.
    if not result["params"]:
        try:
            r = requests.get(
                f"https://huggingface.co/{model_slug}/resolve/main/model.safetensors.index.json",
                headers=headers, timeout=12)
            if r.status_code == 200:
                idx = r.json()
                sz = idx.get("metadata", {}).get("total_size", 0)
                if sz > 0:
                    result["params"] = sz // 2  # BF16: 2 bytes per parameter
                    result["source"] = "safetensors index (BF16 assumed)"
        except Exception:
            pass
    # (d) Name heuristic: look for an "...-8B-..." / "...-70b" size marker.
    if not result["params"]:
        for pat in [r'[\-\_\/](\d+(?:\.\d+)?)[Bb][\-\_\s\.]',
                    r'[\-\_\/](\d+(?:\.\d+)?)[Bb]$',
                    r'^(\d+(?:\.\d+)?)[Bb][\-\_]']:
            m = re.search(pat, model_slug)
            if m:
                b = float(m.group(1))
                # Sanity range: 50M to 10T parameters.
                if 0.05 <= b <= 10000:
                    result["params"] = int(b * 1e9)
                    result["source"] = f"name heuristic ({b}B)"
                break
    # (e) Known model table: fills params and any still-default fields
    # (context left at 4096, arch left Unknown, MoE flag).
    key = model_slug.lower()
    for known, (p, ctx, arch, dtype) in KNOWN_MODELS.items():
        if key == known or key.startswith(known + "-") or key.startswith(known + "_"):
            if not result["params"]:
                result["params"] = int(p)
                result["source"] = "known model table"
            if result["max_context"] == 4096:
                result["max_context"] = ctx
            if result["arch"] == "Unknown":
                result["arch"] = arch
            if "MoE" in arch:
                result["is_moe"] = True
            break
    if result["params"]:
        result["params_b"] = result["params"] / 1e9
    else:
        result["error"] = ("Could not determine parameter count. "
                           "Try a HF token for gated models, or use the manual override.")
    return result
| # ────────────────────────────────────────────────────────────── | |
| # 4. VRAM CALCULATION ENGINE | |
| # ────────────────────────────────────────────────────────────── | |
def calc_inference(params, quant_key, context_len, batch_size):
    """VRAM estimate (GB) for inference: weights + KV cache + activations + overhead."""
    bytes_per_param = QUANT_BPW[quant_key]["bpw"]
    weights_gb = params * bytes_per_param / 1e9
    # Layer-count heuristic fitted so a 7B model lands around 28 layers.
    layers = max(16, int(28 * (params / 7e9) ** 0.45))
    kv_heads = 8    # assumed GQA-style KV head count (not read from the model)
    head_dim = 128  # assumed head dimension
    # K and V tensors (×2), fp16 (2 bytes), per layer, per token, per batch row.
    kv_gb = (2 * kv_heads * head_dim * 2 * layers * context_len * batch_size) / 1e9
    acts_gb = weights_gb * 0.05                # flash attention keeps activations small
    overhead_gb = max(0.5, weights_gb * 0.05)  # framework/runtime floor of 0.5 GB
    return {
        "total": weights_gb + kv_gb + acts_gb + overhead_gb,
        "weights": weights_gb,
        "kv": kv_gb,
        "acts": acts_gb,
        "overhead": overhead_gb,
    }
def calc_full_ft(params, context_len, batch_size):
    """VRAM estimate (GB) for full fine-tuning with BF16 weights and FP32 AdamW."""
    weights_gb = params * 2 / 1e9    # BF16 weights: 2 bytes/param
    grads_gb = params * 4 / 1e9      # FP32 gradients: 4 bytes/param
    optimizer_gb = params * 8 / 1e9  # AdamW m+v states in FP32: 8 bytes/param
    # Activations grow linearly past the 2048-token baseline and with batch
    # size; the 1.5x-of-weights factor assumes gradient checkpointing.
    seq_scale = max(1.0, context_len / 2048)
    acts_gb = weights_gb * 1.5 * seq_scale * max(1.0, batch_size)
    overhead_gb = max(1.0, weights_gb * 0.1)
    return {
        "total": weights_gb + grads_gb + optimizer_gb + acts_gb + overhead_gb,
        "weights": weights_gb,
        "grads": grads_gb,
        "optimizer": optimizer_gb,
        "acts": acts_gb,
        "overhead": overhead_gb,
    }
def calc_lora(params, quant_key, context_len, batch_size, lora_rank):
    """VRAM estimate (GB) for LoRA: quantized frozen base + small BF16 adapters."""
    bytes_per_param = QUANT_BPW[quant_key]["bpw"]
    weights_gb = params * bytes_per_param / 1e9
    # Heuristic adapter fraction: 2*rank projections over a 4096-wide model,
    # with ~30% of modules targeted.
    trainable_ratio = (2 * lora_rank) / 4096 * 0.30
    trainable = int(params * trainable_ratio)
    adapter_gb = trainable * 2 / 1e9    # BF16 adapter weights
    grads_gb = trainable * 4 / 1e9      # FP32 gradients, adapters only
    optimizer_gb = trainable * 8 / 1e9  # FP32 AdamW states, adapters only
    acts_gb = weights_gb * 0.8 * max(1.0, context_len / 2048) * max(1.0, batch_size)
    overhead_gb = max(0.5, weights_gb * 0.05)
    return {
        "total": weights_gb + adapter_gb + grads_gb + optimizer_gb + acts_gb + overhead_gb,
        "weights": weights_gb,
        "adapter": adapter_gb,
        "grads": grads_gb,
        "optimizer": optimizer_gb,
        "acts": acts_gb,
        "overhead": overhead_gb,
        "tp": trainable,
        "tpct": trainable_ratio * 100,
    }
def calc_qlora(params, context_len, batch_size, lora_rank):
    """VRAM estimate (GB) for QLoRA: NF4 frozen base (0.5 B/param) + BF16 adapters."""
    weights_gb = params * 0.5 / 1e9  # NF4 packs roughly 0.5 bytes per parameter
    trainable_ratio = (2 * lora_rank) / 4096 * 0.30  # same heuristic as calc_lora
    trainable = int(params * trainable_ratio)
    adapter_gb = trainable * 2 / 1e9    # BF16 adapter weights
    grads_gb = trainable * 4 / 1e9      # FP32 adapter gradients
    optimizer_gb = trainable * 8 / 1e9  # paged AdamW states (FP32)
    dequant_gb = weights_gb * 0.05      # transient dequantization buffers
    acts_gb = weights_gb * 0.5 * max(1.0, context_len / 2048) * max(1.0, batch_size)
    overhead_gb = max(0.5, weights_gb * 0.08)
    return {
        "total": weights_gb + adapter_gb + grads_gb + optimizer_gb + dequant_gb + acts_gb + overhead_gb,
        "weights": weights_gb,
        "adapter": adapter_gb,
        "grads": grads_gb,
        "optimizer": optimizer_gb,
        "dequant": dequant_gb,
        "acts": acts_gb,
        "overhead": overhead_gb,
        "tp": trainable,
        "tpct": trainable_ratio * 100,
    }
def gpu_compat(required_gb, gpu_name, n_gpus):
    """Return (emoji, message, utilisation %) for fitting required_gb on n_gpus × gpu_name."""
    gpu = build_gpu_database().get(gpu_name)
    if not gpu:
        return "❓", "GPU not in database", 999
    available = gpu["vram_gb"] * n_gpus
    # A 5% safety margin is applied to the requirement before comparing.
    pct = required_gb * 1.05 / available * 100
    gpus_needed = math.ceil(required_gb * 1.05 / gpu["vram_gb"])
    summary = f"{required_gb:.1f} GB needed / {available:.0f} GB available"
    if pct <= 75:
        return "✅", f"{summary} — fits comfortably", pct
    if pct <= 100:
        return "⚠️", f"{summary} — tight fit", pct
    return "❌", f"{summary} — need ≥{gpus_needed}× GPUs", pct
| # ────────────────────────────────────────────────────────────── | |
| # 5. HTML RENDERING HELPERS | |
| # ────────────────────────────────────────────────────────────── | |
def bar_html(val, total_val, color="#10b981"):
    """Render one horizontal usage bar with a GB label as an inline-styled div."""
    # Clamp against zero/tiny totals and cap the fill at 100%.
    pct = min(100, val / max(total_val, 0.001) * 100)
    fill_px = int(pct * 240 / 100)  # the bar track is 240px wide
    track = (f'<div style="width:240px;background:#1e293b;border-radius:3px;height:8px;flex-shrink:0">'
             f'<div style="width:{fill_px}px;background:{color};height:8px;border-radius:3px"></div></div>')
    label = f'<code style="color:#94a3b8;font-size:12px;white-space:nowrap">{val:.2f} GB</code>'
    return f'<div style="display:flex;align-items:center;gap:10px;margin:2px 0">{track}{label}</div>'
def compat_badge(emoji, msg, pct):
    """Render the GPU-fit status badge; colour keys off the emoji (green/amber/red)."""
    if emoji == "✅":
        col = "#10b981"
    elif emoji == "⚠️":
        col = "#f59e0b"
    else:
        col = "#ef4444"
    return (f'<div style="background:{col}18;border:1px solid {col}55;border-radius:8px;'
            f'padding:10px 14px;margin-top:12px;color:{col};font-size:13px;font-family:monospace">'
            f'{emoji} {msg}</div>')
def result_card(title, rows, total, gpu_name, n_gpus, accent="#a78bfa"):
    """Render one estimate card: title, GB total, per-component bars, GPU fit badge.

    `rows` is a list of (label, gigabytes, bar colour) tuples.
    """
    emoji, msg, pct = gpu_compat(total, gpu_name, n_gpus)
    row_cells = []
    for label, gb, bar_color in rows:
        row_cells.append(
            f'<tr><td style="padding:5px 12px 5px 0;color:#64748b;font-size:12px;'
            f'white-space:nowrap;vertical-align:middle">{label}</td>'
            f'<td style="padding:5px 0;vertical-align:middle">{bar_html(gb, total, bar_color)}</td></tr>'
        )
    rows_html = "".join(row_cells)
    return (
        f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
        f'padding:22px 24px;margin:0 0 12px">'
        f'<div style="font-size:14px;font-weight:700;color:{accent};'
        f'font-family:monospace;margin-bottom:10px">{title}</div>'
        f'<div style="font-size:28px;font-weight:800;color:#f8fafc;'
        f'font-family:monospace;margin-bottom:16px">'
        f'{total:.2f} <span style="font-size:13px;color:#334155;font-weight:400">GB total</span></div>'
        f'<table style="width:100%;border-collapse:collapse">{rows_html}</table>'
        f'<div style="border-top:1px solid #1e293b;margin-top:14px;padding-top:12px;'
        f'font-size:11px;color:#334155;font-family:monospace">'
        f'GPU: <span style="color:#475569">{gpu_name} × {n_gpus}</span>'
        f'{compat_badge(emoji, msg, pct)}</div></div>'
    )
| # ────────────────────────────────────────────────────────────── | |
| # 6. MAIN CALCULATE | |
| # ────────────────────────────────────────────────────────────── | |
def calculate(model_slug, hf_token, quant_key, context_len, batch_size,
              lora_rank, gpu_name, n_gpus, manual_params_b, use_manual):
    """Top-level handler: resolve the model, then render all result panels.

    Returns six HTML strings for the UI outputs: model info card, inference
    card (plus an optional Apple Silicon speed panel), full fine-tune card,
    LoRA card, QLoRA card, and a trailing empty slot. Error paths return one
    error card followed by five empty strings.
    """
    # Manual override bypasses the network lookup entirely.
    if use_manual and manual_params_b > 0:
        params = int(manual_params_b * 1e9)
        info = {"params": params, "params_b": manual_params_b, "max_context": 4096,
                "arch": "Manual", "dtype": quant_key, "source": "manual override",
                "error": None, "is_moe": False}
    else:
        info = fetch_model_info(model_slug.strip(), hf_token.strip())
    if info["error"] and not use_manual:
        return (f'<div style="background:#7f1d1d22;border:1px solid #ef4444;border-radius:10px;'
                f'padding:14px;color:#fca5a5;font-family:monospace;font-size:13px">'
                f'⚠️ {info["error"]}</div>'), "", "", "", "", ""
    params = info["params"] or 0
    if params == 0:
        return '<div style="color:#ef4444;font-family:monospace">❌ No parameter count available.</div>', "", "", "", "", ""
    params_b = params / 1e9
    # Run all four VRAM models up front.
    infer = calc_inference(params, quant_key, context_len, batch_size)
    full = calc_full_ft(params, context_len, batch_size)
    lora = calc_lora(params, quant_key, context_len, batch_size, lora_rank)
    ql = calc_qlora(params, context_len, batch_size, lora_rank)
    # Small purple "MoE" pill shown next to the model name when applicable.
    moe_badge = (' <span style="background:#7c3aed33;color:#a78bfa;font-size:10px;'
                 'border-radius:4px;padding:2px 6px;margin-left:6px">MoE</span>'
                 if info.get("is_moe") else "")
    # Model info panel
    bpw = QUANT_BPW[quant_key]["bpw"]
    model_html = (
        f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
        f'padding:22px 24px;margin-bottom:12px;font-family:monospace">'
        f'<div style="font-size:10px;letter-spacing:3px;text-transform:uppercase;'
        f'color:#334155;margin-bottom:6px">Model</div>'
        f'<div style="font-size:18px;font-weight:800;color:#a78bfa;margin-bottom:16px;word-break:break-all">'
        f'{model_slug.strip() if not use_manual else "Manual Entry"}{moe_badge}</div>'
        f'<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:10px">'
        f'<div style="background:#111827;border-radius:8px;padding:12px">'
        f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Parameters</div>'
        f'<div style="color:#f8fafc;font-size:20px;font-weight:800">{params_b:.2f}B</div>'
        f'<div style="color:#334155;font-size:11px">{params:,}</div></div>'
        f'<div style="background:#111827;border-radius:8px;padding:12px">'
        f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Architecture</div>'
        f'<div style="color:#f8fafc;font-size:14px;font-weight:700">{info.get("arch","?")}</div>'
        f'<div style="color:#334155;font-size:11px">{info.get("dtype","?")}</div></div>'
        f'<div style="background:#111827;border-radius:8px;padding:12px">'
        f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Max Context</div>'
        f'<div style="color:#f8fafc;font-size:14px;font-weight:700">{info.get("max_context",4096):,}</div>'
        f'<div style="color:#334155;font-size:11px">tokens</div></div>'
        f'<div style="background:#111827;border-radius:8px;padding:12px">'
        f'<div style="color:#334155;font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:4px">Quantization</div>'
        f'<div style="color:#f8fafc;font-size:12px;font-weight:700">{quant_key}</div>'
        f'<div style="color:#334155;font-size:11px">{bpw} B/param</div></div>'
        f'</div>'
        f'<div style="margin-top:10px;font-size:11px;color:#1e293b">source: {info.get("source","?")}</div>'
        f'</div>'
    )
    # Check if Apple Silicon GPU selected - add performance estimate, but only
    # when the inference footprint actually fits the selected configuration.
    apple_chip = parse_apple_silicon_chip(gpu_name)
    perf_html = ""
    if apple_chip and infer["total"] <= build_gpu_database().get(gpu_name, {}).get("vram_gb", 0) * n_gpus:
        perf = estimate_apple_silicon_tps(params_b, apple_chip, quant_key)
        if perf["tps"]:
            # Badge colour/label tracks the estimate confidence level.
            confidence_colors = {"high": "#10b981", "medium": "#f59e0b", "low": "#64748b"}
            conf_color = confidence_colors.get(perf["confidence"], "#64748b")
            conf_label = {"high": "Measured", "medium": "Estimated", "low": "Rough estimate"}
            conf_text = conf_label.get(perf["confidence"], "Unknown")
            perf_html = (
                f'<div style="background:#0f172a;border:1px solid #1e293b;border-radius:14px;'
                f'padding:16px 20px;margin-top:12px;font-family:monospace">'
                f'<div style="font-size:10px;letter-spacing:2px;text-transform:uppercase;color:#334155;margin-bottom:8px">'
                f'Apple Silicon Inference Speed</div>'
                f'<div style="display:flex;align-items:center;gap:16px">'
                f'<div style="background:#111827;border-radius:8px;padding:12px 16px">'
                f'<div style="color:#334155;font-size:10px;text-transform:uppercase;margin-bottom:2px">Speed</div>'
                f'<div style="color:#f8fafc;font-size:24px;font-weight:800">~{perf["tps"]} <span style="font-size:14px;color:#475569">tok/s</span></div></div>'
                f'<div style="background:#111827;border-radius:8px;padding:12px 16px">'
                f'<div style="color:#334155;font-size:10px;text-transform:uppercase;margin-bottom:2px">TTFT</div>'
                f'<div style="color:#f8fafc;font-size:24px;font-weight:800">{perf["ttft"] or "?"} <span style="font-size:14px;color:#475569">s</span></div></div>'
                f'<div style="background:{conf_color}18;border:1px solid {conf_color}55;border-radius:8px;padding:8px 12px">'
                f'<div style="color:{conf_color};font-size:11px;font-weight:600">{conf_text}</div>'
                f'<div style="color:#334155;font-size:9px">based on {perf["model_used"] or "?"}</div></div></div>'
                f'<div style="margin-top:10px;font-size:10px;color:#475569">'
                f'Source: <a href="https://llmcheck.net" style="color:#64748b" target="_blank">llmcheck.net</a> '
                f'(CC BY 4.0) · {perf["source"] or ""}</div></div>'
            )
    # Four result cards; the inference card carries the optional perf panel.
    infer_html = result_card(
        "🚀 Inference",
        [("Model Weights", infer["weights"], "#a78bfa"),
         (f"KV Cache {context_len:,} ctx × batch {batch_size}", infer["kv"], "#60a5fa"),
         ("Activations (Flash Attn)", infer["acts"], "#34d399"),
         ("Framework Overhead", infer["overhead"], "#475569")],
        infer["total"], gpu_name, n_gpus, "#a78bfa"
    ) + perf_html
    ft_html = result_card(
        "🎓 Full Fine-Tune <span style='font-weight:400;font-size:12px;color:#334155'>(BF16 weights + FP32 Adam)</span>",
        [("Weights BF16 2 B/param", full["weights"], "#a78bfa"),
         ("Gradients FP32 4 B/param", full["grads"], "#f97316"),
         ("AdamW States 8 B/param", full["optimizer"], "#ef4444"),
         (f"Activations (grad ckpt) ctx={context_len:,}", full["acts"], "#60a5fa"),
         ("Overhead", full["overhead"], "#475569")],
        full["total"], gpu_name, n_gpus, "#f97316"
    )
    lora_html = result_card(
        f"🔧 LoRA <span style='font-weight:400;font-size:12px;color:#334155'>rank={lora_rank} {lora['tpct']:.2f}% trainable</span>",
        [(f"Base Weights {quant_key}", lora["weights"], "#a78bfa"),
         (f"LoRA Adapters BF16 {lora['tp']:,} params", lora["adapter"], "#34d399"),
         ("Adapter Grads FP32", lora["grads"], "#f97316"),
         ("Adapter AdamW FP32", lora["optimizer"], "#ef4444"),
         ("Activations", lora["acts"], "#60a5fa"),
         ("Overhead", lora["overhead"], "#475569")],
        lora["total"], gpu_name, n_gpus, "#34d399"
    )
    ql_html = result_card(
        f"⚡ QLoRA <span style='font-weight:400;font-size:12px;color:#334155'>NF4 base + BF16 adapters rank={lora_rank}</span>",
        [("Base NF4 0.5 B/param", ql["weights"], "#a78bfa"),
         (f"LoRA Adapters BF16 {ql['tp']:,} params", ql["adapter"], "#34d399"),
         ("Adapter Grads FP32", ql["grads"], "#f97316"),
         ("Paged AdamW FP32", ql["optimizer"], "#ef4444"),
         ("Dequant Buffers temp FP16", ql["dequant"], "#fb923c"),
         ("Activations", ql["acts"], "#60a5fa"),
         ("Overhead", ql["overhead"], "#475569")],
        ql["total"], gpu_name, n_gpus, "#facc15"
    )
    return model_html, infer_html, ft_html, lora_html, ql_html, ""
def auto_fetch(model_slug, hf_token):
    """Resolve a HuggingFace slug and return Gradio updates for the form.

    Returns a 4-tuple of updates for (params field, context dropdown,
    status markdown, manual-override checkbox). On lookup failure only the
    status slot carries the error; everything else is left untouched.
    """
    info = fetch_model_info(model_slug.strip(), hf_token.strip())
    error = info["error"]
    if error:
        # Do not disturb the user's current inputs; just surface the error.
        return gr.update(), gr.update(), f"⚠️ {error}", gr.update()
    max_ctx = info.get("max_context", 4096)
    # Snap the model's reported context window to the closest UI preset.
    presets = (512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 1000000)
    nearest = min(presets, key=lambda opt: abs(opt - max_ctx))
    status = (f"✅ **{info['params_b']:.2f}B params** · {info['arch']} · "
              f"ctx {max_ctx:,} · source: {info['source']}")
    return (
        gr.update(value=info["params_b"]),
        gr.update(value=nearest),
        status,
        gr.update(value=True),
    )
| # ────────────────────────────────────────────────────────────── | |
| # 7. UI - Clean, intuitive workflow | |
| # ────────────────────────────────────────────────────────────── | |
# Global stylesheet injected into the Gradio app. Only the DM Sans / DM Mono
# families are @import-ed, so every font-family rule below uses them; the
# previous references to the never-imported IBM Plex families silently fell
# back to the browser's generic sans-serif/monospace fonts.
CSS = """
/* Warm, soft palette - easy on eyes */
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;600;700&family=DM+Mono:wght@400;500&display=swap');
:root {
  --bg-primary: #f5f3f0;
  --bg-secondary: #ebe8e4;
  --bg-card: #fdfcfb;
  --bg-warm: #f9f7f5;
  --text-primary: #3d3d3d;
  --text-secondary: #5c5c5c;
  --text-muted: #8a8a8a;
  --accent: #d97706;
  --accent-light: #fef7ed;
  --accent-dark: #b45309;
  --success: #16a34a;
  --success-bg: #f0fdf4;
  --success-border: #86efac;
  --warning: #ca8a04;
  --warning-bg: #fefce8;
  --warning-border: #fde047;
  --error: #dc2626;
  --error-bg: #fef2f2;
  --error-border: #fca5a5;
  --border: #e5e2de;
  --border-strong: #d1ccc6;
}
body, .gradio-container {
  font-family: 'DM Sans', sans-serif !important;
  background: var(--bg-primary) !important;
  color: var(--text-primary) !important;
}
.gradio-container {
  max-width: 1400px !important;
  margin: 0 auto !important;
  padding: 16px 32px !important;
}
/* Clean card styling */
.gr-box, .gr-panel {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
  border-radius: 12px !important;
}
/* Better labels */
label, .gr-label {
  color: var(--text-secondary) !important;
  font-size: 13px !important;
  font-weight: 500 !important;
  letter-spacing: 0 !important;
  text-transform: none !important;
}
/* Inputs */
input, textarea, select {
  background: var(--bg-card) !important;
  color: var(--text-primary) !important;
  border: 1px solid var(--border-strong) !important;
  font-family: 'DM Sans', sans-serif !important;
  font-size: 14px !important;
  padding: 12px !important;
}
input:focus, select:focus {
  border-color: var(--accent) !important;
  outline: none !important;
  box-shadow: 0 0 0 3px var(--accent-light) !important;
}
/* Buttons */
button.primary {
  background: var(--accent) !important;
  color: white !important;
  border: none !important;
  font-family: 'DM Sans', sans-serif !important;
  font-weight: 700 !important;
  font-size: 16px !important;
  padding: 16px 32px !important;
  border-radius: 8px !important;
  box-shadow: 0 2px 8px rgba(217, 119, 6, 0.25) !important;
}
button.primary:hover {
  background: var(--accent-dark) !important;
  transform: translateY(-1px) !important;
}
button.secondary {
  background: var(--bg-card) !important;
  color: var(--text-primary) !important;
  border: 2px solid var(--border-strong) !important;
  font-family: 'DM Sans', sans-serif !important;
}
/* Radio/Mode selector - very visible selected state */
.gr-radio {
  background: var(--bg-card) !important;
  border: 2px solid var(--border-strong) !important;
  border-radius: 12px !important;
  padding: 16px !important;
}
.gr-radio label {
  font-size: 15px !important;
  font-weight: 600 !important;
  padding: 12px 20px !important;
  border-radius: 8px !important;
  margin: 4px !important;
  background: var(--bg-warm) !important;
  border: 2px solid transparent !important;
  transition: all 0.2s !important;
}
.gr-radio label.selected {
  background: var(--accent) !important;
  color: white !important;
  border: 2px solid var(--accent-dark) !important;
  box-shadow: 0 2px 8px rgba(217, 119, 6, 0.4) !important;
}
.gr-radio input[type="radio"] {
  display: none !important;
}
/* Tabs */
.tab-nav {
  background: var(--bg-secondary) !important;
  border-radius: 10px !important;
  padding: 6px !important;
  margin-bottom: 16px !important;
  border: 2px solid var(--border) !important;
}
.tab-nav button {
  color: var(--text-secondary) !important;
  font-family: 'DM Sans', sans-serif !important;
  font-size: 15px !important;
  font-weight: 600 !important;
  padding: 12px 24px !important;
  border-radius: 6px !important;
}
.tab-nav button.selected {
  color: white !important;
  background: var(--accent) !important;
  box-shadow: 0 2px 6px rgba(217, 119, 6, 0.3) !important;
}
/* Sliders */
input[type=range] {
  accent-color: var(--accent) !important;
}
/* Hide unnecessary elements */
footer, .built-with, #component-0 > .svelte-1gf513q { display: none !important; }
/* Accordion styling */
.accordion {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
}
details summary {
  color: var(--text-secondary) !important;
  font-weight: 500 !important;
}
/* Number input styling */
input[type=number] {
  font-family: 'DM Mono', monospace !important;
}
/* Dropdown styling */
.gr-dropdown {
  font-size: 14px !important;
}
/* Markdown styling */
.gr-markdown {
  font-family: 'DM Sans', sans-serif !important;
  color: var(--text-primary) !important;
}
.gr-markdown p {
  font-size: 14px !important;
  line-height: 1.6 !important;
}
.gr-markdown code {
  font-family: 'DM Mono', monospace !important;
  background: var(--bg-secondary) !important;
  padding: 2px 6px !important;
  border-radius: 4px !important;
}
/* Hide Gradio's default header styling */
.contain .top-container { display: none !important; }
"""
# Snapshot of GPU dropdown labels, built once at import time.
GPU_CHOICES = get_gpu_choices()
# Number of known GPUs (kept for display/diagnostics).
GPU_COUNT = len(GPU_CHOICES)
# Default dropdown selection: prefer an RTX 4090 entry when present,
# otherwise the first label. NOTE(review): assumes GPU_CHOICES is non-empty;
# an empty list would raise IndexError here at import time.
DEFAULT_GPU = next((g for g in GPU_CHOICES if "RTX 4090" in g), GPU_CHOICES[0])
def build_result_html(result_type, fits, required_gb, available_gb, details, gpu_name, n_gpus, formulas=""):
    """Build result HTML with YES/NO verdict, VRAM breakdown, and formulas.

    Args:
        result_type: Scenario label, e.g. "Inference" or "Full Fine-tune".
        fits: Verdict emoji from gpu_compat: "✅" fits, "⚠️" tight, anything
            else (normally "❌") means it doesn't fit.
        required_gb: Estimated VRAM requirement in GB.
        available_gb: TOTAL VRAM of the configuration (per-GPU VRAM × n_gpus).
        details: Pre-rendered HTML for the breakdown section.
        gpu_name: Display name of the selected GPU.
        n_gpus: Number of GPUs in the user's configuration.
        formulas: Optional HTML snippet explaining the math.

    Returns:
        str: HTML fragment for the result card.
    """
    status = "YES" if fits == "✅" else ("MAYBE" if fits == "⚠️" else "NO")
    status_color = "var(--success)" if fits == "✅" else ("var(--warning)" if fits == "⚠️" else "var(--error)")
    status_bg = "var(--success-bg)" if fits == "✅" else ("var(--warning-bg)" if fits == "⚠️" else "var(--error-bg)")
    status_border = "var(--success-border)" if fits == "✅" else ("var(--warning-border)" if fits == "⚠️" else "var(--error-border)")
    if fits == "❌":
        # When the model doesn't fit, estimate how many GPUs of this model
        # WOULD be needed, with 5% headroom for fragmentation. available_gb
        # is the total across n_gpus, so derive per-GPU capacity first — the
        # previous version divided by the total, under-counting the GPUs
        # needed for multi-GPU configurations.
        per_gpu_gb = available_gb / max(n_gpus, 1)
        gpus_needed = math.ceil(required_gb * 1.05 / per_gpu_gb)
    else:
        gpus_needed = n_gpus
    return f'''
<div style="border:2px solid var(--border);border-radius:12px;overflow:hidden;background:var(--bg-card)">
<!-- YES/NO Header -->
<div style="background:{status_bg};padding:24px;border-bottom:2px solid {status_border}">
<div style="display:flex;align-items:center;gap:24px">
<div style="font-size:48px;font-weight:700;color:{status_color};font-family:'DM Sans',sans-serif">{status}</div>
<div style="flex:1">
<div style="font-size:18px;color:var(--text-primary);font-weight:600">{result_type} on {gpu_name} × {gpus_needed}</div>
<div style="font-family:'DM Mono',monospace;font-size:22px;color:var(--text-secondary);margin-top:8px">{required_gb:.1f} GB required / {available_gb:.0f} GB available</div>
</div>
</div>
</div>
<!-- VRAM Breakdown -->
<div style="padding:24px">
<div style="font-size:14px;font-weight:600;color:var(--text-primary);margin-bottom:16px">VRAM Breakdown</div>
{details}
</div>
<!-- Formula -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:16px;border:1px solid var(--border)">
<div style="font-size:12px;font-weight:600;color:var(--text-muted);margin-bottom:8px">How it's calculated</div>
<div style="font-family:'DM Mono',monospace;font-size:13px;color:var(--text-secondary);line-height:1.6">
{formulas}
</div>
</div>
</div>
</div>
'''
def calculate_clean(mode, model_slug, gpu_name, n_gpus, quant_key, context_len, manual_params_b, use_manual):
    """Clean calculation function with simplified output.

    Resolves the model size (manual override or HuggingFace lookup), runs
    the VRAM calculator matching *mode*, and returns a single HTML string:
    model summary bar + fit verdict with VRAM breakdown, plus an Apple
    Silicon speed estimate when the selected "GPU" is an Apple chip and the
    model fits.

    Returns:
        str: HTML fragment for the result panel (an inline error card on
        invalid input or failed lookups).
    """
    # Map the simplified dropdown labels to the full quant keys used by
    # QUANT_BPW and the calc_* helpers.
    quant_map = {"Q4_K_M": "Q4_K_M (best)", "Q5_K_M": "Q5_K_M", "INT8": "INT8 / Q8_0", "NF4": "Q4_0 / NF4", "FP16": "BF16 / FP16"}
    quant_key = quant_map.get(quant_key, quant_key)
    # Get model info: manual entry wins when the checkbox is ticked.
    if use_manual and manual_params_b > 0:
        params = int(manual_params_b * 1e9)
        params_b = manual_params_b
        model_name = "Manual Entry"
    else:
        model_slug = model_slug.strip().strip("/")
        if not model_slug or "/" not in model_slug:
            return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">Enter a valid HuggingFace slug like meta-llama/Llama-3.1-8B-Instruct</div>'
        info = fetch_model_info(model_slug, "")
        if info["error"]:
            return f'<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">{info["error"]}</div>'
        params = info["params"]
        if not params:
            return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">Could not determine model size. Use manual override.</div>'
        params_b = info["params_b"]
        model_name = model_slug
    # Get GPU info - handle both int and float n_gpus (gr.Number emits float)
    n_gpus = int(n_gpus) if n_gpus else 1
    db = build_gpu_database()
    gpu = db.get(gpu_name)
    if not gpu:
        return '<div style="background:var(--error-bg);border:2px solid var(--error-border);border-radius:12px;padding:20px;color:var(--error);font-size:15px">GPU not found in database</div>'
    available_gb = gpu["vram_gb"] * n_gpus
    # Calculate based on mode
    batch_size = 1  # Simplified for clean UI
    context_len = int(context_len) if context_len else 4096
    formulas = ""  # Default
    if mode == "Run (Inference)":
        calc = calc_inference(params, quant_key, context_len, batch_size)
        result_type = "Inference"
        breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--accent)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Model weights</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["weights"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">{quant_key} @ {QUANT_BPW[quant_key]["bpw"]} B/param</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--accent)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">KV Cache</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["kv"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">context={context_len:,} tokens</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--success)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Activations</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["acts"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">Flash Attention ~5%</div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--text-muted)">
<div style="font-size:13px;color:var(--text-muted);margin-bottom:4px">Overhead</div>
<div style="font-size:20px;font-weight:700;color:var(--text-primary)">{calc["overhead"]:.2f} GB</div>
<div style="font-size:12px;color:var(--text-secondary);margin-top:4px">Framework buffers ~5%</div>
</div>
</div>
"""
        formulas = f"""
Total = weights + KV_cache + activations + overhead<br><br>
weights = params × bpw / 1e9<br><br>
KV_cache = 2 × kv_heads × head_dim × 2B × layers × context × batch<br><br>
activations ≈ weights × 5% (Flash Attention)<br><br>
overhead ≈ weights × 5% (runtime buffers)
"""
    elif mode in ("Train (Fine-tune)", "Train (Full Fine-tune)"):
        # BUG FIX: the UI dropdown sends "Train (Full Fine-tune)", but this
        # branch previously matched only "Train (Fine-tune)", so full
        # fine-tune requests silently fell through to the LoRA/QLoRA branch
        # below. Accept both labels for backward compatibility.
        calc = calc_full_ft(params, context_len, batch_size)
        result_type = "Full Fine-tune"
        breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Weights BF16: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">2 bytes/param</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Gradients FP32: <strong>{calc["grads"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">4 bytes/param</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> Optimizer AdamW: <strong>{calc["optimizer"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">8 bytes/param (momentum + variance)</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Activations: <strong>{calc["acts"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">gradient checkpointing enabled</span>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Total = weights(2B) + grads(4B) + optimizer(8B) + activations + overhead<br><br>
≈ 14-16× model size in bytes<br><br>
activations ≈ weights × 1.5 × context_scale × batch<br><br>
context_scale = max(1, context/2048)
</div>
</div>
</div>
"""
    else:  # LoRA/QLoRA
        lora_rank = 16  # Default for clean UI
        # NF4 base implies QLoRA; any other quantization means plain LoRA.
        if quant_key == "Q4_0 / NF4":
            calc = calc_qlora(params, context_len, batch_size, lora_rank)
            result_type = "QLoRA"
            breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Base weights NF4: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">0.5 bytes/param (4-bit quantized)</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--success)">●</span> LoRA adapters: <strong>{calc["adapter"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">rank={lora_rank}, ~{calc["tpct"]:.1f}% trainable</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Adapter gradients: <strong>{calc["grads"]:.2f} GB</strong>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> Paged optimizer: <strong>{calc["optimizer"]:.2f} GB</strong>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Base frozen (NF4) + trainable LoRA adapters<br><br>
trainable_params ≈ base_params × (2×rank/4096) × 0.3<br><br>
adapters = trainable × 2B<br><br>
grads = trainable × 4B<br><br>
optimizer = trainable × 8B (paged AdamW)
</div>
</div>
</div>
"""
        else:
            calc = calc_lora(params, quant_key, context_len, batch_size, lora_rank)
            result_type = "LoRA"
            breakdown = f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:16px;background:var(--bg-secondary);padding:16px;border-radius:8px">
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">VRAM Components</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--accent)">●</span> Base weights: <strong>{calc["weights"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">{quant_key} frozen</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--success)">●</span> LoRA adapters: <strong>{calc["adapter"]:.2f} GB</strong><br>
<span style="color:var(--text-muted);font-size:12px;margin-left:16px">rank={lora_rank}, ~{calc["tpct"]:.1f}% trainable</span>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--warning)">●</span> Adapter gradients: <strong>{calc["grads"]:.2f} GB</strong>
</div>
<div style="padding:8px 12px;background:var(--bg-card);border-radius:6px;margin-bottom:6px">
<span style="color:var(--error)">●</span> AdamW optimizer: <strong>{calc["optimizer"]:.2f} GB</strong>
</div>
</div>
<div>
<div style="font-weight:600;margin-bottom:8px;color:var(--text-primary)">Formula</div>
<div style="padding:12px;background:var(--bg-card);border-radius:6px;font-size:13px;color:var(--text-secondary)">
Frozen base (any quant) + trainable LoRA<br><br>
trainable_params ≈ base_params × (2×rank/4096) × 0.3<br><br>
Only adapter weights need gradients + optimizer<br><br>
Much less VRAM than full fine-tune!
</div>
</div>
</div>
"""
    required_gb = calc["total"]
    fits, msg, _ = gpu_compat(required_gb, gpu_name, n_gpus)
    # Build result card (verdict + breakdown + formula)
    result_html = build_result_html(result_type, fits, required_gb, available_gb, breakdown, gpu_name, n_gpus, formulas)
    # Model info summary - compact horizontal
    model_summary = f'''
<div style="background:var(--bg-secondary);border-radius:10px;padding:16px 20px;margin-bottom:20px;display:flex;align-items:center;gap:24px">
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Model</div>
<div style="font-size:16px;font-weight:600;color:var(--text-primary)">{model_name}</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Parameters</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:16px;color:var(--text-primary)">{params_b:.2f}B</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Quantization</div>
<div style="font-size:14px;color:var(--text-secondary)">{quant_key}</div>
</div>
<div style="flex:1">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Context</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:16px;color:var(--text-primary)">{context_len:,}</div>
</div>
</div>
'''
    # Apple Silicon performance (if applicable: Apple chip AND model fits)
    apple_chip = parse_apple_silicon_chip(gpu_name)
    perf_html = ""
    if apple_chip and required_gb <= available_gb:
        # NOTE(review): quant_key here is the mapped label (e.g.
        # "Q4_K_M (best)"); confirm estimate_apple_silicon_tps accepts these
        # mapped keys rather than the raw quant names.
        perf = estimate_apple_silicon_tps(params_b, apple_chip, quant_key)
        if perf["tps"]:
            conf_label = {"high": "measured", "medium": "estimated", "low": "rough estimate"}
            conf_text = conf_label.get(perf["confidence"], "unknown")
            perf_html = f'''
<div style="background:var(--bg-card);border:1px solid var(--border);border-radius:8px;padding:12px;margin-top:16px">
<div style="font-size:12px;color:var(--text-muted);margin-bottom:8px">Apple Silicon speed ({conf_text})</div>
<div style="font-family:'IBM Plex Mono',monospace;font-size:18px;color:var(--text-primary)">
~{perf["tps"]} tok/s · TTFT: {perf["ttft"] or "?"}s
</div>
<div style="font-size:11px;color:var(--text-muted);margin-top:4px">
via <a href="https://llmcheck.net" style="color:var(--accent)" target="_blank">llmcheck.net</a>
</div>
</div>
'''
    return model_summary + result_html + perf_html
# Top-level Gradio UI: two-column layout with inputs on the left and a
# results skeleton on the right that calculate_clean fills in on CHECK.
with gr.Blocks(title="Can I Run This LLM?", css=CSS, theme=gr.themes.Default()) as demo:
    # Minimal header
    gr.HTML("""
<div style="padding:12px 0 16px 0;border-bottom:2px solid var(--border);margin-bottom:16px">
<div style="font-size:24px;font-weight:700;color:var(--text-primary);font-family:'DM Sans',sans-serif">
Can I Run This LLM?
</div>
<div style="font-size:13px;color:var(--text-muted);margin-top:4px">
Check if your GPU can run any HuggingFace model
</div>
</div>
""")
    # TWO COLUMN LAYOUT - inputs left, results skeleton right
    with gr.Row():
        # LEFT: Inputs
        with gr.Column(scale=1, min_width=320):
            # Mode selector - Dropdown is more visible
            # NOTE(review): calculate_clean must recognise these exact
            # labels; keep them in sync with its mode checks.
            mode = gr.Dropdown(
                label="Mode",
                choices=["Run (Inference)", "Train (Full Fine-tune)", "LoRA/QLoRA"],
                value="Run (Inference)",
                interactive=True
            )
            # Model input (HuggingFace "org/name" slug)
            model_slug = gr.Textbox(
                label="Model",
                placeholder="Qwen/Qwen3-VL-2B-Instruct",
                value="Qwen/Qwen3-VL-2B-Instruct",
                lines=1
            )
            with gr.Row():
                gpu_name = gr.Dropdown(
                    label="GPU",
                    choices=GPU_CHOICES,
                    value=DEFAULT_GPU
                )
                n_gpus = gr.Number(
                    label="# GPUs",
                    value=1,
                    minimum=1,
                    maximum=8
                )
            with gr.Row():
                quant_key = gr.Dropdown(
                    label="Quant",
                    choices=["Q4_K_M", "Q5_K_M", "INT8", "NF4", "FP16"],
                    value="Q4_K_M"
                )
                context_len = gr.Number(
                    label="Context",
                    value=4096,
                    minimum=512,
                    maximum=65536
                )
            # Mode-specific advanced options (visible, not collapsed)
            gr.HTML('<div style="font-size:12px;color:var(--text-muted);margin-top:12px;margin-bottom:6px;font-weight:500">Advanced options</div>')
            with gr.Row():
                # NOTE(review): hf_token is collected but never wired into
                # the click handler below, so gated-model lookups can't use
                # it — confirm intended.
                hf_token = gr.Textbox(
                    label="HF Token",
                    placeholder="hf_... (gated models)",
                    type="password",
                    visible=True
                )
                use_manual = gr.Checkbox(
                    label="Manual params",
                    value=False
                )
            # Starts visible even though toggle_advanced_options would hide
            # it for the default mode; visibility only updates after the
            # first mode change event.
            manual_params_b = gr.Number(
                label="Params (B)",
                value=7.0,
                visible=True
            )
            # LoRA-specific options (show conditionally)
            # NOTE(review): lora_rank and batch_size are NOT inputs to
            # calculate_clean (it hard-codes rank=16, batch=1), so these
            # sliders currently have no effect on the result — confirm.
            lora_rank = gr.Slider(
                label="LoRA Rank",
                minimum=4, maximum=256, value=16, step=4,
                visible=False,
                info="Higher rank = more trainable params"
            )
            batch_size = gr.Slider(
                label="Batch Size",
                minimum=1, maximum=64, value=1, step=1,
                visible=False,
                info="Higher batch = more VRAM for activations"
            )
            # Check button
            calc_btn = gr.Button("CHECK", variant="primary", size="lg")
        # RIGHT: Results skeleton (always visible, fills in)
        # NOTE(review): gr.Column's `scale` is documented as an int; 1.3 may
        # be coerced or rejected depending on the Gradio version — confirm.
        with gr.Column(scale=1.3, min_width=500):
            result_html = gr.HTML("""
<div style="border:2px solid var(--border);border-radius:12px;overflow:hidden;background:var(--bg-card)">
<!-- Model info bar skeleton -->
<div style="background:var(--bg-secondary);border-radius:10px;padding:16px 20px;margin:20px;display:flex;align-items:center;gap:24px">
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Model</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:80%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Parameters</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:60%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Quantization</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:70%"></div></div>
<div style="flex:1"><div style="font-size:12px;color:var(--text-muted);margin-bottom:4px">Context</div><div style="height:20px;background:var(--bg-card);border-radius:4px;width:50%"></div></div>
</div>
<!-- YES/NO header skeleton -->
<div style="background:var(--bg-secondary);padding:24px;border-bottom:2px solid var(--border)">
<div style="display:flex;align-items:center;gap:24px">
<div style="width:80px;height:48px;background:var(--bg-card);border-radius:8px;display:flex;align-items:center;justify-content:center;color:var(--text-muted)">?</div>
<div style="flex:1"><div style="height:18px;background:var(--bg-card);border-radius:4px;width:60%;margin-bottom:8px"></div><div style="height:22px;background:var(--bg-card);border-radius:4px;width:45%"></div></div>
</div>
</div>
<!-- VRAM breakdown skeleton (4-card grid) -->
<div style="padding:24px">
<div style="font-size:14px;font-weight:600;color:var(--text-muted);margin-bottom:16px">VRAM Breakdown</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
<div style="background:var(--bg-secondary);padding:16px;border-radius:8px;border-left:3px solid var(--border)">
<div style="height:13px;background:var(--bg-card);border-radius:4px;width:40%;margin-bottom:4px"></div>
<div style="height:20px;background:var(--bg-card);border-radius:4px;width:30%;margin-top:4px"></div>
</div>
</div>
</div>
<!-- Formula skeleton -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:16px;border:1px solid var(--border)">
<div style="font-size:12px;font-weight:600;color:var(--text-muted);margin-bottom:8px">How it's calculated</div>
<div style="height:60px;background:var(--bg-card);border-radius:4px"></div>
</div>
</div>
<!-- Apple Silicon skeleton -->
<div style="padding:0 24px 24px 24px">
<div style="background:var(--bg-secondary);border-radius:8px;padding:12px;border:1px solid var(--border)">
<div style="height:12px;background:var(--bg-card);border-radius:4px;width:30%;margin-bottom:8px"></div>
<div style="height:18px;background:var(--bg-card);border-radius:4px;width:25%"></div>
</div>
</div>
</div>
""")
    # Footer
    gr.HTML("""
<div style="margin-top:8px;padding-top:8px;border-top:1px solid var(--border);text-align:center;font-size:11px;color:var(--text-muted)">
GPU data: <a href="https://www.techpowerup.com/gpu-specs/" style="color:var(--accent)" target="_blank">TechPowerUp</a>/dbgpu ·
Apple Silicon: <a href="https://llmcheck.net" style="color:var(--accent)" target="_blank">llmcheck.net</a> (CC BY 4.0)
</div>
""")
    # Dynamic visibility for mode-specific options
    def toggle_advanced_options(mode):
        """Show/hide options based on mode.

        Returns updates for (lora_rank, batch_size, manual_params_b), in the
        same order as the outputs list of the change handler below.
        """
        if mode == "Run (Inference)":
            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
        elif mode == "Train (Full Fine-tune)":
            # Batch size matters for training; LoRA rank does not apply.
            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
        else:  # LoRA/QLoRA
            return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
    mode.change(
        toggle_advanced_options,
        inputs=[mode],
        outputs=[lora_rank, batch_size, manual_params_b]
    )
    # Events
    calc_btn.click(
        calculate_clean,
        inputs=[mode, model_slug, gpu_name, n_gpus, quant_key, context_len, manual_params_b, use_manual],
        outputs=result_html
    )
if __name__ == "__main__":
    # Local entry point; share=False disables the public gradio.live tunnel.
    demo.launch(share=False)