# SDXL-Model-Merger / src/downloader.py
# Author: Kyle Pearson — "cleaned up code" (commit 570384a)
"""Download utilities for SDXL Model Merger with Gradio progress integration."""
import re
import requests
from pathlib import Path
from tqdm import tqdm as TqdmBase
from .config import download_cancelled
def extract_model_id(url: str) -> str | None:
"""Extract CivitAI model ID from URL."""
match = re.search(r'/models/(\d+)', url)
return match.group(1) if match else None
def is_huggingface_url(url: str) -> bool:
    """Return True when *url* points at huggingface.co (case-insensitive)."""
    lowered = url.lower()
    return "huggingface.co" in lowered
def get_safe_filename_from_url(
url: str,
default_name: str = "model.safetensors",
suffix: str = "",
type_prefix: str | None = None
) -> str:
"""
Generate a safe filename with model ID from URL.
For CivitAI URLs like https://civitai.com/api/download/models/12345?type=...
Naming patterns:
- Checkpoint (type_prefix='model'): 12345_model.safetensors or 12345_model_anime_style.safetensors
- VAE (suffix='_vae'): 12345_vae.safetensors (no name extraction to avoid double suffix)
- LoRA (suffix='_lora'): 12345_lora.safetensors (no name extraction to avoid double suffix)
For HuggingFace URLs without model IDs, attempts to extract name from path or uses suffix-based naming.
Args:
url: The download URL
default_name: Fallback filename if extraction fails
suffix: Optional suffix to append before .safetensors (e.g., '_vae', '_lora')
type_prefix: Optional prefix after model_id (e.g., 'model' -> 12345_model.safetensors)
"""
model_id = extract_model_id(url)
# If no CivitAI model ID, try to generate a name from HuggingFace path
if not model_id and "huggingface.co" in url:
# Try to extract name from URL path (e.g., sdxl-vae-fp16-fix -> fp16_fix)
try:
parts = url.split("huggingface.co/")[1] if "huggingface.co/" in url else ""
if parts:
# Get the repo name (second part after org/)
path_parts = [p for p in parts.split("/") if p]
if len(path_parts) >= 2:
repo_name = path_parts[1]
# Clean up and create a simple identifier
clean_repo = re.sub(r'[^a-zA-Z0-9]', '_', repo_name)[:30].strip('_')
if clean_repo:
model_id = f"hf_{clean_repo}"
except Exception:
pass
if not model_id:
return default_name
# Special handling for VAE/LoRA with HuggingFace URLs to avoid double suffix
is_special_type = suffix in ("_vae", "_lora")
# Strip common suffixes from model_id when adding corresponding suffix
# (e.g., "sdxl_vae_fp16_fix" + "_vae" -> "sdxl_fp16_fix" + "_vae")
if is_special_type:
strip_suffix = suffix.lstrip('_') # "vae" or "lora"
model_id_lower = model_id.lower()
# Check if model_id contains the type (with underscore boundaries)
if f"_{strip_suffix}_" in model_id_lower or model_id_lower.endswith(f"_{strip_suffix}"):
# Remove the suffix from model_id
if model_id_lower.endswith(f"_{strip_suffix}"):
model_id = model_id[:-len(strip_suffix)-1]
else:
# Find and remove _suffix_ pattern
pattern = f"_{strip_suffix}_"
idx = model_id_lower.find(pattern)
if idx >= 0:
model_id = model_id[:idx] + model_id[idx+len(pattern):]
# Build the name portion: either clean name from URL or fallback
name_part = ""
# For VAE/LoRA types, skip Content-Disposition parsing to avoid double naming
# (e.g., sdxl_vae_vae instead of just vae)
if not is_special_type:
try:
response = requests.head(url, timeout=10, allow_redirects=True)
cd = response.headers.get('Content-Disposition', '')
match = re.search(r'filename="([^"]+)"', cd)
if match:
filename = match.group(1)
# Extract base name without extension
base_name = Path(filename).stem
# Clean up the name (remove special chars)
clean_name = re.sub(r'[^\w\s-]', '', base_name)[:50]
clean_name = re.sub(r'[-\s]+', '_', clean_name.strip('-_'))
if clean_name:
name_part = clean_name
except Exception:
pass
# Build filename with model_id, optional type_prefix, optional name_part, and suffix
parts = [model_id]
if type_prefix:
parts.append(type_prefix)
if name_part:
parts.append(name_part)
# Handle suffix - for VAE/LoRA we only add the suffix, not double naming
if suffix:
if is_special_type:
# For _vae and _lora: just use model_id + suffix directly
return f"{model_id}{suffix}.safetensors"
else:
# For other types (checkpoint), append suffix after name_part
parts.append(suffix.lstrip('_'))
return '_'.join(p for p in parts if p).replace('__', '_') + '.safetensors'
class TqdmGradio(TqdmBase):
    """tqdm variant that mirrors its progress into a Gradio gr.Progress()."""

    def __init__(self, *args, gradio_prog=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.gradio_prog = gradio_prog  # gr.Progress callable, or None
        self.last_pct = 0               # last percentage pushed to the UI

    def update(self, n=1):
        # Re-import each call so we observe the *current* cancel flag value.
        from .config import download_cancelled
        if download_cancelled:
            raise KeyboardInterrupt("Download cancelled by user")
        super().update(n)
        if not self.gradio_prog or not self.total:
            return
        pct = int(100 * self.n / self.total)
        # Throttle UI pushes to ~5% steps to avoid spamming Gradio
        if pct % 5 == 0 and pct != self.last_pct:
            self.last_pct = pct
            self.gradio_prog(pct / 100)
def get_cached_file_size(url: str, suffix: str = "", type_prefix: str | None = None) -> tuple[Path | None, int | None]:
    """
    Check if file exists in cache and matches expected size.

    Uses the same filename generation logic as download operations to find
    cached files by URL.

    Args:
        url: The download URL to check for cached file
        suffix: Optional suffix (e.g., '_vae', '_lora') for special file types
        type_prefix: Optional prefix after model_id (e.g., 'model')

    Returns:
        Tuple of (cached_file_path, file_size) if valid cache exists,
        or (None, None) if no valid cache found.
    """
    from .config import CACHE_DIR
    # Pick the same fallback name the download path would use for this type
    if suffix == "_vae":
        fallback = "vae.safetensors"
    elif suffix == "_lora":
        fallback = "lora.safetensors"
    else:
        fallback = "model.safetensors"
    candidate = CACHE_DIR / get_safe_filename_from_url(
        url,
        default_name=fallback,
        suffix=suffix,
        type_prefix=type_prefix,
    )
    if candidate.exists() and candidate.is_file():
        try:
            size = candidate.stat().st_size
            # A zero-byte file is a failed/partial download, not a valid cache
            if size > 0:
                return candidate, size
        except OSError:
            pass
    return None, None
def download_file_with_progress(url: str, output_path: Path, progress_bar=None) -> Path:
    """
    Download a file with Gradio-synced progress bar + cancel support.

    Checks for existing cached files before downloading. If a valid cache
    exists (file exists with matching expected size), skips re-download.
    Local ``file://`` URLs are copied into place instead of downloaded.

    Args:
        url: File URL to download (http/https/file)
        output_path: Destination path for downloaded file
        progress_bar: Optional gr.Progress() object for UI updates

    Returns:
        Path to the downloaded (or cached) file

    Raises:
        KeyboardInterrupt: If download is cancelled
        FileNotFoundError: If a file:// URL points at a missing file
        requests.RequestException: If download fails
        OSError: If the downloaded file fails validation
    """
    # Handle local file:// URLs by copying straight into the cache
    if url.startswith("file://"):
        local_path = Path(url[7:])  # Remove "file://" prefix
        if not local_path.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        import shutil
        output_path.parent.mkdir(parents=True, exist_ok=True)
        print(f" 📁 Copying from cache: {local_path.name}{output_path.name}")
        # Copy the file to cache location
        shutil.copy2(str(local_path), str(output_path))
        # Update progress bar for cached files
        if progress_bar:
            progress_bar(1.0)
        return output_path

    # BUGFIX: import the config *module* (not the boolean) so the loop below
    # always reads the live value of download_cancelled. The old
    # `from .config import download_cancelled` took a one-time snapshot that
    # never changed, making the in-loop cancel check a no-op.
    from . import config

    print(f" 📥 Downloading to cache: {output_path.name}")
    # Early cache check: if file exists and size matches URL's content-length, skip re-download
    expected_size = None
    try:
        # BUGFIX: follow redirects (requests' HEAD default is
        # allow_redirects=False, so redirecting download URLs reported the
        # redirect response's headers) and treat a missing Content-Length as
        # unknown (None) rather than 0, which caused spurious size-mismatch
        # warnings after download.
        head = requests.head(url, timeout=10, allow_redirects=True)
        content_length = head.headers.get('content-length')
        if content_length:
            expected_size = int(content_length)
    except Exception:
        pass  # Header fetch is best-effort; proceed without a size hint
    if output_path.exists() and expected_size is not None:
        try:
            cached_size = output_path.stat().st_size
            if cached_size == expected_size:
                print(f" ✅ Cache hit: {output_path.name} ({cached_size / (1024**2):.1f} MB)")
                # Cache hit - file exists with correct size
                if progress_bar:
                    progress_bar(1.0)
                return output_path  # Skip re-download!
        except OSError:
            pass  # File access error, proceed with download
    output_path.parent.mkdir(parents=True, exist_ok=True)
    session = requests.Session()
    response = session.get(url, stream=True, timeout=30)
    response.raise_for_status()
    total_size = expected_size or int(response.headers.get('content-length', 0))
    block_size = 8192
    # Use TqdmGradio to sync progress with Gradio
    tqdm_kwargs = {
        'unit': 'B',
        'unit_scale': True,
        'desc': f"Downloading {output_path.name}",
        'gradio_prog': progress_bar,
        'disable': False,
        'bar_format': '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]',
    }
    with open(output_path, "wb") as f:
        try:
            for data in TqdmGradio(
                response.iter_content(block_size),
                total=total_size // block_size if total_size else 0,
                **tqdm_kwargs,
            ):
                # Live cancel check (see BUGFIX note above)
                if config.download_cancelled:
                    raise KeyboardInterrupt("Download cancelled by user")
                f.write(data)
        except KeyboardInterrupt:
            # Clean partial file on cancel
            output_path.unlink(missing_ok=True)
            raise
    # Verify the downloaded file is complete
    try:
        actual_size = output_path.stat().st_size
        # For safetensors files, check the header parses: an 8-byte
        # little-endian length prefix followed by a JSON header.
        if output_path.suffix == ".safetensors":
            import struct
            import json
            with open(output_path, "rb") as f:
                header_size_bytes = f.read(8)
                if len(header_size_bytes) < 8:
                    raise OSError(f"Safetensors file too small: {output_path.name}")
                header_size = struct.unpack("<Q", header_size_bytes)[0]
                header = f.read(header_size)
                if len(header) < header_size:
                    raise OSError(f"Incomplete safetensors header in {output_path.name}")
                json.loads(header.decode("utf-8"))
        # Verify size matches expected (if known)
        if expected_size is not None and actual_size != expected_size:
            print(f" ⚠️ Size mismatch: expected {expected_size}, got {actual_size}")
    except Exception as e:
        output_path.unlink(missing_ok=True)
        raise OSError(f"Invalid downloaded file {output_path.name}: {str(e)}")
    return output_path
def clear_cache(cache_dir: Path = None, keep_extensions: list[str] = None):
"""
Remove old cache files.
Args:
cache_dir: Cache directory path (defaults to config.CACHE_DIR)
keep_extensions: File extensions to preserve (default: ['.safetensors'])
"""
if cache_dir is None:
from .config import CACHE_DIR
cache_dir = CACHE_DIR
if keep_extensions is None:
keep_extensions = ['.safetensors']
# Remove temp files
for file in cache_dir.glob("*.tmp"):
file.unlink()
# Optional: age-based cleanup (7 days)
# import time
# cutoff = time.time() - 86400 * 7
# for f in cache_dir.iterdir():
# if f.is_file() and f.stat().st_mtime < cutoff:
# f.unlink()