import sys
from pathlib import Path
import uuid
# Add packages to Python path
current_dir = Path(__file__).parent
sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
sys.path.insert(0, str(current_dir / "packages" / "ltx-core" / "src"))
import spaces
import flash_attn_interface
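# NOTE: `spaces` is imported before torch so ZeroGPU can hook CUDA initialization;
# importing `flash_attn_interface` is assumed to only register the FlashAttention-3
# kernels used by the attention layers. Neither module is referenced directly below.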
import time
import gradio as gr
import numpy as np
import random
import torch
from typing import Optional
from huggingface_hub import hf_hub_download, snapshot_download
from ltx_pipelines.distilled import DistilledPipeline
from ltx_core.model.video_vae import TilingConfig
from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
from ltx_pipelines.utils.constants import (
DEFAULT_SEED,
DEFAULT_1_STAGE_HEIGHT,
    DEFAULT_1_STAGE_WIDTH,
DEFAULT_NUM_FRAMES,
DEFAULT_FRAME_RATE,
DEFAULT_LORA_STRENGTH,
)
MAX_SEED = np.iinfo(np.int32).max
# Import from public LTX-2 package
# Install with: pip install git+https://github.com/Lightricks/LTX-2.git
from ltx_pipelines.utils import ModelLedger
from ltx_pipelines.utils.helpers import generate_enhanced_prompt
# HuggingFace Hub defaults
DEFAULT_REPO_ID = "Lightricks/LTX-2"
DEFAULT_GEMMA_REPO_ID = "unsloth/gemma-3-12b-it-qat-bnb-4bit"
DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev.safetensors"
def get_hub_or_local_checkpoint(repo_id: str, filename: str):
"""Download from HuggingFace Hub."""
print(f"Downloading {filename} from {repo_id}...")
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
print(f"Downloaded to {ckpt_path}")
return ckpt_path
def download_gemma_model(repo_id: str):
"""Download the full Gemma model directory."""
print(f"Downloading Gemma model from {repo_id}...")
local_dir = snapshot_download(repo_id=repo_id)
print(f"Gemma model downloaded to {local_dir}")
return local_dir
# Initialize model ledger and text encoder at startup (load once, keep in memory)
print("=" * 80)
print("Loading Gemma Text Encoder...")
print("=" * 80)
checkpoint_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_CHECKPOINT_FILENAME)
gemma_local_path = download_gemma_model(DEFAULT_GEMMA_REPO_ID)
device = "cuda"
print(f"Initializing text encoder with:")
print(f" checkpoint_path={checkpoint_path}")
print(f" gemma_root={gemma_local_path}")
print(f" device={device}")
model_ledger = ModelLedger(
dtype=torch.bfloat16,
device=device,
checkpoint_path=checkpoint_path,
    gemma_root_path=gemma_local_path,
local_files_only=False
)
# Load text encoder once and keep it in memory
text_encoder = model_ledger.text_encoder()
print("=" * 80)
print("Text encoder loaded and ready!")
print("=" * 80)
def encode_text_simple(text_encoder, prompt: str):
"""Simple text encoding without using pipeline_utils."""
v_context, a_context, _ = text_encoder(prompt)
return v_context, a_context
@spaces.GPU()
def encode_prompt(
prompt: str,
enhance_prompt: bool = True,
input_image = None,
seed: int = 42,
negative_prompt: str = ""
):
"""
Encode a text prompt using Gemma text encoder.
Args:
prompt: Text prompt to encode
enhance_prompt: Whether to use AI to enhance the prompt
input_image: Optional image for image-to-video enhancement
seed: Random seed for prompt enhancement
negative_prompt: Optional negative prompt for CFG (two-stage pipeline)
Returns:
tuple: (file_path, enhanced_prompt_text, status_message)
"""
start_time = time.time()
try:
# Enhance prompt if requested
final_prompt = prompt
if enhance_prompt:
if input_image is not None:
# Save image temporarily
temp_dir = Path("temp_images")
temp_dir.mkdir(exist_ok=True)
temp_image_path = temp_dir / f"temp_{int(time.time())}.jpg"
if hasattr(input_image, 'save'):
input_image.save(temp_image_path)
else:
temp_image_path = input_image
final_prompt = generate_enhanced_prompt(
text_encoder=text_encoder,
prompt=prompt,
image_path=str(temp_image_path),
seed=seed
)
else:
final_prompt = generate_enhanced_prompt(
text_encoder=text_encoder,
prompt=prompt,
image_path=None,
seed=seed
)
# Encode the positive prompt using the pre-loaded text encoder
video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
# Encode negative prompt if provided
video_context_negative = None
audio_context_negative = None
if negative_prompt:
video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
run_id = uuid.uuid4().hex
output_dir = Path("embeddings")
output_dir.mkdir(exist_ok=True)
output_path = output_dir / f"embedding_{run_id}.pt"
# Save embeddings (with negative contexts if provided)
embedding_data = {
'video_context': video_context.cpu(),
'audio_context': audio_context.cpu(),
'prompt': final_prompt,
'original_prompt': prompt if enhance_prompt else final_prompt,
}
# Add negative contexts if they were encoded
if video_context_negative is not None:
embedding_data['video_context_negative'] = video_context_negative.cpu()
embedding_data['audio_context_negative'] = audio_context_negative.cpu()
embedding_data['negative_prompt'] = negative_prompt
torch.save(embedding_data, output_path)
# Get memory stats
elapsed_time = time.time() - start_time
if torch.cuda.is_available():
allocated = torch.cuda.memory_allocated() / 1024**3
peak = torch.cuda.max_memory_allocated() / 1024**3
status = f"✓ Encoded in {elapsed_time:.2f}s | VRAM: {allocated:.2f}GB allocated, {peak:.2f}GB peak"
else:
status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
return str(output_path), final_prompt, status
except Exception as e:
import traceback
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
print(error_msg)
return None, prompt, error_msg
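# A downstream consumer reloads the saved embedding file roughly like this
# (generate_video below does exactly that for the positive contexts):
#   data = torch.load(embedding_path)
#   video_context, audio_context = data["video_context"], data["audio_context"]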
# Default prompt from docstring example
DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
# HuggingFace Hub defaults
DEFAULT_REPO_ID = "Lightricks/LTX-2"
DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev.safetensors"
DEFAULT_DISTILLED_LORA_FILENAME = "ltx-2-19b-distilled-lora-384.safetensors"
DEFAULT_SPATIAL_UPSAMPLER_FILENAME = "ltx-2-spatial-upscaler-x2-1.0.safetensors"
def get_hub_or_local_checkpoint(repo_id: Optional[str] = None, filename: Optional[str] = None):
"""Download from HuggingFace Hub or use local checkpoint."""
if repo_id is None and filename is None:
raise ValueError("Please supply at least one of `repo_id` or `filename`")
if repo_id is not None:
if filename is None:
raise ValueError("If repo_id is specified, filename must also be specified.")
print(f"Downloading {filename} from {repo_id}...")
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
print(f"Downloaded to {ckpt_path}")
else:
ckpt_path = filename
return ckpt_path
# Initialize pipeline at startup
print("=" * 80)
print("Loading LTX-2 Distilled pipeline...")
print("=" * 80)
checkpoint_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_CHECKPOINT_FILENAME)
distilled_lora_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_DISTILLED_LORA_FILENAME)
spatial_upsampler_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_SPATIAL_UPSAMPLER_FILENAME)
print(f"Initializing pipeline with:")
print(f" checkpoint_path={checkpoint_path}")
print(f" distilled_lora_path={distilled_lora_path}")
print(f" spatial_upsampler_path={spatial_upsampler_path}")
# Load distilled LoRA as a regular LoRA
loras = [
LoraPathStrengthAndSDOps(
path=distilled_lora_path,
strength=DEFAULT_LORA_STRENGTH,
sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
)
]
# Initialize pipeline WITHOUT a text encoder (gemma_root=None);
# text encoding is handled by the Gemma encoder loaded above (see encode_prompt)
pipeline = DistilledPipeline(
device=torch.device("cuda"),
checkpoint_path=checkpoint_path,
spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=None,  # No text encoder inside the pipeline; the Gemma encoder loaded above is used instead
loras=loras,
fp8transformer=False,
local_files_only=False,
)
pipeline._video_encoder = pipeline.model_ledger.video_encoder()
pipeline._transformer = pipeline.model_ledger.transformer()
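# Eagerly materialize the video encoder and transformer so the first generation
# request does not pay the model-loading cost (presumably why they are prefetched here).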
# pipeline.device = torch.device("cuda")
# pipeline.model_ledger.device = torch.device("cuda")
print("=" * 80)
print("Pipeline fully loaded and ready!")
print("=" * 80)
def get_duration(
input_image,
prompt,
duration,
enhance_prompt,
seed,
randomize_seed,
height,
width,
progress
):
if duration <= 5:
return 80
else:
return 120
class RadioAnimated(gr.HTML):
"""
Animated segmented radio (like iOS pill selector).
Outputs: selected option string, e.g. "768x512"
"""
def __init__(self, choices, value=None, **kwargs):
if not choices or len(choices) < 2:
raise ValueError("RadioAnimated requires at least 2 choices.")
if value is None:
value = choices[0]
        # Build labels/inputs HTML
        # NOTE: the markup below is reconstructed from the .ra-* CSS rules and the JS
        # handler (which expect #ra-wrap / #ra-inner / #ra-highlight plus .ra-input/.ra-label pairs).
        inputs_html = "\n".join(
            f'<input type="radio" class="ra-input" id="ra-opt-{i}" name="ra-choice" value="{c}">'
            f'<label class="ra-label" for="ra-opt-{i}">{c}</label>'
            for i, c in enumerate(choices)
        )
        html_template = f"""
        <div id="ra-wrap" class="ra-wrap"><div id="ra-inner" class="ra-inner">
          <div id="ra-highlight" class="ra-highlight"></div>
          {inputs_html}
        </div></div>
        """
js_on_load = r"""
(() => {
const wrap = element.querySelector('#ra-wrap');
const inner = element.querySelector('#ra-inner');
const highlight = element.querySelector('#ra-highlight');
const inputs = Array.from(element.querySelectorAll('.ra-input'));
const labels = Array.from(element.querySelectorAll('.ra-label'));
if (!inputs.length) return;
const choices = inputs.map(i => i.value);
function setHighlightByIndex(idx) {
const n = choices.length;
const pct = 100 / n;
highlight.style.width = `calc(${pct}% - 6px)`;
highlight.style.transform = `translateX(${idx * 100}%)`;
}
function setCheckedByValue(val, shouldTrigger=false) {
const idx = Math.max(0, choices.indexOf(val));
inputs.forEach((inp, i) => { inp.checked = (i === idx); });
setHighlightByIndex(idx);
// Update props + fire change if requested
props.value = choices[idx];
if (shouldTrigger) trigger('change', props.value);
}
// Init from props.value
setCheckedByValue(props.value ?? choices[0], false);
// Click handlers
inputs.forEach((inp) => {
inp.addEventListener('change', () => {
setCheckedByValue(inp.value, true);
});
});
// Watch for python-side value updates
let last = props.value;
setInterval(() => {
if (props.value !== last) {
last = props.value;
setCheckedByValue(props.value ?? choices[0], false);
}
}, 100);
})();
"""
super().__init__(
value=value,
html_template=html_template,
js_on_load=js_on_load,
**kwargs
)
@spaces.GPU(duration=get_duration)
def generate_video(
input_image,
prompt: str,
duration: float,
enhance_prompt: bool = True,
seed: int = 42,
randomize_seed: bool = True,
height: int = DEFAULT_1_STAGE_HEIGHT,
width: int = DEFAULT_1_STAGE_WIDTH,
progress=gr.Progress(track_tqdm=True),
):
"""
Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
Args:
input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
prompt: Text description of the scene, motion, and cinematic style to generate.
duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
enhance_prompt: Whether to enhance the prompt using the prompt enhancer before encoding.
seed: Base random seed for reproducibility (ignored if randomize_seed is True).
randomize_seed: If True, a random seed is generated for each run.
height: Output video height in pixels.
width: Output video width in pixels.
progress: Gradio progress tracker.
Returns:
A tuple of:
- output_path: Path to the generated MP4 video file.
- seed: The seed used for generation.
Notes:
- Uses a fixed frame rate of 24 FPS.
    - Prompt embeddings are generated by the pre-loaded Gemma encoder (via encode_prompt) rather than inside the pipeline, so the text encoder is never reloaded.
- GPU cache is cleared after generation to reduce VRAM pressure.
- If an input image is provided, it is temporarily saved to disk for processing.
"""
try:
# Randomize seed if checkbox is enabled
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
# Calculate num_frames from duration (using fixed 24 fps)
frame_rate = 24.0
num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
run_id = uuid.uuid4().hex
output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)
output_path = output_dir / f"video_{run_id}.mp4"
temp_image_path = output_dir / f"temp_input_{run_id}.jpg"
# Handle image input
images = []
if input_image is not None:
if hasattr(input_image, 'save'):
input_image.save(temp_image_path)
else:
# If it's a file path already
temp_image_path = Path(input_image)
# Format: (image_path, frame_idx, strength)
images = [(str(temp_image_path), 0, 1.0)]
result = encode_prompt(
prompt=prompt,
enhance_prompt=enhance_prompt,
input_image=input_image,
seed=current_seed,
negative_prompt="",
)
embedding_path = result[0] # Path to .pt file
print(f"Embeddings received from: {embedding_path}")
# Load embeddings
embeddings = torch.load(embedding_path)
video_context = embeddings['video_context']
audio_context = embeddings['audio_context']
print("✓ Embeddings loaded successfully")
# Run inference - progress automatically tracks tqdm from pipeline
pipeline(
prompt=prompt,
output_path=str(output_path),
seed=current_seed,
height=height,
width=width,
num_frames=num_frames,
frame_rate=frame_rate,
images=images,
tiling_config=TilingConfig.default(),
video_context=video_context,
audio_context=audio_context,
)
torch.cuda.empty_cache()
print("successful generation")
return str(output_path), current_seed
except Exception as e:
import traceback
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
print(error_msg)
        return None, seed
def apply_resolution(resolution: str):
w, h = resolution.split("x")
return int(w), int(h)
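# Example: apply_resolution("768x512") -> (768, 512). Wired to the RadioAnimated selector
# below so the hidden Width/Height Number components track the pill UI.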
css = """
#col-container {
margin: 0 auto;
max-width: 1600px;
}
#modal-container {
width: 100vw; /* Take full viewport width */
height: 100vh; /* Take full viewport height (optional) */
display: flex;
justify-content: center; /* Center content horizontally */
align-items: center; /* Center content vertically if desired */
}
#modal-content {
width: 100%;
max-width: 700px; /* Limit content width */
margin: 0 auto;
border-radius: 8px;
padding: 1.5rem;
}
#step-column {
padding: 10px;
border-radius: 8px;
box-shadow: var(--card-shadow);
margin: 10px;
}
#col-showcase {
margin: 0 auto;
max-width: 1100px;
}
.button-gradient {
background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
border: none;
padding: 14px 28px;
font-size: 16px;
font-weight: bold;
color: white;
border-radius: 10px;
cursor: pointer;
transition: 0.3s ease-in-out;
animation: 2s linear 0s infinite normal none running gradientAnimation;
box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
}
.toggle-container {
display: inline-flex;
background-color: #ffd6ff; /* light pink background */
border-radius: 9999px;
padding: 4px;
position: relative;
width: fit-content;
font-family: sans-serif;
}
.toggle-container input[type="radio"] {
display: none;
}
.toggle-container label {
position: relative;
z-index: 2;
flex: 1;
text-align: center;
font-weight: 700;
color: #4b2ab5; /* dark purple text for unselected */
padding: 6px 22px;
border-radius: 9999px;
cursor: pointer;
transition: color 0.25s ease;
}
/* Moving highlight */
.toggle-highlight {
position: absolute;
top: 4px;
left: 4px;
width: calc(50% - 4px);
height: calc(100% - 8px);
background-color: #4b2ab5; /* dark purple background */
border-radius: 9999px;
transition: transform 0.25s ease;
z-index: 1;
}
/* When "True" is checked */
#true:checked ~ label[for="true"] {
color: #ffd6ff; /* light pink text */
}
/* When "False" is checked */
#false:checked ~ label[for="false"] {
color: #ffd6ff; /* light pink text */
}
/* Move highlight to right side when False is checked */
#false:checked ~ .toggle-highlight {
transform: translateX(100%);
}
"""
css += """
/* ---- radioanimated ---- */
.ra-wrap{
width: fit-content;
}
.ra-inner{
position: relative;
display: inline-flex;
align-items: center;
gap: 0;
padding: 6px;
background: #0b0b0b;
border-radius: 9999px;
overflow: hidden;
user-select: none;
}
.ra-input{
display: none;
}
.ra-label{
position: relative;
z-index: 2;
padding: 10px 18px;
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial;
font-size: 14px;
font-weight: 600;
color: rgba(255,255,255,0.7);
cursor: pointer;
transition: color 180ms ease;
white-space: nowrap;
}
.ra-highlight{
position: absolute;
z-index: 1;
top: 6px;
left: 6px;
height: calc(100% - 12px);
border-radius: 9999px;
background: #8bff97; /* green knob */
transition: transform 200ms ease, width 200ms ease;
}
/* selected label becomes darker like your screenshot */
.ra-input:checked + .ra-label{
color: rgba(0,0,0,0.75);
}
"""
with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈", css=css) as demo:
gr.HTML(
"""
LTX-2 Distilled DiT-based audio-video foundation model
Using FA3 and Gemma 3 12B 4bit Quantisation for Faster Inference
HF Space by:
"""
)
with gr.Column(elem_id="col-container"):
with gr.Row():
with gr.Column(elem_id="step-column"):
input_image = gr.Image(
label="Input Image (Optional)",
type="pil",
height=512)
prompt = gr.Textbox(
label="Prompt",
value="Make this image come alive with cinematic motion, smooth animation",
lines=3,
max_lines=3,
placeholder="Describe the motion and animation you want..."
)
duration = gr.Slider(
label="Duration (seconds)",
minimum=1.0,
maximum=10.0,
value=3.0,
step=0.1
)
enhance_prompt = gr.Checkbox(
label="Enhance Prompt",
value=True,
visible=False
)
with gr.Accordion("Advanced Settings", open=False, visible=False):
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
value=DEFAULT_SEED,
step=1
)
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
with gr.Column(elem_id="step-column"):
output_video = gr.Video(label="Generated Video", autoplay=True, height=512)
radioanimated = RadioAnimated(
choices=["768x512", "512x512", "512x768"],
value=f"{DEFAULT_1_STAGE_WIDTH}x{DEFAULT_1_STAGE_HEIGHT}",
elem_id="radioanimated"
)
width = gr.Number(label="Width", value=DEFAULT_1_STAGE_WIDTH, precision=0, visible=False)
height = gr.Number(label="Height", value=DEFAULT_1_STAGE_HEIGHT, precision=0, visible=False)
generate_btn = gr.Button("🤩 Generate Video", variant="primary", elem_classes="button-gradient")
radioanimated.change(
fn=apply_resolution,
inputs=radioanimated,
outputs=[width, height],
)
generate_btn.click(
fn=generate_video,
inputs=[
input_image,
prompt,
duration,
enhance_prompt,
seed,
randomize_seed,
height,
width,
],
outputs=[output_video,seed]
)
# Add example
gr.Examples(
examples=[
[
"supergirl.png",
"A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
5.0,
],
[
"wednesday.png",
"A cinematic close-up of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
5.0,
],
[
"astronaut.jpg",
"An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
10.0,
]
],
fn=generate_video,
inputs=[input_image, prompt, duration],
outputs = [output_video, seed],
label="Example",
cache_examples=True,
)
if __name__ == "__main__":
    demo.launch(ssr_mode=False, mcp_server=True)