File size: 3,528 Bytes

0970deb

#!/usr/bin/env python3
"""
VBVR-Wan2.2 Image-to-Video Inference Example

Generate a video from a reference image using the VBVR-Wan2.2 model.
Usage:
    python inference.py --model_path /path/to/VBVR-Wan2.2
"""

import os
import torch
from PIL import Image
from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
from diffusers.utils import export_to_video

# ─────────────── Configuration (command-line options) ───────────────
# Only --model_path normally needs to be set; the image and output paths
# default to the same values the script has always used.
import argparse
parser = argparse.ArgumentParser(
    description="Generate a video from a reference image using the VBVR-Wan2.2 model."
)
parser.add_argument(
    "--model_path",
    type=str,
    default="VBVR-Wan2.2",
    help="Model location passed to from_pretrained (default: %(default)s).",
)
parser.add_argument(
    "--image_path",
    type=str,
    default=None,
    help="Reference image used as the first frame "
         "(default: <model_path>/assets/first_frame.png).",
)
parser.add_argument(
    "--output_path",
    type=str,
    default="output.mp4",
    help="File the generated video is written to (default: %(default)s).",
)
args = parser.parse_args()
model_path = args.model_path

# ──────────────────────────────────────────────────────────────────────

# Paths: unless overridden, the reference image is the sample stored under model_path
image_path = args.image_path or os.path.join(model_path, "assets", "first_frame.png")
output_path = args.output_path

# Fail fast: the image is only opened after the model has been loaded, so a
# wrong path would otherwise surface late, after the expensive loading step.
if not os.path.isfile(image_path):
    parser.error(f"Reference image not found: {image_path}")

# Prompt: the task instruction for the video model, assembled from
# sentence fragments that are joined with single spaces.
prompt = " ".join(
    (
        "The scene contains two types of shapes, each type has three shapes of",
        "different sizes arranged randomly. Keep all shapes unchanged in appearance",
        "(type, size, and color). Only rearrange their positions: first group the",
        "shapes by type, then within each group, sort the shapes from smallest to",
        "largest (left to right), and arrange all shapes in a single horizontal",
        "line from left to right.",
    )
)

# Negative prompt: a list of unwanted traits written in Chinese (overexposure,
# static frames, blurry details, subtitles, low quality, malformed hands and
# limbs, cluttered backgrounds, ...), joined with the full-width comma "，".
negative_prompt = "，".join(
    (
        "色调艳丽",
        "过曝",
        "静态",
        "细节模糊不清",
        "字幕",
        "风格",
        "作品",
        "画作",
        "画面",
        "静止",
        "整体发灰",
        "最差质量",
        "低质量",
        "JPEG压缩残留",
        "丑陋的",
        "残缺的",
        "多余的手指",
        "画得不好的手部",
        "画得不好的脸部",
        "畸形的",
        "毁容的",
        "形态畸形的肢体",
        "手指融合",
        "静止不动的画面",
        "杂乱的背景",
        "三条腿",
        "背景人很多",
        "倒着走",
    )
)

# Generation settings
# num_frames must satisfy (num_frames - 1) % 4 == 0 for diffusers' Wan
# pipelines. The previous value of 96 does not; the pipeline rounds such a
# value to 97 and logs a warning, so 97 frames were produced anyway. Stating
# 97 here keeps the progress message accurate and avoids the warning.
# NOTE(review): confirm the rounding rule against the installed diffusers version.
num_frames = 97
num_inference_steps = 50  # number of denoising steps passed to the pipeline
guidance_scale = 5.0      # guidance strength passed to the pipeline
seed = 1                  # seed for the random generator, for repeatable runs

# ──────────────────────── Load Pipeline ────────────────────────

print(f"Loading model from: {model_path}")

# The VAE is loaded on its own in float32 from the "vae" subfolder and then
# handed to the pipeline, whose remaining components load in bfloat16.
vae = AutoencoderKLWan.from_pretrained(model_path, subfolder="vae", torch_dtype=torch.float32)

pipeline_kwargs = {"vae": vae, "torch_dtype": torch.bfloat16}
pipe = WanImageToVideoPipeline.from_pretrained(model_path, **pipeline_kwargs)

# Model CPU offload (see the diffusers documentation for its memory/speed trade-off).
pipe.enable_model_cpu_offload()

boundary_ratio = pipe.config.boundary_ratio
print(f"Pipeline loaded. boundary_ratio = {boundary_ratio}")

# ──────────────────────── Load Image ────────────────────────

print(f"Loading image: {image_path}")
# convert() produces an independent in-memory image, so the file handle can be
# closed as soon as the with-block ends.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
width, height = image.size
print(f"Image size: {width}x{height}")

# The image size is passed to the pipeline as height/width, and diffusers' Wan
# pipelines reject sizes that are not divisible by 16. Round each side down to
# the nearest multiple and resize only when that changes something, so images
# that already conform are left untouched.
# NOTE(review): 16 matches the pipeline's input check as far as known; confirm
# against the installed diffusers version.
size_multiple = 16
snapped_width = max(size_multiple, width - width % size_multiple)
snapped_height = max(size_multiple, height - height % size_multiple)
if (snapped_width, snapped_height) != (width, height):
    print(
        f"Resizing image to {snapped_width}x{snapped_height} "
        f"(dimensions must be multiples of {size_multiple})"
    )
    image = image.resize((snapped_width, snapped_height))
    width, height = image.size

# ──────────────────────── Generate Video ────────────────────────

print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps")

# CPU generator with a fixed seed so repeated runs start from the same random state.
generator = torch.Generator(device="cpu")
generator.manual_seed(seed)

# Collect every pipeline argument in one place, then run the pipeline once.
call_kwargs = dict(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    generator=generator,
)
output = pipe(**call_kwargs)

# Only the first entry of output.frames is exported, at 16 frames per second.
video_frames = output.frames[0]
export_to_video(video_frames, output_path, fps=16)
print(f"Video saved to: {output_path}")