Transformers
Diffusers
Safetensors
File size: 3,528 Bytes
0970deb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""
VBVR-Wan2.2 Image-to-Video Inference Example

Generate a video from a reference image using the VBVR-Wan2.2 model.
Usage:
    python inference.py --model_path /path/to/VBVR-Wan2.2
"""

import os
import torch
from PIL import Image
from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
from diffusers.utils import export_to_video

# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Configuration (only change model_path) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="VBVR-Wan2.2")
args = parser.parse_args()
model_path = args.model_path

# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

# Paths derived from model_path
image_path = os.path.join(model_path, "assets", "first_frame.png")
output_path = "output.mp4"

# Prompt
prompt = (
    "The scene contains two types of shapes, each type has three shapes of "
    "different sizes arranged randomly. Keep all shapes unchanged in appearance "
    "(type, size, and color). Only rearrange their positions: first group the "
    "shapes by type, then within each group, sort the shapes from smallest to "
    "largest (left to right), and arrange all shapes in a single horizontal "
    "line from left to right."
)
negative_prompt = (
    "่‰ฒ่ฐƒ่‰ณไธฝ๏ผŒ่ฟ‡ๆ›๏ผŒ้™ๆ€๏ผŒ็ป†่Š‚ๆจก็ณŠไธๆธ…๏ผŒๅญ—ๅน•๏ผŒ้ฃŽๆ ผ๏ผŒไฝœๅ“๏ผŒ็”ปไฝœ๏ผŒ็”ป้ข๏ผŒ้™ๆญข๏ผŒ"
    "ๆ•ดไฝ“ๅ‘็ฐ๏ผŒๆœ€ๅทฎ่ดจ้‡๏ผŒไฝŽ่ดจ้‡๏ผŒJPEGๅŽ‹็ผฉๆฎ‹็•™๏ผŒไธ‘้™‹็š„๏ผŒๆฎ‹็ผบ็š„๏ผŒๅคšไฝ™็š„ๆ‰‹ๆŒ‡๏ผŒ"
    "็”ปๅพ—ไธๅฅฝ็š„ๆ‰‹้ƒจ๏ผŒ็”ปๅพ—ไธๅฅฝ็š„่„ธ้ƒจ๏ผŒ็•ธๅฝข็š„๏ผŒๆฏๅฎน็š„๏ผŒๅฝขๆ€็•ธๅฝข็š„่‚ขไฝ“๏ผŒๆ‰‹ๆŒ‡่žๅˆ๏ผŒ"
    "้™ๆญขไธๅŠจ็š„็”ป้ข๏ผŒๆ‚ไนฑ็š„่ƒŒๆ™ฏ๏ผŒไธ‰ๆก่…ฟ๏ผŒ่ƒŒๆ™ฏไบบๅพˆๅคš๏ผŒๅ€’็€่ตฐ"
)

# Generation settings
num_frames = 96
num_inference_steps = 50
guidance_scale = 5.0
seed = 1

# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Load Pipeline โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

print(f"Loading model from: {model_path}")

vae = AutoencoderKLWan.from_pretrained(
    model_path, subfolder="vae", torch_dtype=torch.float32
)

pipe = WanImageToVideoPipeline.from_pretrained(
    model_path,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

print(f"Pipeline loaded. boundary_ratio = {pipe.config.boundary_ratio}")

# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Load Image โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

print(f"Loading image: {image_path}")
image = Image.open(image_path).convert("RGB")
width, height = image.size
print(f"Image size: {width}x{height}")

# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Generate Video โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps")
generator = torch.Generator(device="cpu").manual_seed(seed)

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    generator=generator,
)

export_to_video(output.frames[0], output_path, fps=16)
print(f"Video saved to: {output_path}")