import os
import time

import numpy
import torch

from diffnext.pipelines import URSAPipeline
from diffnext.utils import export_to_video

# NOTE(review): presumably must be set before the first CUDA allocation to
# reduce allocator fragmentation; torch reads it lazily at CUDA init.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
|
|
|
|
|
# Checkpoint and output geometry.  ``height``/``width`` stay at module level
# on purpose: every ``pipe(**locals())`` call below forwards them as kwargs.
model_id = "BAAI/URSA-1.7B-FSQ320"
height, width = 320, 512

# bfloat16 weights halve memory; trust_remote_code is needed because the
# checkpoint ships its own pipeline code.
model_args = {"torch_dtype": torch.bfloat16, "trust_remote_code": True}

pipe = URSAPipeline.from_pretrained(model_id, **model_args).to(torch.device("cuda"))
|
|
# Base subject for every stage; each stage prepends its own "motion=..." tag.
text_prompt = "tom and jerry"

# Standard quality/motion negative prompt, shared by all generations below
# (forwarded to the pipeline through ``pipe(**locals())``).
negative_prompt = "worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly"
|
|
| import time |
|
|
| t1 = time.time() |
|
|
| |
| prompt = text_prompt |
| num_frames, num_inference_steps = 1, 25 |
| image = pipe(**locals()).frames[0] |
| image.save("tom/ursa.jpg") |
|
|
# --- Stage 2: image-to-video ----------------------------------------------
# ``image`` from the previous stage is still in scope, so ``pipe(**locals())``
# forwards it and the pipeline animates the generated still (hence the
# "1+48f" name: 1 conditioning frame + 48 generated frames).
t2 = time.time()

prompt = f"motion=9.0, {text_prompt}"
num_frames = 49
num_inference_steps = 50

video = pipe(**locals()).frames[0]
export_to_video(video, "tom/ursa_1+48f.mp4", fps=12)
|
|
# --- Stage 3: text-to-video -----------------------------------------------
t3 = time.time()

# Clear the previous stage's outputs; with ``pipe(**locals())`` a stale
# ``image``/``video`` would otherwise be forwarded as conditioning.
image = None
video = None

prompt = f"motion=9.0, {text_prompt}"
num_frames = 49
num_inference_steps = 50

video = pipe(**locals()).frames[0]
export_to_video(video, "tom/ursa_49f.mp4", fps=12)
|
|
# --- Stage 4: video-to-video extension ------------------------------------
# Repeatedly extend the stage-3 clip: each pass conditions on the last
# ``num_cond_frames`` frames and appends the newly generated remainder.
t4 = time.time()

prompt = f"motion=5.0, {text_prompt}"
num_frames, num_inference_steps = 49, 50
num_cond_frames, cond_noise_scale = 13, 0.1

for i in range(12):
    # Keep the full clip, hand the pipeline only the conditioning tail.
    # (Sequential form of the original simultaneous assignment.)
    start_video = video
    video = start_video[-num_cond_frames:]
    video = pipe(**locals()).frames[0]
    # Drop the regenerated conditioning frames, then stitch onto the clip.
    video = numpy.concatenate([start_video, video[num_cond_frames:]])
    export_to_video(video, "tom/ursa_{}f.mp4".format(video.shape[0]), fps=12)
| |
t5 = time.time()

# Per-stage wall-clock summary (stdout is identical to four explicit prints).
for stage, start, end in (
    ("Text-to-Image", t1, t2),
    ("Image-to-Video", t2, t3),
    ("Text-to-Video", t3, t4),
    ("Video-to-Video", t4, t5),
):
    print(f"{stage} time: {end - start:.2f} seconds")
| |
| |
| |
| |
| |
|
|
|
|
| |
| |
|
|
| |