Leema Krishna Murali committed on
Commit f3d0a26 · 1 Parent(s): a2f1ff3

Initial commit

app.py ADDED
@@ -0,0 +1,370 @@
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from visualizer import draw_box_on_frame, create_comparison_strip
+ from preview import preview_trajectory
+ from pipeline_adapter import (
+     extract_first_frame,
+     load_all_frames,
+     run_pipeline_motion_edit,
+     run_pipeline_insertion  # ← need to add this
+ )
+
+
+ def build_interface():
+
+     # Load Qwen-Image-Edit once at startup (not per-click — model is ~20GB)
+     _qwen_edit_pipe = None
+     try:
+         from frame_editor import load_qwen_image_edit
+         _qwen_edit_pipe = load_qwen_image_edit(use_lightning=True, device="cuda")
+         print("Qwen-Image-Edit ready.")
+     except Exception as e:
+         print(f"Qwen-Image-Edit not available: {e}")
+
+     with gr.Blocks(title="TRACE Prototype", theme=gr.themes.Soft()) as demo:
+
+         gr.Markdown("# TRACE Prototype — Object Motion Editing")
+
+         with gr.Tabs():
+
+             # ── Tab 1: Motion Edit (existing) ─────────────────────────
+             # with gr.Tab("Motion Path Edit"):
+             #     gr.Markdown(
+             #         "Move an **existing object** in the video "
+             #         "to a new trajectory."
+             #     )
+
+             #     with gr.Row():
+             #         with gr.Column():
+             #             video_input_edit = gr.Video(label="Input Video")
+             #             video_info_edit = gr.Markdown("")
+
+             #         with gr.Column():
+             #             first_frame_edit = gr.Image(
+             #                 label="First Frame + Trajectory Preview",
+             #                 interactive=False
+             #             )
+
+             #     gr.Markdown("**Start Box** — draw around the object")
+             #     with gr.Row():
+             #         sx1 = gr.Number(label="x1", value=100, precision=0)
+             #         sy1 = gr.Number(label="y1", value=100, precision=0)
+             #         sx2 = gr.Number(label="x2", value=200, precision=0)
+             #         sy2 = gr.Number(label="y2", value=200, precision=0)
+
+             #     gr.Markdown("**End Box** — where you want it to go")
+             #     with gr.Row():
+             #         ex1 = gr.Number(label="x1", value=500, precision=0)
+             #         ey1 = gr.Number(label="y1", value=200, precision=0)
+             #         ex2 = gr.Number(label="x2", value=600, precision=0)
+             #         ey2 = gr.Number(label="y2", value=300, precision=0)
+
+             #     prompt_edit = gr.Textbox(
+             #         label="Scene Description",
+             #         placeholder="a dog running in a park..."
+             #     )
+
+             #     with gr.Row():
+             #         stage1_method = gr.Radio(
+             #             choices=["linear", "cotracker"],
+             #             value="linear",
+             #             label="Stage 1 Method"
+             #         )
+             #         use_vace_edit = gr.Checkbox(
+             #             label="Use VACE",
+             #             value=False
+             #         )
+
+             #     run_edit_btn = gr.Button("Run Motion Edit", variant="primary")
+
+             #     with gr.Row():
+             #         output_video_edit = gr.Video(label="Output Video")
+             #         metrics_edit = gr.Markdown("")
+
+             #     comparison_edit = gr.Image(label="Frame Comparison", interactive=False)
+
+             # ── Tab 2: Object Insertion (NEW — uses Qwen) ─────────────
+             with gr.Tab("Object Insertion"):
+                 gr.Markdown(
+                     "Insert a **new object** into the video using "
+                     "Qwen to edit the first frame, then propagate."
+                 )
+
+                 with gr.Row():
+                     with gr.Column():
+                         video_input_ins = gr.Video(label="Input Video")
+                         video_info_ins = gr.Markdown("")
+
+                     with gr.Column():
+                         first_frame_ins = gr.Image(
+                             label="First Frame Preview",
+                             interactive=False
+                         )
+
+                 gr.Markdown("**Insertion Box** — where to place the new object")
+                 with gr.Row():
+                     ix1 = gr.Number(label="x1", value=40, precision=0)
+                     iy1 = gr.Number(label="y1", value=40, precision=0)
+                     ix2 = gr.Number(label="x2", value=300, precision=0)
+                     iy2 = gr.Number(label="y2", value=300, precision=0)
+
+                 gr.Markdown("**End Box** — where the object should arrive")
+                 with gr.Row():
+                     iex1 = gr.Number(label="x1", value=500, precision=0)
+                     iey1 = gr.Number(label="y1", value=200, precision=0)
+                     iex2 = gr.Number(label="x2", value=600, precision=0)
+                     iey2 = gr.Number(label="y2", value=300, precision=0)
+
+                 # ── The Qwen-specific inputs ───────────────────────────
+                 gr.Markdown("**Object Description** — what Qwen will insert")
+                 with gr.Row():
+                     with gr.Column():
+                         object_description = gr.Textbox(
+                             label="Object to Insert (Qwen prompt)",
+                             placeholder="a red helium balloon with a white string",
+                             info="Qwen uses this to paint the object into frame 1"
+                         )
+                         scene_prompt = gr.Textbox(
+                             label="Full Scene Prompt (for video synthesis)",
+                             placeholder="a peaceful park scene with a red balloon"
+                         )
+
+                     with gr.Column():
+                         gr.Markdown("Using **Qwen-Image-Edit-2511** for object insertion")
+
+                         # use_vace_ins = gr.Checkbox(
+                         #     label="Use VACE",
+                         #     value=False
+                         # )
+
+                 # ── Qwen output preview before running video ───────────
+                 gr.Markdown("**Step 1 Preview** — see Qwen's edit before running video")
+                 preview_qwen_btn = gr.Button(
+                     "Preview First Frame Edit",
+                     variant="secondary"
+                 )
+                 edited_frame_preview = gr.Image(
+                     label="Qwen-Edited First Frame",
+                     interactive=False
+                 )
+                 qwen_status = gr.Markdown("")
+
+                 # gr.Markdown("---")
+                 # run_ins_btn = gr.Button(
+                 #     "Run Full Insertion Pipeline",
+                 #     variant="primary"
+                 # )
+
+                 # with gr.Row():
+                 #     output_video_ins = gr.Video(label="Output Video")
+                 #     metrics_ins = gr.Markdown("")
+
+                 # comparison_ins = gr.Image(
+                 #     label="Frame Comparison",
+                 #     interactive=False
+                 # )
+
+         # ── Wire Up Tab 1 ─────────────────────────────────────────────
+         #_state = {"frames": None, "first_frame": None}
+
+
+         # def on_video_upload_edit(video_path):
+         #     if video_path is None:
+         #         return None, "Upload a video."
+         #     first_frame = extract_first_frame(video_path)
+         #     _state["first_frame"] = first_frame
+         #     return Image.fromarray(first_frame), "Video loaded."
+
+         # def on_boxes_changed_edit(sx1, sy1, sx2, sy2, ex1, ey1, ex2, ey2):
+         #     if _state["first_frame"] is None:
+         #         return None
+         #     from preview import preview_trajectory
+         #     preview = preview_trajectory(
+         #         _state["first_frame"],
+         #         [sx1, sy1, sx2, sy2],
+         #         [ex1, ey1, ex2, ey2]
+         #     )
+         #     return Image.fromarray(preview)
+
+         # video_input_edit.change(
+         #     fn=on_video_upload_edit,
+         #     inputs=[video_input_edit],
+         #     outputs=[first_frame_edit, video_info_edit]
+         # )
+
+         # for inp in [sx1, sy1, sx2, sy2, ex1, ey1, ex2, ey2]:
+         #     inp.change(
+         #         fn=on_boxes_changed_edit,
+         #         inputs=[sx1, sy1, sx2, sy2, ex1, ey1, ex2, ey2],
+         #         outputs=[first_frame_edit]
+         #     )
+
+         # def on_run_edit(video_path, sx1, sy1, sx2, sy2, ex1, ey1, ex2, ey2,
+         #                 prompt, stage1_method, use_vace, progress=gr.Progress()):
+         #     if video_path is None:
+         #         raise gr.Error("Please upload a video first.")
+         #     if sx2 <= sx1 or sy2 <= sy1:
+         #         raise gr.Error("Start box is invalid: x2 must be > x1, y2 must be > y1")
+         #     if ex2 <= ex1 or ey2 <= ey1:
+         #         raise gr.Error("End box is invalid: x2 must be > x1, y2 must be > y1")
+
+         #     def prog(frac, msg):
+         #         progress(frac, desc=msg)
+
+         #     output_path, result_frames, pred_boxes, metrics = \
+         #         run_pipeline_motion_edit(
+         #             video_path=video_path,
+         #             start_box=[sx1, sy1, sx2, sy2],
+         #             end_box=[ex1, ey1, ex2, ey2],
+         #             prompt=prompt,
+         #             stage1_method=stage1_method,
+         #             use_vace=use_vace,
+         #             progress_callback=prog
+         #         )
+
+         #     if _state["frames"] is None:
+         #         _state["frames"] = load_all_frames(video_path)
+
+         #     comparison = create_comparison_strip(
+         #         _state["frames"],
+         #         result_frames,
+         #         pred_boxes,
+         #         sample_ts=[0, 20, 40, 60, 80]
+         #     )
+
+         #     return output_path, Image.fromarray(comparison), metrics
+
+
+         # run_edit_btn.click(
+         #     fn=on_run_edit,
+         #     inputs=[
+         #         video_input_edit,
+         #         sx1, sy1, sx2, sy2,
+         #         ex1, ey1, ex2, ey2,
+         #         prompt_edit, stage1_method, use_vace_edit
+         #     ],
+         #     outputs=[output_video_edit, comparison_edit, metrics_edit]
+         # )
+
+         # ── Wire Up Tab 2 (Qwen insertion) ────────────────────────────
+         _ins_state = {"first_frame": None, "edited_frame": None}
+
+
+         def on_video_upload_ins(video_path):
+             if video_path is None:
+                 return None, "Upload a video."
+             first_frame = extract_first_frame(video_path)
+             _ins_state["first_frame"] = first_frame
+             return Image.fromarray(first_frame), "Video loaded."
+
+         def on_preview_qwen(
+             video_path,
+             ix1, iy1, ix2, iy2,
+             object_description,
+             progress=gr.Progress()
+         ):
+             if _ins_state["first_frame"] is None:
+                 raise gr.Error("Upload a video first.")
+             if not object_description.strip():
+                 raise gr.Error("Enter an object description.")
+             if _qwen_edit_pipe is None:
+                 raise gr.Error("Qwen-Image-Edit failed to load at startup. Check logs.")
+
+             insertion_box = [ix1, iy1, ix2, iy2]
+
+             progress(0.3, "Editing first frame with Qwen-Image-Edit...")
+             from frame_editor import insert_object_qwen_edit
+             edited = insert_object_qwen_edit(
+                 first_frame=_ins_state["first_frame"],
+                 box=insertion_box,
+                 object_description=object_description,
+                 pipe=_qwen_edit_pipe,
+             )
+
+             _ins_state["edited_frame"] = edited
+
+             preview = draw_box_on_frame(
+                 edited,
+                 insertion_box,
+                 color=(255, 220, 0),
+                 label="inserted here"
+             )
+
+             progress(1.0, "Done!")
+             return (
+                 Image.fromarray(preview),
+                 "First frame edited."
+             )
+
+
+         def on_run_insertion(
+             video_path,
+             ix1, iy1, ix2, iy2,
+             iex1, iey1, iex2, iey2,
+             scene_prompt,
+             use_vace_ins,
+             progress=gr.Progress()
+         ):
+             if _ins_state["edited_frame"] is None:
+                 raise gr.Error(
+                     "Run 'Preview First Frame Edit' first — "
+                     "the edited frame is needed as appearance reference."
+                 )
+
+             output_path, result_frames, pred_boxes, metrics = \
+                 run_pipeline_insertion(
+                     video_path=video_path,
+                     edited_first_frame=_ins_state["edited_frame"],
+                     start_box=[ix1, iy1, ix2, iy2],
+                     end_box=[iex1, iey1, iex2, iey2],
+                     prompt=scene_prompt,
+                     use_vace=use_vace_ins,
+                     progress_callback=lambda f, m: progress(f, desc=m)
+                 )
+
+             frames = load_all_frames(video_path)
+             comparison = create_comparison_strip(
+                 frames, result_frames, pred_boxes
+             )
+
+             return (
+                 output_path,
+                 Image.fromarray(comparison),
+                 metrics
+             )
+
+         video_input_ins.change(
+             fn=on_video_upload_ins,
+             inputs=[video_input_ins],
+             outputs=[first_frame_ins, video_info_ins]
+         )
+
+         preview_qwen_btn.click(
+             fn=on_preview_qwen,
+             inputs=[
+                 video_input_ins,
+                 ix1, iy1, ix2, iy2,
+                 object_description,
+             ],
+             outputs=[edited_frame_preview, qwen_status]
+         )
+
+         # run_ins_btn.click(
+         #     fn=on_run_insertion,
+         #     inputs=[
+         #         video_input_ins,
+         #         ix1, iy1, ix2, iy2,
+         #         iex1, iey1, iex2, iey2,
+         #         scene_prompt,
+         #         use_vace_ins
+         #     ],
+         #     outputs=[output_video_ins, comparison_ins, metrics_ins]
+         # )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = build_interface()
+     demo.launch(share=True)
frame_editor.py ADDED
@@ -0,0 +1,117 @@
+ # frame_editor.py
+
+ import numpy as np
+ from PIL import Image
+ import torch
+ import cv2
+
+ def load_qwen_image_edit(use_lightning=True, device="cuda"):
+     from diffusers import QwenImageEditPlusPipeline, FlowMatchEulerDiscreteScheduler
+
+     scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+         "Qwen/Qwen-Image-Edit-2511", subfolder="scheduler"
+     )
+     pipe = QwenImageEditPlusPipeline.from_pretrained(
+         "Qwen/Qwen-Image-Edit-2511",
+         scheduler=scheduler,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+
+     if use_lightning:
+         pipe.load_lora_weights(
+             "lightx2v/Qwen-Image-Edit-2511-Lightning",
+             weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors"
+         )
+         pipe.fuse_lora()
+
+     return pipe
+
+
+ def insert_object_qwen_edit(
+     first_frame,            # np.ndarray [H, W, 3] uint8 RGB
+     box,                    # [x1, y1, x2, y2]
+     object_description,     # e.g. "a red sports car"
+     pipe,
+     context_pad=60,         # pixels of context around box — helps Qwen understand scene
+     num_inference_steps=4,
+     guidance_scale=1.0,
+     seed=42,
+ ):
+     """
+     Inserts object into ONLY the bounding box region.
+     Background outside the box is pixel-identical to original.
+
+     Strategy:
+     1. Crop (box + padding) from original → gives Qwen scene context
+     2. Edit the crop with Qwen-Image-Edit
+     3. Extract only the box pixels from the edited crop
+     4. Paste back onto original frame
+     """
+     H, W = first_frame.shape[:2]
+     x1, y1, x2, y2 = [int(v) for v in box]
+
+     # --- Step 1: Crop with context padding ---
+     cx1 = max(0, x1 - context_pad)
+     cy1 = max(0, y1 - context_pad)
+     cx2 = min(W, x2 + context_pad)
+     cy2 = min(H, y2 + context_pad)
+
+     crop = first_frame[cy1:cy2, cx1:cx2].copy()  # [cH, cW, 3]
+     cH, cW = crop.shape[:2]
+
+     # Box coordinates relative to crop
+     lx1 = x1 - cx1
+     ly1 = y1 - cy1
+     lx2 = x2 - cx1
+     ly2 = y2 - cy1
+
+     # --- Step 2: Build focused edit instruction ---
+     prompt = (
+         f"Insert {object_description} in the region ({lx1},{ly1}) to ({lx2},{ly2}). "
+         f"Keep everything outside that region exactly the same. "
+         f"Match the scene lighting, shadows, and perspective."
+     )
+
+     generator = torch.Generator().manual_seed(seed)
+
+     edited = pipe(
+         image=[Image.fromarray(crop)],
+         prompt=prompt,
+         num_inference_steps=num_inference_steps,
+         true_cfg_scale=guidance_scale,
+         negative_prompt=" ",
+         generator=generator,
+     ).images[0]
+
+     edited_np = np.array(edited)  # [cH', cW', 3]
+
+     # Resize back if pipeline changed resolution
+     if edited_np.shape[:2] != (cH, cW):
+         edited_np = cv2.resize(edited_np, (cW, cH), interpolation=cv2.INTER_LINEAR)
+
+     # --- Step 3: Hard composite — only paste the box region back ---
+     result = first_frame.copy()
+     result[y1:y2, x1:x2] = edited_np[ly1:ly2, lx1:lx2]
+
+     return result  # [H, W, 3] uint8 RGB — background unchanged
+
+
+ def segment_existing_object(
+     first_frame: np.ndarray,
+     box: list,
+     sam2_predictor
+ ) -> np.ndarray:
+     """
+     Use SAM2 to get a precise mask of an existing object.
+     Returns: [H, W] binary float32 mask
+     """
+     sam2_predictor.set_image(first_frame)
+
+     input_box = np.array([box])
+     masks, scores, _ = sam2_predictor.predict(
+         box=input_box,
+         multimask_output=False
+     )
+
+     return masks[np.argmax(scores)].astype(np.float32)
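
The two entry points above can also be exercised on a single frame without launching the Gradio app. A minimal sketch, assuming the repo modules are on the import path, a CUDA GPU is available, and input.mp4 is a placeholder path:

# sketch: edit one frame on its own, outside app.py (input.mp4 is hypothetical)
import imageio
from utils.video_utils import load_video
from frame_editor import load_qwen_image_edit, insert_object_qwen_edit

frame = load_video("input.mp4", max_frames=1)[0]   # [H, W, 3] uint8 RGB
pipe = load_qwen_image_edit(use_lightning=True, device="cuda")
edited = insert_object_qwen_edit(
    first_frame=frame,
    box=[40, 40, 300, 300],                        # where to paint the new object
    object_description="a red helium balloon with a white string",
    pipe=pipe,
)
imageio.imwrite("edited_first_frame.png", edited)  # background outside the box is untouched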
pipeline.py ADDED
@@ -0,0 +1,214 @@
+ # pipeline.py
+
+ import numpy as np
+ import torch
+ from utils.video_utils import load_video, save_video
+ from utils.box_utils import boxes_to_mask_sequence
+ from stage1_approx import stage1_linear, stage1_cotracker
+ from stage2_vace import VACEWrapper, SimpleCompositeStage2
+
+
+ class TRACEPrototype:
+
+     def __init__(self, use_vace: bool = False, use_cotracker: bool = False):
+
+         # ── Stage 2: Video Synthesis ──────────────────────────────────
+         if use_vace:
+             self.stage2 = VACEWrapper()
+         else:
+             self.stage2 = SimpleCompositeStage2()
+
+         # ── CoTracker for Stage 1 ─────────────────────────────────────
+         self.cotracker = None
+         if use_cotracker:
+             try:
+                 self.cotracker = torch.hub.load(
+                     "facebookresearch/co-tracker",
+                     "cotracker3_online"
+                 ).cuda()
+                 print("CoTracker loaded.")
+             except Exception as e:
+                 print(f"CoTracker failed to load: {e}")
+                 print("Falling back to linear interpolation.")
+
+         # ── SAM2 for object segmentation ─────────────────────────────
+         self.sam2 = None
+         try:
+             from sam2.build_sam import build_sam2
+             from sam2.sam2_image_predictor import SAM2ImagePredictor
+             self.sam2 = SAM2ImagePredictor(
+                 build_sam2("sam2_hiera_large.pt")
+             )
+             print("SAM2 loaded.")
+         except Exception as e:
+             print(f"SAM2 not available: {e}")
+             print("Will use box masks directly instead of segmentation.")
+
+         # ── Qwen-Image-Edit for object insertion ──────────────────────
+         self.qwen_edit_pipe = None
+         try:
+             from frame_editor import load_qwen_image_edit
+             self.qwen_edit_pipe = load_qwen_image_edit(
+                 use_lightning=True, device="cuda"
+             )
+             print("Qwen-Image-Edit loaded.")
+         except Exception as e:
+             print(f"Qwen-Image-Edit not available: {e}")
+
+
+     def run_motion_edit(
+         self,
+         video_path: str,
+         keyboxes: dict,            # {frame_idx: [x1, y1, x2, y2]}
+         text_prompt: str,
+         output_path: str = None,
+         frames: np.ndarray = None  # pass directly to avoid reloading
+     ) -> np.ndarray:
+         """
+         Edit the trajectory of an existing object in the video.
+
+         keyboxes must include:
+         - frame 0: current object location (start)
+         - at least one other frame: target location (end)
+         """
+
+         # Load video if frames not passed directly
+         if frames is None:
+             frames = load_video(video_path)
+         T, H, W, _ = frames.shape
+
+         # ── Stage 1: Compute target trajectory ───────────────────────
+         if self.cotracker is not None:
+             pred_boxes = stage1_cotracker(
+                 frames, keyboxes, self.cotracker
+             )
+         else:
+             pred_boxes = stage1_linear(keyboxes, T)
+
+         # ── Build masks ───────────────────────────────────────────────
+         # Synthesis mask: where to PLACE the object (new trajectory)
+         synthesis_masks = boxes_to_mask_sequence(pred_boxes, H, W)
+
+         # Inpainting mask: where to ERASE the object (original position)
+         # Use SAM2 for precise mask if available, else use box directly
+         orig_box = keyboxes[0]
+         if self.sam2 is not None:
+             from frame_editor import segment_existing_object
+             seg_mask = segment_existing_object(
+                 frames[0], orig_box, self.sam2
+             )
+             # Propagate original mask roughly using linear boxes
+             orig_keyboxes = {0: orig_box}
+             orig_boxes = stage1_linear(orig_keyboxes, T)
+             inpaint_masks = boxes_to_mask_sequence(orig_boxes, H, W)
+             # Refine frame 0 with SAM2 mask
+             inpaint_masks[0] = seg_mask
+         else:
+             # Fallback: use box directly as inpaint mask
+             orig_keyboxes = {0: orig_box}
+             orig_boxes = stage1_linear(orig_keyboxes, T)
+             inpaint_masks = boxes_to_mask_sequence(orig_boxes, H, W)
+
+         # ── Stage 2: Synthesize video ─────────────────────────────────
+         if isinstance(self.stage2, VACEWrapper):
+             result = self.stage2.synthesize(
+                 original_frames=frames,
+                 synthesis_masks=synthesis_masks,
+                 inpaint_masks=inpaint_masks,
+                 first_frame_ref=frames[0],
+                 text_prompt=text_prompt
+             )
+         else:
+             # SimpleCompositeStage2: needs object crop
+             x1, y1, x2, y2 = [int(v) for v in orig_box]
+             obj_crop = frames[0, y1:y2, x1:x2]
+
+             if self.sam2 is not None:
+                 obj_mask = seg_mask[y1:y2, x1:x2]
+             else:
+                 obj_mask = np.ones(
+                     (y2 - y1, x2 - x1), dtype=np.float32
+                 )
+
+             result = self.stage2.synthesize(
+                 original_frames=frames,
+                 synthesis_masks=synthesis_masks,
+                 inpaint_masks=inpaint_masks,
+                 object_crop=obj_crop,
+                 object_mask=obj_mask
+             )
+
+         # ── Save if path provided ─────────────────────────────────────
+         if output_path is not None:
+             save_video(result, output_path)
+             print(f"Saved to {output_path}")
+
+         return result
+
+     def run_object_insertion(
+         self,
+         video_path: str,
+         object_description: str,
+         keyboxes: dict,            # {frame_idx: [x1, y1, x2, y2]}
+         text_prompt: str,
+         output_path: str = None,
+         frames: np.ndarray = None,
+     ) -> np.ndarray:
+         """
+         Insert a new object into the video and animate it along a trajectory.
+         Qwen-Image-Edit paints the object into frame 0 only.
+         Stage 2 propagates it through all frames.
+         """
+         if frames is None:
+             frames = load_video(video_path)
+         T, H, W, _ = frames.shape
+
+         # Stage 1: trajectory
+         pred_boxes = stage1_linear(keyboxes, T)
+
+         # Edit first frame with Qwen-Image-Edit
+         if self.qwen_edit_pipe is not None:
+             from frame_editor import insert_object_qwen_edit
+             edited_first_frame = insert_object_qwen_edit(
+                 first_frame=frames[0],
+                 box=pred_boxes[0],
+                 object_description=object_description,
+                 pipe=self.qwen_edit_pipe,
+             )
+         else:
+             print("Qwen-Image-Edit not available, using original first frame.")
+             edited_first_frame = frames[0]
+
+         # Synthesis masks: where to place object along trajectory
+         synthesis_masks = boxes_to_mask_sequence(pred_boxes, H, W)
+         # No inpaint masks needed — nothing to erase for insertion
+         inpaint_masks = np.zeros((T, H, W), dtype=np.uint8)
+
+         # Stage 2
+         if isinstance(self.stage2, VACEWrapper):
+             result = self.stage2.synthesize(
+                 original_frames=frames,
+                 synthesis_masks=synthesis_masks,
+                 inpaint_masks=inpaint_masks,
+                 first_frame_ref=edited_first_frame,
+                 text_prompt=text_prompt,
+             )
+         else:
+             x1, y1, x2, y2 = [int(v) for v in pred_boxes[0]]
+             obj_crop = edited_first_frame[y1:y2, x1:x2]
+             obj_mask = np.ones((y2 - y1, x2 - x1), dtype=np.float32)
+
+             result = self.stage2.synthesize(
+                 original_frames=frames,
+                 synthesis_masks=synthesis_masks,
+                 inpaint_masks=inpaint_masks,
+                 object_crop=obj_crop,
+                 object_mask=obj_mask,
+             )
+
+         if output_path is not None:
+             save_video(result, output_path)
+             print(f"Saved to {output_path}")
+
+         return result
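
TRACEPrototype can also be driven directly from a script. A minimal sketch of a motion edit with the compositing fallback (the video path and pixel coordinates are hypothetical, and an 81-frame clip is assumed):

# sketch: move an existing object with the SimpleCompositeStage2 fallback
from pipeline import TRACEPrototype

proto = TRACEPrototype(use_vace=False, use_cotracker=False)
result = proto.run_motion_edit(
    video_path="input.mp4",                     # placeholder path
    keyboxes={0: [100, 100, 200, 200],          # frame 0: current location
              80: [500, 200, 600, 300]},        # frame 80: target location
    text_prompt="a dog running in a park",
    output_path="motion_edit.mp4",
)
print(result.shape)                             # [T, H, W, 3] uint8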
pipeline_adapter.py ADDED
@@ -0,0 +1,216 @@
+ # pipeline_adapter.py
+
+ import numpy as np
+ import tempfile
+ from utils.video_utils import load_video, save_video
+ from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+
+
+ def compute_psnr(original, result):
+     """Mean PSNR across all frames."""
+     scores = []
+     for f1, f2 in zip(original, result):
+         scores.append(peak_signal_noise_ratio(f1, f2, data_range=255))
+     return float(np.mean(scores))
+
+ def compute_ssim_video(original, result):
+     """Mean SSIM across all frames."""
+     scores = []
+     for f1, f2 in zip(original, result):
+         scores.append(structural_similarity(f1, f2, channel_axis=-1, data_range=255))
+     return float(np.mean(scores))
+
+ def compute_lpips_video(original, result, device="cuda"):
+     """Mean LPIPS across all frames (lower = better)."""
+     import torch
+     import lpips
+
+     loss_fn = lpips.LPIPS(net="alex").to(device)
+     scores = []
+
+     for f1, f2 in zip(original, result):
+         # Convert [H, W, 3] uint8 → [1, 3, H, W] float in [-1, 1]
+         t1 = torch.from_numpy(f1).permute(2, 0, 1).unsqueeze(0).float() / 127.5 - 1.0
+         t2 = torch.from_numpy(f2).permute(2, 0, 1).unsqueeze(0).float() / 127.5 - 1.0
+         t1, t2 = t1.to(device), t2.to(device)
+
+         with torch.no_grad():
+             score = loss_fn(t1, t2)
+         scores.append(score.item())
+
+     return float(np.mean(scores))
+
+
+ def extract_first_frame(video_path: str) -> np.ndarray:
+     frames = load_video(video_path, max_frames=1)
+     return frames[0]
+
+
+ def load_all_frames(video_path: str) -> np.ndarray:
+     return load_video(video_path, max_frames=81)
+
+
+ def run_pipeline_motion_edit(
+     video_path: str,
+     start_box: list,
+     end_box: list,
+     prompt: str,
+     stage1_method: str = "linear",
+     use_vace: bool = False,
+     progress_callback=None
+ ) -> tuple:
+     from pipeline import TRACEPrototype
+     from stage1_approx import stage1_linear, stage1_cotracker
+     # from evaluation.metrics import (
+     #     compute_psnr, compute_ssim_video, compute_lpips_video
+     # )
+
+     if progress_callback:
+         progress_callback(0.1, "Loading video...")
+
+     frames = load_all_frames(video_path)
+     T, H, W, _ = frames.shape
+     keyboxes = {0: start_box, T - 1: end_box}
+
+     proto = TRACEPrototype(
+         use_vace=use_vace,
+         use_cotracker=(stage1_method == "cotracker")
+     )
+
+     if progress_callback:
+         progress_callback(0.3, "Computing trajectory...")
+
+     if stage1_method == "cotracker" and proto.cotracker is not None:
+         pred_boxes = stage1_cotracker(frames, keyboxes, proto.cotracker)
+     else:
+         pred_boxes = stage1_linear(keyboxes, T)
+
+     if progress_callback:
+         progress_callback(0.5, "Running video synthesis...")
+
+     result = proto.run_motion_edit(
+         video_path=video_path,
+         keyboxes=keyboxes,
+         text_prompt=prompt,
+         output_path=None
+     )
+
+     tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+     save_video(result, tmp.name)
+
+     if progress_callback:
+         progress_callback(0.9, "Computing metrics...")
+
+     psnr = compute_psnr(result, frames)
+     ssim = compute_ssim_video(result, frames)
+     lpips = compute_lpips_video(result, frames)
+
+     metrics_text = (
+         f"**Video Quality**\n"
+         f"- PSNR: {psnr:.2f} dB (TRACE paper: 20.48)\n"
+         f"- SSIM: {ssim:.3f} (TRACE paper: 0.71)\n"
+         f"- LPIPS: {lpips:.3f} (TRACE paper: 0.19)\n\n"
+         f"**Settings**\n"
+         f"- Stage 1: `{stage1_method}`\n"
+         f"- Frames: {T} | Resolution: {W}x{H}\n"
+     )
+
+     if progress_callback:
+         progress_callback(1.0, "Done!")
+
+     return tmp.name, result, pred_boxes, metrics_text
+
+
+ def run_pipeline_insertion(
+     video_path: str,
+     edited_first_frame: np.ndarray,  # Qwen/FLUX output — already edited
+     start_box: list,
+     end_box: list,
+     prompt: str,
+     use_vace: bool = False,
+     progress_callback=None
+ ) -> tuple:
+     """
+     Run insertion pipeline using a pre-edited first frame.
+     The first frame has already been modified by Qwen or FLUX-Fill
+     before this function is called — this function handles
+     the trajectory + video synthesis steps only.
+     """
+     from pipeline import TRACEPrototype
+     from stage1_approx import stage1_linear
+     from stage2_vace import VACEWrapper, SimpleCompositeStage2
+     from utils.box_utils import boxes_to_mask_sequence
+     # from evaluation.metrics import compute_psnr, compute_ssim_video
+
+     if progress_callback:
+         progress_callback(0.1, "Loading video...")
+
+     frames = load_all_frames(video_path)
+     T, H, W, _ = frames.shape
+     keyboxes = {0: start_box, T - 1: end_box}
+
+     if progress_callback:
+         progress_callback(0.3, "Computing trajectory...")
+
+     # Stage 1: interpolate trajectory
+     # (cotracker optional — linear fine for insertion prototype)
+     pred_boxes = stage1_linear(keyboxes, T)
+
+     # Build masks
+     synthesis_masks = boxes_to_mask_sequence(pred_boxes, H, W)
+     # No inpainting mask — object wasn't in original video
+     inpaint_masks = np.zeros_like(synthesis_masks)
+
+     if progress_callback:
+         progress_callback(0.5, "Running video synthesis...")
+
+     if use_vace:
+         stage2 = VACEWrapper()
+         result = stage2.synthesize(
+             original_frames=frames,
+             synthesis_masks=synthesis_masks,
+             inpaint_masks=inpaint_masks,
+             first_frame_ref=edited_first_frame,  # ← Qwen-edited frame
+             text_prompt=prompt
+         )
+     else:
+         # Debug mode: simple alpha compositing
+         stage2 = SimpleCompositeStage2()
+         x1, y1, x2, y2 = [int(v) for v in start_box]
+         obj_crop = edited_first_frame[y1:y2, x1:x2]
+
+         # Build object mask from non-black pixels in crop
+         obj_mask = (obj_crop.sum(axis=2) > 10).astype(np.float32)
+
+         result = stage2.synthesize(
+             original_frames=frames,
+             synthesis_masks=synthesis_masks,
+             inpaint_masks=inpaint_masks,
+             object_crop=obj_crop,
+             object_mask=obj_mask
+         )
+
+     if progress_callback:
+         progress_callback(0.9, "Saving output...")
+
+     tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+     save_video(result, tmp.name)
+
+     psnr = compute_psnr(result, frames)
+     ssim = compute_ssim_video(result, frames)
+
+     metrics_text = (
+         f"**Insertion Result**\n"
+         f"- PSNR: {psnr:.2f} dB\n"
+         f"- SSIM: {ssim:.3f}\n\n"
+         f"**Settings**\n"
+         f"- First frame editor: Qwen/FLUX (run separately)\n"
+         f"- VACE synthesis: {'on' if use_vace else 'off (debug mode)'}\n"
+         f"- Frames: {T} | Resolution: {W}x{H}\n"
+     )
+
+     if progress_callback:
+         progress_callback(1.0, "Done!")
+
+     return tmp.name, result, pred_boxes, metrics_text
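
The metric helpers at the top of pipeline_adapter.py work on any pair of equally sized clips. A small sketch comparing two already-saved videos (file names are hypothetical):

# sketch: score an edited clip against the original
from utils.video_utils import load_video
from pipeline_adapter import compute_psnr, compute_ssim_video

original = load_video("input.mp4", max_frames=81)
edited = load_video("motion_edit.mp4", max_frames=81)   # must match original's size
print(f"PSNR: {compute_psnr(original, edited):.2f} dB")
print(f"SSIM: {compute_ssim_video(original, edited):.3f}")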
preview.py ADDED
@@ -0,0 +1,103 @@
+ # demo/preview.py
+ import numpy as np
+ import cv2
+ from visualizer import draw_box_on_frame, draw_trajectory_on_frame
+ from utils.box_utils import interpolate_boxes
+
+ def preview_trajectory(
+     first_frame: np.ndarray,   # [H, W, 3]
+     start_box: list,           # [x1, y1, x2, y2]
+     end_box: list,             # [x1, y1, x2, y2]
+     num_frames: int = 81
+ ) -> np.ndarray:
+     """
+     Shows the planned trajectory on the first frame BEFORE running.
+     User sees this immediately after drawing boxes — fast feedback.
+     """
+     keyboxes = {0: start_box, num_frames - 1: end_box}
+     boxes = interpolate_boxes(keyboxes, num_frames)
+
+     frame = first_frame.copy()
+
+     # Draw full trajectory path (center points)
+     centers = np.stack([
+         (boxes[:, 0] + boxes[:, 2]) / 2,
+         (boxes[:, 1] + boxes[:, 3]) / 2
+     ], axis=1).astype(int)
+
+     for i in range(1, len(centers)):
+         alpha = i / len(centers)
+         color = (
+             int(255 * (1 - alpha)),
+             int(200 * alpha),
+             255
+         )
+         cv2.line(frame,
+                  tuple(centers[i-1]),
+                  tuple(centers[i]),
+                  color, 2)
+
+     # Draw start box (solid yellow)
+     frame = draw_box_on_frame(
+         frame, start_box,
+         color=(255, 220, 0),
+         label="START",
+         dashed=False
+     )
+
+     # Draw end box (dashed yellow)
+     frame = draw_box_on_frame(
+         frame, end_box,
+         color=(255, 220, 0),
+         label="END",
+         dashed=True
+     )
+
+     # Draw a few intermediate boxes (faded)
+     for i in [20, 40, 60]:
+         if i < len(boxes):
+             frame = draw_box_on_frame(
+                 frame, boxes[i],
+                 color=(200, 200, 200),
+                 label=f"t={i}",
+                 dashed=True,
+                 thickness=1
+             )
+
+     return frame
+
+
+ def preview_trajectory_strip(
+     frames: np.ndarray,   # [T, H, W, 3]
+     start_box: list,
+     end_box: list,
+ ) -> np.ndarray:
+     """
+     Shows predicted box overlaid on 5 sampled frames.
+     Gives sense of how box moves through the video.
+     """
+     T = len(frames)
+     keyboxes = {0: start_box, T - 1: end_box}
+     boxes = interpolate_boxes(keyboxes, T)
+
+     sample_ts = [0, T//4, T//2, 3*T//4, T-1]
+     previews = []
+
+     for t in sample_ts:
+         frame = frames[t].copy()
+         frame = draw_box_on_frame(
+             frame, boxes[t],
+             color=(0, 255, 255),
+             label=f"t={t}",
+             dashed=(t > 0)
+         )
+         # Add small frame counter
+         H, W = frame.shape[:2]
+         progress = f"{t}/{T-1}"
+         cv2.putText(frame, progress, (W-80, H-10),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.5,
+                     (200, 200, 200), 1)
+         previews.append(frame)
+
+     return np.concatenate(previews, axis=1)  # horizontal strip
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # requirements.txt
+ torch>=2.1.0
+ torchvision
+ transformers>=4.40.0
+ git+https://github.com/huggingface/diffusers.git
+ torchao==0.11.0
+ peft
+ sentencepiece
+ opencv-python
+ numpy
+ scipy
+ Pillow
+ imageio[ffmpeg]
+ einops
+ accelerate
+
+ # Imported by app.py / pipeline_adapter.py:
+ gradio
+ scikit-image
+ lpips
+
+ # Install separately (need git clone):
+ # CoTracker3: github.com/facebookresearch/co-tracker
+ # SAM2: github.com/facebookresearch/segment-anything-2
+ # VACE: github.com/ali-vilab/VACE
+ # DA-v3: github.com/DepthAnything/Depth-Anything-V3
stage1_approx.py ADDED
@@ -0,0 +1,197 @@
+ # stage1_approx.py
+ import numpy as np
+ import torch
+ import cv2
+ from utils.box_utils import interpolate_boxes
+
+ # ── Option A: Pure Linear Interpolation ─────────────────────────────
+ # Best for: static camera or very slow camera movement
+ # Worst for: fast pans, zoom, handheld footage
+
+ def stage1_linear(
+     keyboxes: dict,
+     num_frames: int
+ ) -> np.ndarray:
+     """
+     Simplest possible Stage 1 substitute.
+     keyboxes: {frame_idx: [x1, y1, x2, y2]}
+     Returns: [T, 4] box sequence
+     """
+     return interpolate_boxes(keyboxes, num_frames, method="linear")
+
+
+ # ── Option B: DA-v3 Depth Warping ───────────────────────────────────
+ # Better for: moderate camera motion
+ # From Table 7: IoU=0.79, mAP=0.73 (vs TRACE 0.80, 0.91)
+ # Requires: DepthAnything-v3 + MegaSAM or RAFT optical flow
+
+ def stage1_depth_warp(
+     frames: np.ndarray,   # [T, H, W, 3]
+     keyboxes: dict,
+     depth_model,
+     flow_model=None
+ ) -> np.ndarray:
+     """
+     Project first-frame boxes to subsequent frames using depth + flow.
+     """
+     T, H, W, _ = frames.shape
+     first_frame = frames[0]
+
+     # Get depth for all frames
+     depths = []
+     for frame in frames:
+         d = depth_model.infer(frame)   # [H, W] depth map
+         depths.append(d)
+     depths = np.stack(depths)          # [T, H, W]
+
+     # Keyframes keep their user-provided boxes
+     result_boxes = np.zeros((T, 4))
+     for frame_idx, box in keyboxes.items():
+         result_boxes[frame_idx] = box
+
+     # For each unspecified frame, warp from nearest keybox
+     keyframe_ids = sorted(keyboxes.keys())
+
+     for t in range(T):
+         if t in keyboxes:
+             continue
+
+         # Find nearest keyframe
+         nearest_key = min(keyframe_ids, key=lambda k: abs(k - t))
+         ref_box = keyboxes[nearest_key]
+         ref_depth = depths[nearest_key]
+         tgt_depth = depths[t]
+
+         # Get depth at box center in reference frame
+         cx_ref = (ref_box[0] + ref_box[2]) / 2
+         cy_ref = (ref_box[1] + ref_box[3]) / 2
+         cx_ref_i, cy_ref_i = int(cx_ref), int(cy_ref)
+         d_ref = ref_depth[cy_ref_i, cx_ref_i]
+
+         # Use optical flow if available for center displacement
+         if flow_model is not None:
+             flow = flow_model.compute(
+                 frames[nearest_key], frames[t]
+             )   # [H, W, 2]
+             dx = flow[cy_ref_i, cx_ref_i, 0]
+             dy = flow[cy_ref_i, cx_ref_i, 1]
+         else:
+             dx, dy = 0, 0
+
+         # Warp center
+         cx_tgt = cx_ref + dx
+         cy_tgt = cy_ref + dy
+
+         # Scale box size by depth ratio
+         d_tgt = tgt_depth[int(cy_tgt), int(cx_tgt)]
+         scale = d_ref / (d_tgt + 1e-6)
+         bw = (ref_box[2] - ref_box[0]) * scale
+         bh = (ref_box[3] - ref_box[1]) * scale
+
+         result_boxes[t] = [
+             cx_tgt - bw/2, cy_tgt - bh/2,
+             cx_tgt + bw/2, cy_tgt + bh/2
+         ]
+
+     # Every frame is now filled: keyboxes kept as-is, the rest warped
+     return result_boxes
+
+
+ # ── Option C: CoTracker-Assisted Warping ────────────────────────────
+ # Best for: fast camera, most accurate without training
+ # Uses background point tracks to estimate camera motion
+
+ def stage1_cotracker(
+     frames: np.ndarray,   # [T, H, W, 3]
+     keyboxes: dict,
+     cotracker_model
+ ) -> np.ndarray:
+     """
+     Use CoTracker point tracks to estimate camera motion,
+     then warp keyboxes accordingly.
+     """
+     T, H, W, _ = frames.shape
+
+     # Build grid of background query points (avoid object region)
+     first_box = list(keyboxes.values())[0]
+
+     # Sample 100 background points (outside object box)
+     bg_points = _sample_background_points(
+         H, W, first_box, n_points=100
+     )   # [100, 2] (x, y)
+
+     # Track them across all frames
+     video_tensor = torch.from_numpy(frames).float()
+     video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0)
+     # [1, T, 3, H, W]
+
+     queries = torch.zeros(1, len(bg_points), 3)
+     queries[0, :, 0] = 0   # query at frame 0
+     queries[0, :, 1] = torch.from_numpy(bg_points[:, 0])   # x
+     queries[0, :, 2] = torch.from_numpy(bg_points[:, 1])   # y
+
+     # Run tracking on the same device as the model
+     device = next(cotracker_model.parameters()).device
+     video_tensor = video_tensor.to(device)
+     queries = queries.to(device)
+
+     with torch.no_grad():
+         tracks, visibility = cotracker_model(
+             video_tensor, queries=queries
+         )
+     # tracks: [1, T, N_points, 2]
+     tracks = tracks[0].cpu().numpy()   # [T, N, 2]
+
+     # Estimate per-frame homography from background tracks
+     result_boxes = np.zeros((T, 4))
+     ref_points = tracks[0]   # [N, 2] at frame 0
+
+     for t in range(T):
+         if t in keyboxes:
+             result_boxes[t] = keyboxes[t]
+             continue
+
+         # Find nearest keyframe
+         nearest_key = min(keyboxes.keys(), key=lambda k: abs(k-t))
+         ref_box = keyboxes[nearest_key]
+
+         # Estimate transformation from nearest keyframe to frame t
+         src_pts = tracks[nearest_key]   # [N, 2]
+         dst_pts = tracks[t]             # [N, 2]
+
+         H_mat, mask = cv2.findHomography(
+             src_pts, dst_pts, cv2.RANSAC, 5.0
+         )
+
+         if H_mat is None:
+             result_boxes[t] = ref_box
+             continue
+
+         # Warp box corners through homography
+         corners = np.array([
+             [ref_box[0], ref_box[1]],
+             [ref_box[2], ref_box[1]],
+             [ref_box[2], ref_box[3]],
+             [ref_box[0], ref_box[3]]
+         ], dtype=np.float32).reshape(-1, 1, 2)
+
+         warped = cv2.perspectiveTransform(corners, H_mat)
+         warped = warped.reshape(-1, 2)
+
+         result_boxes[t] = [
+             warped[:, 0].min(), warped[:, 1].min(),
+             warped[:, 0].max(), warped[:, 1].max()
+         ]
+
+     return result_boxes
+
+
+ def _sample_background_points(H, W, object_box, n_points=100):
+     """Sample points outside the object bounding box"""
+     x1, y1, x2, y2 = object_box
+     points = []
+     attempts = 0
+     while len(points) < n_points and attempts < n_points * 10:
+         x = np.random.randint(0, W)
+         y = np.random.randint(0, H)
+         if not (x1 <= x <= x2 and y1 <= y <= y2):
+             points.append([x, y])
+         attempts += 1
+     return np.array(points, dtype=np.float32)
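
A quick sketch of the Stage 1 output format: two keyboxes become a dense [T, 4] trajectory, which utils/box_utils.py turns into per-frame masks (the 81-frame length and 480x832 resolution are assumptions for illustration):

# sketch: dense trajectory + masks from two keyboxes
from stage1_approx import stage1_linear
from utils.box_utils import boxes_to_mask_sequence

keyboxes = {0: [100, 100, 200, 200], 80: [500, 200, 600, 300]}
boxes = stage1_linear(keyboxes, num_frames=81)        # [81, 4] interpolated boxes
masks = boxes_to_mask_sequence(boxes, H=480, W=832)   # [81, 480, 832] binary masks
print(boxes[40])                                      # box roughly halfway along the path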
stage2_vace.py ADDED
@@ -0,0 +1,148 @@
+ # stage2_vace.py
+ import numpy as np
+ import torch
+ import cv2
+ from PIL import Image
+
+ class VACEWrapper:
+     def __init__(self, device="cuda"):
+         # The Wan2.1-VACE checkpoint pairs with diffusers' WanVACEPipeline,
+         # which takes video + mask + reference_images conditioning.
+         from diffusers import WanVACEPipeline
+
+         self.device = device
+         self.pipe = WanVACEPipeline.from_pretrained(
+             "Wan-AI/Wan2.1-VACE-1.3B-diffusers",
+             torch_dtype=torch.bfloat16,
+         ).to(device)
+         self.pipe.enable_model_cpu_offload()
+
+
+     def synthesize(
+         self,
+         original_frames,
+         synthesis_masks,
+         inpaint_masks,
+         first_frame_ref,
+         text_prompt="",
+     ):
+         T, orig_H, orig_W, _ = original_frames.shape
+
+         # Round to nearest multiple of 16 (VACE requirement)
+         H = (orig_H // 16) * 16
+         W = (orig_W // 16) * 16
+
+         if H != orig_H or W != orig_W:
+             original_frames = np.stack([cv2.resize(f, (W, H)) for f in original_frames])
+             first_frame_ref = cv2.resize(first_frame_ref, (W, H))
+             synthesis_masks = np.stack([
+                 cv2.resize(m, (W, H), interpolation=cv2.INTER_NEAREST) for m in synthesis_masks
+             ])
+             inpaint_masks = np.stack([
+                 cv2.resize(m, (W, H), interpolation=cv2.INTER_NEAREST) for m in inpaint_masks
+             ])
+
+         video_pil = [Image.fromarray(f) for f in original_frames]
+         # Union of the "place here" and "erase here" masks, scaled to 0/255
+         # so white marks the region the pipeline should regenerate
+         combined = ((synthesis_masks + inpaint_masks) > 0).astype(np.uint8) * 255
+         mask_pil = [Image.fromarray(m) for m in combined]
+         ref_pil = Image.fromarray(first_frame_ref)
+
+         output = self.pipe(
+             video=video_pil,
+             mask=mask_pil,
+             prompt=text_prompt,
+             negative_prompt="static, blurry, low quality",
+             reference_images=[ref_pil],
+             num_frames=T,
+             height=H,
+             width=W,
+             guidance_scale=5.0,
+             num_inference_steps=25,
+         ).frames[0]
+
+         result = np.stack([np.array(f) for f in output], axis=0)
+
+         # Restore original resolution
+         if orig_H != H or orig_W != W:
+             result = np.stack([cv2.resize(f, (orig_W, orig_H)) for f in result])
+
+         return result
+
+
+ class SimpleCompositeStage2:
+     """
+     Fallback Stage 2: simple alpha compositing.
+     No diffusion model needed.
+     Works for: clean background, simple objects.
+     Quality: low but fast for debugging the pipeline.
+     """
+
+     def synthesize(
+         self,
+         original_frames: np.ndarray,   # [T, H, W, 3]
+         synthesis_masks: np.ndarray,   # [T, H, W]
+         inpaint_masks: np.ndarray,     # [T, H, W]
+         object_crop: np.ndarray,       # [H_obj, W_obj, 3]
+         object_mask: np.ndarray,       # [H_obj, W_obj] binary
+     ) -> np.ndarray:
+         """
+         Composite object into new positions using simple alpha blending.
+         Useful for validating box trajectory before diffusion.
+         """
+         T, H, W, _ = original_frames.shape
+         result = original_frames.copy()
+
+         for t in range(T):
+             # Find box from synthesis mask
+             mask_t = synthesis_masks[t]
+             ys, xs = np.where(mask_t > 0.5)
+             if len(ys) == 0:
+                 continue
+
+             y1, y2 = ys.min(), ys.max()
+             x1, x2 = xs.min(), xs.max()
+             bh, bw = y2 - y1, x2 - x1
+
+             if bh <= 0 or bw <= 0:
+                 continue
+
+             # Resize object to target box size
+             obj_resized = cv2.resize(
+                 object_crop, (bw, bh),
+                 interpolation=cv2.INTER_LINEAR
+             )
+             mask_resized = cv2.resize(
+                 object_mask.astype(np.float32), (bw, bh),
+                 interpolation=cv2.INTER_LINEAR
+             )
+             mask_3ch = mask_resized[:, :, None]
+
+             # Erase original position (simple fill with nearby bg)
+             erase_mask = inpaint_masks[t]
+             if erase_mask.sum() > 0:
+                 result[t] = _inpaint_simple(result[t], erase_mask)
+
+             # Composite object at new position
+             roi = result[t, y1:y2, x1:x2]
+             result[t, y1:y2, x1:x2] = (
+                 obj_resized * mask_3ch + roi * (1 - mask_3ch)
+             ).astype(np.uint8)
+
+         return result
+
+
+ def _inpaint_simple(frame: np.ndarray, mask: np.ndarray) -> np.ndarray:
+     """Simple telea inpainting for object removal"""
+     mask_uint8 = (mask * 255).astype(np.uint8)
+     return cv2.inpaint(frame, mask_uint8, 3, cv2.INPAINT_TELEA)
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (183 Bytes)
 
utils/__pycache__/box_utils.cpython-312.pyc ADDED
Binary file (3.2 kB)
 
utils/__pycache__/video_utils.cpython-312.pyc ADDED
Binary file (2.6 kB)
 
utils/box_utils.py ADDED
@@ -0,0 +1,65 @@
+ # utils/box_utils.py
+ import numpy as np
+ from scipy.interpolate import interp1d
+
+ def interpolate_boxes(
+     keyboxes: dict,          # {frame_idx: [x1, y1, x2, y2]}
+     num_frames: int,
+     method: str = "linear"   # "linear" or "cubic"
+ ) -> np.ndarray:
+     """
+     Interpolate sparse keyboxes to dense per-frame boxes.
+     Returns: [T, 4] float32
+     """
+     frame_ids = sorted(keyboxes.keys())
+     boxes = np.array([keyboxes[i] for i in frame_ids], dtype=np.float32)
+
+     # A single keybox has nothing to interpolate — repeat it for every frame
+     # (this happens when only the object's current position is given)
+     if len(frame_ids) == 1:
+         return np.tile(boxes[0], (num_frames, 1))
+
+     # Interpolate each coordinate separately
+     result = np.zeros((num_frames, 4), dtype=np.float32)
+     t_query = np.arange(num_frames)
+
+     for coord in range(4):
+         f = interp1d(
+             frame_ids,
+             boxes[:, coord],
+             kind=method,
+             fill_value="extrapolate"
+         )
+         result[:, coord] = f(t_query)
+
+     return result.clip(0, None)   # boxes can't be negative
+
+ def box_to_mask(
+     box: np.ndarray,   # [x1, y1, x2, y2]
+     H: int,
+     W: int
+ ) -> np.ndarray:
+     """
+     Convert bounding box to binary mask [H, W]
+     """
+     mask = np.zeros((H, W), dtype=np.float32)
+     x1, y1, x2, y2 = np.asarray(box).astype(int)   # accept lists or arrays
+     x1, x2 = np.clip([x1, x2], 0, W)
+     y1, y2 = np.clip([y1, y2], 0, H)
+     mask[y1:y2, x1:x2] = 1.0
+     return mask
+
+ def boxes_to_mask_sequence(
+     boxes: np.ndarray,   # [T, 4]
+     H: int,
+     W: int
+ ) -> np.ndarray:
+     """
+     Returns: [T, H, W] binary masks
+     """
+     T = len(boxes)
+     masks = np.zeros((T, H, W), dtype=np.float32)
+     for t, box in enumerate(boxes):
+         masks[t] = box_to_mask(box, H, W)
+     return masks
+
+ def expand_box(box: np.ndarray, padding: int = 10) -> np.ndarray:
+     """Expand box by padding pixels on each side"""
+     x1, y1, x2, y2 = box
+     return np.array([x1 - padding, y1 - padding,
+                      x2 + padding, y2 + padding])
utils/video_utils.py ADDED
@@ -0,0 +1,42 @@
+ # utils/video_utils.py
+ import cv2
+ import numpy as np
+ import imageio
+ import torch
+
+ def load_video(path: str, max_frames: int = 81) -> np.ndarray:
+     """
+     Returns: [T, H, W, 3] uint8 RGB array
+     """
+     cap = cv2.VideoCapture(path)
+     frames = []
+     while len(frames) < max_frames:
+         ret, frame = cap.read()
+         if not ret:
+             break
+         frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     cap.release()
+     return np.stack(frames)
+
+ def save_video(frames: np.ndarray, path: str, fps: int = 24):
+     """
+     frames: [T, H, W, 3] uint8 RGB
+     """
+     writer = imageio.get_writer(path, fps=fps)
+     for frame in frames:
+         writer.append_data(frame)
+     writer.close()
+
+ def frames_to_tensor(frames: np.ndarray) -> torch.Tensor:
+     """
+     [T, H, W, 3] uint8 → [T, 3, H, W] float32 in [-1, 1]
+     """
+     t = torch.from_numpy(frames).float() / 127.5 - 1.0
+     return t.permute(0, 3, 1, 2)
+
+ def tensor_to_frames(t: torch.Tensor) -> np.ndarray:
+     """
+     [T, 3, H, W] float32 in [-1, 1] → [T, H, W, 3] uint8
+     """
+     t = ((t + 1.0) * 127.5).clamp(0, 255)
+     return t.permute(0, 2, 3, 1).byte().numpy()
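
A round-trip sketch for these helpers (the input file name is a placeholder):

# sketch: load a clip, convert to a tensor and back, re-save it
from utils.video_utils import load_video, save_video, frames_to_tensor, tensor_to_frames

frames = load_video("input.mp4", max_frames=81)   # [T, H, W, 3] uint8
t = frames_to_tensor(frames)                      # [T, 3, H, W] float32 in [-1, 1]
restored = tensor_to_frames(t)                    # back to uint8 frames
save_video(restored, "roundtrip.mp4", fps=24)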
visualizer.py ADDED
@@ -0,0 +1,139 @@
+ # demo/visualizer.py
+ import numpy as np
+ import cv2
+ from typing import Optional
+
+ def draw_box_on_frame(
+     frame: np.ndarray,         # [H, W, 3] uint8 RGB
+     box: list,                 # [x1, y1, x2, y2]
+     color: tuple = (255, 255, 0),
+     label: str = "",
+     thickness: int = 2,
+     dashed: bool = False
+ ) -> np.ndarray:
+     """Draw a single bounding box on a frame"""
+     frame = frame.copy()
+     x1, y1, x2, y2 = [int(v) for v in box]
+
+     if dashed:
+         # Draw dashed rectangle manually
+         dash_len = 10
+         gap_len = 5
+         pts = [
+             ((x1, y1), (x2, y1)),   # top
+             ((x2, y1), (x2, y2)),   # right
+             ((x2, y2), (x1, y2)),   # bottom
+             ((x1, y2), (x1, y1)),   # left
+         ]
+         for (px1, py1), (px2, py2) in pts:
+             dx = px2 - px1
+             dy = py2 - py1
+             dist = max(abs(dx), abs(dy))
+             if dist == 0:
+                 continue
+             for i in range(0, dist, dash_len + gap_len):
+                 s = i / dist
+                 e = min(i + dash_len, dist) / dist
+                 sx = int(px1 + s * dx)
+                 sy = int(py1 + s * dy)
+                 ex = int(px1 + e * dx)
+                 ey = int(py1 + e * dy)
+                 cv2.line(frame, (sx, sy), (ex, ey), color, thickness)
+     else:
+         cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
+
+     if label:
+         cv2.putText(
+             frame, label,
+             (x1, max(y1 - 8, 12)),
+             cv2.FONT_HERSHEY_SIMPLEX,
+             0.6, color, 2
+         )
+
+     return frame
+
+
+ def draw_trajectory_on_frame(
+     frame: np.ndarray,
+     boxes: np.ndarray,    # [T, 4] — full trajectory
+     current_t: int,
+     color: tuple = (255, 200, 0)
+ ) -> np.ndarray:
+     """
+     Draw the motion path (center points) up to current frame.
+     Gives a visual "trail" showing where the object came from.
+     """
+     frame = frame.copy()
+     centers = np.stack([
+         (boxes[:, 0] + boxes[:, 2]) / 2,
+         (boxes[:, 1] + boxes[:, 3]) / 2
+     ], axis=1).astype(int)
+
+     # Draw path line
+     for i in range(1, current_t + 1):
+         alpha = i / (current_t + 1)   # fade older points
+         c = tuple(int(v * alpha) for v in color)
+         cv2.line(
+             frame,
+             tuple(centers[i-1]),
+             tuple(centers[i]),
+             c, 2
+         )
+
+     # Draw current center dot
+     cv2.circle(frame, tuple(centers[current_t]), 5, color, -1)
+
+     return frame
+
+
+ def create_comparison_strip(
+     original: np.ndarray,     # [T, H, W, 3]
+     result: np.ndarray,       # [T, H, W, 3]
+     pred_boxes: np.ndarray,   # [T, 4]
+     sample_ts: list = None    # which frames to show
+ ) -> np.ndarray:
+     """
+     Creates a comparison grid: one row per sampled frame,
+     each row showing Original | Result | Diff side by side.
+     """
+     T = len(original)
+     if sample_ts is None:
+         sample_ts = [0, T//4, T//2, 3*T//4, T-1]
+
+     rows = []
+     for t in sample_ts:
+         orig_t = original[t].copy()
+         res_t = result[t].copy()
+
+         # Draw box on result
+         res_t = draw_box_on_frame(
+             res_t, pred_boxes[t],
+             color=(0, 255, 0),
+             label=f"t={t}"
+         )
+
+         # Amplified diff
+         diff_t = np.abs(
+             orig_t.astype(np.int32) - result[t].astype(np.int32)
+         )
+         diff_t = (diff_t * 4).clip(0, 255).astype(np.uint8)
+
+         # Add labels
+         def add_label(img, text):
+             img = img.copy()
+             cv2.putText(img, text, (10, 25),
+                         cv2.FONT_HERSHEY_SIMPLEX, 0.7,
+                         (255, 255, 255), 2)
+             cv2.putText(img, text, (10, 25),
+                         cv2.FONT_HERSHEY_SIMPLEX, 0.7,
+                         (0, 0, 0), 1)
+             return img
+
+         orig_t = add_label(orig_t, "Original")
+         res_t = add_label(res_t, "Result")
+         diff_t = add_label(diff_t, "Diff x4")
+
+         row = np.concatenate([orig_t, res_t, diff_t], axis=1)
+         rows.append(row)
+
+     return np.concatenate(rows, axis=0)
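
A sketch of building the comparison strip offline from two saved clips (paths and boxes are hypothetical, and both clips are assumed to share the same resolution and length):

# sketch: offline before/after comparison image
import imageio
from utils.video_utils import load_video
from stage1_approx import stage1_linear
from visualizer import create_comparison_strip

original = load_video("input.mp4", max_frames=81)
result = load_video("motion_edit.mp4", max_frames=81)
boxes = stage1_linear({0: [100, 100, 200, 200], 80: [500, 200, 600, 300]}, len(original))
strip = create_comparison_strip(original, result, boxes)
imageio.imwrite("comparison.png", strip)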