TYTTYTTYT
/

zf_qwen3_vl_processor

Transformers

Model card Files Files and versions

xet

Community

TYTTYTTYT committed on Mar 6

Commit

9e7bf26

verified ·

1 Parent(s): da73575

update the processor to the working version for videomme

Browse files

Files changed (1) hide show

chunk_utils.py +25 -7

chunk_utils.py CHANGED Viewed

@@ -74,7 +74,7 @@ def _visual_token_cums(
         else:
             if not in_video:
-                cums.append(ChunkCum(cum=cum, image_grid_thw=None, video_grid_thw=None))
             else:
                 cum += 1
             token_idx += 1
@@ -115,6 +115,24 @@ class Chunk:
     video_grid_thws: list[tuple[int, int, int]]
 def chunk_tokens(
     max_chunk_size: int,
     input_ids: torch.Tensor | np.ndarray,
@@ -147,22 +165,22 @@ def chunk_tokens(
         current_video_grid_thws: list[tuple[int, int, int]] = []
         for cum in sequence_cums:
-            if cum.image_grid_thw is not None:
-                current_image_grid_thws.append(cum.image_grid_thw)
-            if cum.video_grid_thw is not None:
-                current_video_grid_thws.append(cum.video_grid_thw)
             if current_chunk_size + cum.cum > max_chunk_size:
                 chunks.append(Chunk(
                     start=current_chunk_start,
                     end=current_chunk_start + current_chunk_size,
                     image_grid_thws=current_image_grid_thws,
-                    video_grid_thws=current_video_grid_thws
                 ))
                 current_chunk_start += current_chunk_size
                 current_chunk_size = 0
                 current_image_grid_thws = []
                 current_video_grid_thws = []
             current_chunk_size += cum.cum
         if current_chunk_size > 0:
@@ -170,7 +188,7 @@ def chunk_tokens(
                 start=current_chunk_start,
                 end=current_chunk_start + current_chunk_size,
                 image_grid_thws=current_image_grid_thws,
-                video_grid_thws=current_video_grid_thws,
             ))
         chunked_cums.append(chunks)

         else:
             if not in_video:
+                cums.append(ChunkCum(cum=1, image_grid_thw=None, video_grid_thw=None))
             else:
                 cum += 1
             token_idx += 1
     video_grid_thws: list[tuple[int, int, int]]
+def _merge_video_grid_thws(
+    thws: list[tuple[int, int, int]],
+) -> list[tuple[int, int, int]]:
+    """Merge consecutive video grid_thws that share the same (h, w)."""
+    if not thws:
+        return thws
+    merged: list[tuple[int, int, int]] = []
+    cur_t, cur_h, cur_w = thws[0]
+    for t, h, w in thws[1:]:
+        if h == cur_h and w == cur_w:
+            cur_t += t
+        else:
+            merged.append((cur_t, cur_h, cur_w))
+            cur_t, cur_h, cur_w = t, h, w
+    merged.append((cur_t, cur_h, cur_w))
+    return merged
 def chunk_tokens(
     max_chunk_size: int,
     input_ids: torch.Tensor | np.ndarray,
         current_video_grid_thws: list[tuple[int, int, int]] = []
         for cum in sequence_cums:
             if current_chunk_size + cum.cum > max_chunk_size:
                 chunks.append(Chunk(
                     start=current_chunk_start,
                     end=current_chunk_start + current_chunk_size,
                     image_grid_thws=current_image_grid_thws,
+                    video_grid_thws=_merge_video_grid_thws(current_video_grid_thws),
                 ))
                 current_chunk_start += current_chunk_size
                 current_chunk_size = 0
                 current_image_grid_thws = []
                 current_video_grid_thws = []
+            if cum.image_grid_thw is not None:
+                current_image_grid_thws.append(cum.image_grid_thw)
+            if cum.video_grid_thw is not None:
+                current_video_grid_thws.append(cum.video_grid_thw)
             current_chunk_size += cum.cum
         if current_chunk_size > 0:
                 start=current_chunk_start,
                 end=current_chunk_start + current_chunk_size,
                 image_grid_thws=current_image_grid_thws,
+                video_grid_thws=_merge_video_grid_thws(current_video_grid_thws),
             ))
         chunked_cums.append(chunks)