ameroyer committed on
Commit 8a1bc81 · verified · 0 Parent(s):

Super-squash branch 'main' using huggingface_hub

.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Notice ADDED
@@ -0,0 +1,2 @@
1
+ CASA-Qwen2_5-VL-3B-LiveCC is finetuned from Qwen2.5-VL-3B with additional CASA layers.
2
+ Qwen is licensed under the Qwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved.
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ datasets:
3
+ - chenjoya/Live-WhisperX-526K
4
+ language:
5
+ - en
6
+ base_model:
7
+ - Qwen/Qwen2.5-VL-3B-Instruct
8
+ pipeline_tag: video-text-to-text
9
+ license: cc-by-nc-sa-4.0
10
+ ---
11
+ Please refer to the [main model card](https://huggingface.co/kyutai/CASA-Helium1-VL-2B) for more information and instructions to run.
12
+
13
+ This model page contains model weights for `CASA-Qwen2_5-VL-3B-LiveCC`, a Qwen2.5-VL model adapted from token-insertion to cross-attention-based image fusion using CASA layers, and further finetuned on LiveCC for live video captioning. We provide weights for the other CASA models in the associated model collection.
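For convenience, a minimal loading sketch (not the documented pipeline; the repo id, the `auto_map` entries from `config.json`, and the use of the base Qwen2.5-VL processor are assumptions — follow the main model card for the supported inference instructions):

```python
# Hypothetical quick-start: load the CASA model via the custom classes registered in config.json.
import torch
from transformers import AutoConfig, AutoModel, AutoProcessor

repo_id = "kyutai/CASA-Qwen2_5-VL-3B-LiveCC"  # assumed repo id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16)
# Assumption: preprocessing follows the base Qwen2.5-VL-3B-Instruct processor.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

print(type(model).__name__, config.casa_attention, len(config.xa_layers))
```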
__init__.py ADDED
File without changes
casa_attention.py ADDED
@@ -0,0 +1,1010 @@
1
+ """CASA layers"""
2
+
3
+ import bisect
4
+ from dataclasses import dataclass
5
+ from itertools import accumulate
6
+ from typing import TYPE_CHECKING, Callable, Literal, Sequence, TypedDict, overload
7
+ from typing import cast as type_cast
8
+
9
+ import torch
10
+ from transformers.configuration_utils import PretrainedConfig
11
+
12
+ from .utils import StreamingModule, StreamingState, delta_w_factory
13
+
14
+ if TYPE_CHECKING:
15
+ from transformers.configuration_utils import PretrainedConfig
16
+
17
+ try:
18
+ from flash_attn import flash_attn_varlen_func
19
+ except ImportError:
20
+ flash_attn_varlen_func = None # type: ignore
21
+
22
+
23
+ WindowsComputeKwargs = TypedDict(
24
+ "WindowsComputeKwargs",
25
+ {
26
+ "num_post_image_tokens": int,
27
+ "num_pre_image_tokens": int,
28
+ },
29
+ total=False,
30
+ )
31
+
32
+
33
+ def __split_n_merge__(
34
+ x: torch.Tensor,
35
+ sample_lengths: list[int],
36
+ padding_side: Literal["left", "right"] = "right",
37
+ pad_value: int | float | bool = 0,
38
+ ) -> torch.Tensor:
39
+ max_sample_length = max(sample_lengths)
40
+ pad_tuple = tuple(0 for _ in range((x.ndim - 1) * 2))
41
+ return torch.stack(
42
+ [
43
+ torch.nn.functional.pad(
44
+ _x,
45
+ pad_tuple + (0, max_sample_length - _x.shape[0])
46
+ if padding_side == "right"
47
+ else pad_tuple + (max_sample_length - _x.shape[0], 0),
48
+ value=pad_value,
49
+ )
50
+ for _x in torch.split(x, sample_lengths, dim=0)
51
+ ],
52
+ dim=0,
53
+ )
54
+
55
+
56
+ @overload
57
+ def insert_image_tokens(
58
+ inputs_embeds: torch.Tensor,
59
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
60
+ image_embeds_insertion_points: list[torch.Tensor],
61
+ recover_batch_dim: Literal[True],
62
+ attention_mask: torch.Tensor | None = None,
63
+ padding_side: Literal["left", "right"] = "right",
64
+ keep_only_attended: bool = False,
65
+ pad_output: int | float | bool = 0.0,
66
+ ) -> tuple[
67
+ torch.Tensor,
68
+ None,
69
+ torch.Tensor | None,
70
+ torch.Tensor,
71
+ ]: ...
72
+ @overload
73
+ def insert_image_tokens(
74
+ inputs_embeds: torch.Tensor,
75
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
76
+ image_embeds_insertion_points: list[torch.Tensor],
77
+ recover_batch_dim: Literal[False],
78
+ attention_mask: torch.Tensor | None = None,
79
+ padding_side: Literal["left", "right"] = "right",
80
+ keep_only_attended: bool = False,
81
+ pad_output: int | float | bool = 0.0,
82
+ ) -> tuple[
83
+ torch.Tensor,
84
+ list[int],
85
+ torch.Tensor | None,
86
+ torch.Tensor,
87
+ ]: ...
88
+ def insert_image_tokens(
89
+ inputs_embeds: torch.Tensor,
90
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
91
+ image_embeds_insertion_points: list[torch.Tensor],
92
+ recover_batch_dim: bool = True,
93
+ attention_mask: torch.Tensor | None = None,
94
+ padding_side: Literal["left", "right"] = "right",
95
+ keep_only_attended: bool = False,
96
+ pad_output: int | float | bool = 0.0,
97
+ ) -> tuple[
98
+ torch.Tensor,
99
+ list[int] | None,
100
+ torch.Tensor | None,
101
+ torch.Tensor,
102
+ ]:
103
+ """
104
+ Insert image embeddings into text embeddings
105
+
106
+ Args:
107
+ inputs_embeds (torch.Tensor): (B, S, D) input token embeddings.
108
+ image_embeds (torch.Tensor | list[torch.Tensor]): (N_images, Nt, D) | List[(Nt, D)] image token embeddings.
109
+ image_embeds_insertion_points (list[torch.Tensor]): Insertion indices.
110
+ attention_mask (torch.Tensor, optional): (B, S) attention mask.
111
+ padding_side (Literal["left", "right"]): Padding scheme. Controls behavior for padded images.
112
+ recover_batch_dim (bool): Whether to re-batch (and pad) the fused sequence, or keep it flattened and return per-sample lengths.
113
+ keep_only_attended: This is only applicable when recover_batch_dim is False; whether to
114
+ remove any non-attended tokens in the whole array. In this case, the attention
115
+ mask returned is **still the original one**, so we can remember which indices have been
116
+ removed
117
+ Returns:
118
+ output (torch.Tensor): (B, S + Ni * Nt, D) fused sequence, or (sum(sample_lengths), D) when recover_batch_dim is False.
119
+ sample_lengths (list[int] | None): per-sample lengths of the fused (text + image) sequence.
120
+ attention_mask (torch.Tensor): Updated mask, 1 for text and image tokens, 0 for text padding.
121
+ image_tokens_mask (torch.Tensor): (B, S + Ni * Nt, 1), marks image token positions.
122
+ """
123
+ if isinstance(image_embeds, list) and len(image_embeds) == 0:
124
+ batch_size, text_seq_length, token_dim = inputs_embeds.shape
125
+ if recover_batch_dim:
126
+ return (
127
+ inputs_embeds,
128
+ None,
129
+ attention_mask,
130
+ torch.zeros((batch_size, text_seq_length, 1), dtype=torch.bool),
131
+ )
132
+ else:
133
+ flattened_seq_length = inputs_embeds.shape[0] * inputs_embeds.shape[1]
134
+ return (
135
+ torch.reshape(inputs_embeds, (flattened_seq_length, inputs_embeds.shape[2])),
136
+ [text_seq_length] * inputs_embeds.shape[0],
137
+ attention_mask.flatten() if attention_mask is not None else None,
138
+ torch.zeros((flattened_seq_length, 1), dtype=torch.bool),
139
+ )
140
+
141
+ # Sanity checks
142
+ if isinstance(image_embeds, torch.Tensor):
143
+ assert inputs_embeds.shape[-1] == image_embeds.shape[-1]
144
+ else:
145
+ assert all(inputs_embeds.shape[-1] == _x.shape[-1] for _x in image_embeds)
146
+
147
+ batch_size, text_seq_length, token_dim = inputs_embeds.shape
148
+ image_seq_length = [x.shape[0] for x in image_embeds]
149
+
150
+ # Flatten insertion points
151
+ insertion_offset = []
152
+ counter, offset_from_text, offset_from_image = 0, 0, 0
153
+ for sample in image_embeds_insertion_points:
154
+ for pt in sample:
155
+ insertion_offset.append(pt + offset_from_image + offset_from_text)
156
+ offset_from_image += image_seq_length[counter]
157
+ counter += 1
158
+ offset_from_text += text_seq_length
159
+ image_insert_positions = [
160
+ x for idx, pt in enumerate(insertion_offset) for x in range(pt, pt + image_seq_length[idx])
161
+ ]
162
+
163
+ # Flatten image embeds
164
+ if isinstance(image_embeds, list):
165
+ image_embeds = torch.cat(image_embeds, dim=0)
166
+ else:
167
+ image_embeds = type_cast(torch.Tensor, image_embeds)
168
+ image_embeds = torch.reshape(image_embeds, (-1, token_dim))
169
+
170
+ # Flatten text embeds across batch dim (B x S, D)
171
+ inputs_embeds = torch.reshape(inputs_embeds, (-1, token_dim))
172
+ flattened_seq_length = inputs_embeds.shape[0] + sum(image_seq_length)
173
+ text_insert_positions = sorted(
174
+ set(range(flattened_seq_length)).difference(set(image_insert_positions))
175
+ )
176
+
177
+ # Scatter image embeds in the flattened dict
178
+ # scatter text related stuff
179
+ output = torch.empty(
180
+ (flattened_seq_length, token_dim),
181
+ device=inputs_embeds.device,
182
+ dtype=inputs_embeds.dtype,
183
+ )
184
+ txt_positions_tensor = torch.Tensor(text_insert_positions).to(
185
+ dtype=torch.long, device=inputs_embeds.device
186
+ )
187
+ output.scatter_(0, txt_positions_tensor[:, None].expand(-1, token_dim), inputs_embeds)
188
+ attention_mask_new: torch.Tensor | None = None
189
+ if attention_mask is not None:
190
+ attention_mask_new = torch.ones(
191
+ (flattened_seq_length,), dtype=torch.bool, device=inputs_embeds.device
192
+ )
193
+ attention_mask_new.scatter_(
194
+ 0, txt_positions_tensor, attention_mask.flatten().to(torch.bool)
195
+ )
196
+
197
+ # scatter image related stuff
198
+ image_tokens_mask = torch.zeros(
199
+ (flattened_seq_length,), dtype=torch.bool, device=inputs_embeds.device
200
+ )
201
+ img_positions_tensor = torch.Tensor(image_insert_positions).to(
202
+ device=inputs_embeds.device, dtype=torch.long
203
+ )
204
+ output.scatter_(0, img_positions_tensor[:, None].expand(-1, token_dim), image_embeds)
205
+ image_tokens_mask.scatter_(0, img_positions_tensor, True)
206
+
207
+ # Compute expected sample length, taking into account the real batch
208
+ # i.e. recover the batch dimension of image embeddings
209
+ sample_lengths = []
210
+ counter = 0
211
+ for sample_idx, pts in enumerate(image_embeds_insertion_points):
212
+ num_image_tokens = 0
213
+ for _ in pts:
214
+ num_image_tokens += image_seq_length[counter]
215
+ counter += 1
216
+ if keep_only_attended and attention_mask is not None:
217
+ attended_seq_length = torch.sum(attention_mask[sample_idx]).cpu().item()
218
+ sample_lengths.append(attended_seq_length + num_image_tokens)
219
+ else:
220
+ sample_lengths.append(text_seq_length + num_image_tokens)
221
+
222
+ # For CASA attention, we can keep everything flattened and return
223
+ # the sample_lengths for the blockwise attention
224
+ if not recover_batch_dim:
225
+ if keep_only_attended and attention_mask_new is not None:
226
+ output = output[attention_mask_new]
227
+ image_tokens_mask = image_tokens_mask[attention_mask_new]
228
+ return output, sample_lengths, attention_mask_new, image_tokens_mask[..., None]
229
+
230
+ # Otherwise, time to (pad) and reshape
231
+ # Easy case: everything has the same length
232
+ if all(x == sample_lengths[0] for x in sample_lengths):
233
+ output = torch.reshape(output, (batch_size, sample_lengths[0], token_dim))
234
+ image_tokens_mask = torch.reshape(image_tokens_mask, (batch_size, sample_lengths[0], 1))
235
+ if attention_mask_new is not None:
236
+ attention_mask_new = torch.reshape(attention_mask_new, (batch_size, sample_lengths[0]))
237
+ # if there is any size mismatch we break into a
238
+ # list and pad again
239
+ else:
240
+ # split and merge
241
+ output = __split_n_merge__(output, sample_lengths, padding_side, pad_value=pad_output)
242
+ # note that the extra padding tokens are also marked as image tokens to be removed later
243
+ image_tokens_mask = __split_n_merge__(
244
+ image_tokens_mask, sample_lengths, padding_side, True
245
+ )[:, :, None]
246
+ if attention_mask_new is not None:
247
+ attention_mask_new = __split_n_merge__(
248
+ attention_mask_new, sample_lengths, padding_side, 0
249
+ )
250
+ # Return
251
+ return output, sample_lengths, attention_mask_new, image_tokens_mask
252
+
253
+
254
+ def get_sample_lengths_from_insertion_points(
255
+ image_embeds_insertion_points: list[torch.Tensor],
256
+ image_embeds: torch.Tensor | list[torch.Tensor] | None,
257
+ total_seq_len: int | None = None,
258
+ attention_mask: torch.Tensor | None = None,
259
+ **kwargs: WindowsComputeKwargs,
260
+ ) -> tuple[list[tuple[int, bool]], list[int]]:
261
+ """Compute sample lengths as if each image insertion point defines a
262
+ new document (as with a document-ID attention mask)
263
+ """
264
+ num_post_image_tokens = type_cast(int, kwargs.get("num_post_image_tokens", 0))
265
+ num_pre_image_tokens = type_cast(int, kwargs.get("num_pre_image_tokens", 0))
266
+ squashed_samples_lengths = type_cast(
267
+ list[list[int]] | None, kwargs.get("squashed_samples_lengths", None)
268
+ )
269
+ if squashed_samples_lengths is not None:
270
+ assert len(squashed_samples_lengths) == len(image_embeds_insertion_points)
271
+
272
+ def __insert_next_sample__(
273
+ batch_idx: int, insrt_pt: int, last_insrt_pt: int, end_of_batch_sample: bool = False
274
+ ) -> None:
275
+ nonlocal attention_mask
276
+ nonlocal text_sample_lengths, full_sample_lengths
277
+ nonlocal cum_samples_lengths, current_image_offset
278
+ nonlocal last_image_idx, current_image_idx, current_length
279
+ # Add the sample between [last_insrt_pt, insrt_pt] with breaks in
280
+ # between any squashed samples we find on the way
281
+ start_pt = bisect.bisect_left(cum_samples_lengths, last_insrt_pt)
282
+ added_sample = False
283
+ for end_of_sample in cum_samples_lengths[start_pt:]:
284
+ # we will break the loop at the end when end_of_sample = insrt_pt
285
+ end_of_sample = min(end_of_sample, insrt_pt)
286
+
287
+ # Add between [last_insrt_pt, end_of_sample]
288
+ current_length = end_of_sample - last_insrt_pt
289
+ if attention_mask is not None:
290
+ current_length -= int(
291
+ torch.sum(~attention_mask[batch_idx, last_insrt_pt:end_of_sample]).item()
292
+ )
293
+ if current_length > 0:
294
+ added_sample = True
295
+ text_sample_lengths.append(
296
+ (current_length, end_of_batch_sample and insrt_pt == end_of_sample)
297
+ )
298
+ # add image tokens to current_length
299
+ if current_image_idx > 0 and image_embeds is not None:
300
+ images_in_sample = [
301
+ img_idx
302
+ for img_idx in range(last_image_idx, current_image_idx)
303
+ if img_idx < len(image_embeds_insertion_points[batch_idx])
304
+ and last_insrt_pt
305
+ <= image_embeds_insertion_points[batch_idx][img_idx]
306
+ < end_of_sample
307
+ ]
308
+ if len(images_in_sample) > 0:
309
+ num_image_tokens = sum(
310
+ _x.shape[0]
311
+ for _x in image_embeds[
312
+ current_image_offset + images_in_sample[0] : current_image_offset
313
+ + images_in_sample[-1]
314
+ + 1
315
+ ]
316
+ )
317
+ current_length += num_image_tokens
318
+ full_sample_lengths.append(current_length)
319
+
320
+ # prepare for next loop
321
+ last_insrt_pt = end_of_sample
322
+ if end_of_sample == insrt_pt:
323
+ break
324
+ # End of loop: Catching weird use case where we may end up on a span
325
+ # full of padding tokens which will not get added due to current_length > 0
326
+ if end_of_batch_sample:
327
+ assert added_sample, "Weird edge case. Don't do that, thank you"
328
+ text_sample_lengths[-1] = (text_sample_lengths[-1][0], True)
329
+
330
+ # End of loop: Catching weird use case where we may end up on a span
331
+ # full of padding tokens which will not get added due to current_length > 0
332
+ if end_of_batch_sample:
333
+ assert added_sample, "Weird edge case. Don't do that, thank you"
334
+ text_sample_lengths[-1] = (text_sample_lengths[-1][0], True)
335
+
336
+ current_image_offset = 0
337
+ text_sample_lengths, full_sample_lengths = [], []
338
+ cum_samples_lengths: list[int] = []
339
+ current_length, last_insrt_pt, last_image_idx, current_image_idx = 0, 0, 0, 0
340
+ for batch_idx, pts in enumerate(image_embeds_insertion_points):
341
+ if squashed_samples_lengths is not None:
342
+ cum_samples_lengths = list(accumulate(squashed_samples_lengths[batch_idx]))
343
+ else:
344
+ assert total_seq_len is not None
345
+ cum_samples_lengths = [total_seq_len]
346
+
347
+ for current_image_idx, insrt_pt in enumerate(pts.cpu().tolist()):
348
+ # check if the images are consecutive, in which case we want
349
+ # them to belong to the same window
350
+ if current_image_idx >= 1 and insrt_pt == (
351
+ image_embeds_insertion_points[batch_idx][current_image_idx - 1]
352
+ + num_pre_image_tokens
353
+ + num_post_image_tokens
354
+ ):
355
+ continue
356
+ # Otherwise, we found a new sample
357
+ # not very important but for completeness: the insertion points come *after*
358
+ # the pre-image tokens per design but for the document-id mask it is more consistent to
359
+ # have them correspond to the same image
360
+ insrt_pt -= num_pre_image_tokens
361
+
362
+ # Update text and full sample lengths
363
+ if insrt_pt > last_insrt_pt:
364
+ __insert_next_sample__(
365
+ batch_idx, insrt_pt, last_insrt_pt, end_of_batch_sample=False
366
+ )
367
+ last_image_idx = current_image_idx
368
+ last_insrt_pt = insrt_pt
369
+
370
+ # End of batch: add sample in progress and reset
371
+ current_image_idx += 1
372
+ if cum_samples_lengths[-1] > last_insrt_pt:
373
+ __insert_next_sample__(
374
+ batch_idx, cum_samples_lengths[-1], last_insrt_pt, end_of_batch_sample=True
375
+ )
376
+ current_length, last_insrt_pt, last_image_idx, current_image_idx = 0, 0, 0, 0
377
+ current_image_offset += len(pts)
378
+
379
+ # Sanity check that the is_eob flags are correctly placed
380
+ assert sum(_x[1] for _x in text_sample_lengths) == len(image_embeds_insertion_points), (
381
+ f"Number of eob markers ({sum(_x[1] for _x in text_sample_lengths)}) differs"
382
+ f" from original batch size ({len(image_embeds_insertion_points)})"
383
+ )
384
+ return text_sample_lengths, full_sample_lengths
385
+
386
+
387
+ class CASAAttentionHandler:
388
+ def __init__(
389
+ self,
390
+ inputs_embeds: torch.Tensor,
391
+ image_embeds: torch.Tensor | list[torch.Tensor],
392
+ image_embeds_insertion_points: list[torch.Tensor],
393
+ attention_mask: torch.Tensor | None = None,
394
+ rope_fn: Callable | None = None,
395
+ windows: Literal["batch", "squashed", "images", "turn_based"] = "images",
396
+ use_asymetric_q_kv: bool = True,
397
+ casa_windows_info: None | dict = None,
398
+ ):
399
+ """Initialize the structure holding the query buffer for CASA attention layers
400
+ (ie the **flattened** text+image inserted tokens).
401
+ Note that this structure is shared across all casa layers, and it gets updated
402
+ with the current hidden states at every layer; this is merely a buffer to keep
403
+ scatter_ operations in-place as much as possible.
404
+
405
+ In this module, the embeddings related values (image_tokens_mask,
406
+ text_sample_lengths etc) are stored under the assumption of a tensor
407
+ which is *flatened* and *witout padding tokens*
408
+ Only the attention mask is kept as-is (text-only, batched, padded) to
409
+ be able to recover original shapes when needed
410
+ """
411
+ super().__init__()
412
+ assert windows == "images" # for inference code release
413
+ # Note 1: Unless overridden, text/full_sample_lengths are defined such that one
414
+ # document = one sample in the batch
415
+ if attention_mask is None:
416
+ text_sample_lengths = [(_x.shape[0], True) for _x in inputs_embeds]
417
+ else:
418
+ text_sample_lengths = [(int(torch.sum(_x).item()), True) for _x in attention_mask]
419
+ (
420
+ full_inputs_embeds,
421
+ full_sample_lengths,
422
+ # Full attention mask is only needed at inference to
423
+ # flatten the KV-Cache and remove padding tokens
424
+ _,
425
+ self.image_tokens_mask,
426
+ ) = insert_image_tokens(
427
+ inputs_embeds=inputs_embeds,
428
+ image_embeds=image_embeds,
429
+ image_embeds_insertion_points=image_embeds_insertion_points,
430
+ attention_mask=attention_mask,
431
+ recover_batch_dim=False,
432
+ keep_only_attended=attention_mask is not None,
433
+ )
434
+ assert self.image_tokens_mask.ndim == 2
435
+ self.image_embeds = image_embeds
436
+ self.image_embeds_insertion_points = image_embeds_insertion_points
437
+ self.attention_mask = None if attention_mask is None else attention_mask.bool()
438
+ self.use_asymetric_qkv = use_asymetric_q_kv
439
+ # At inference, we have to use asymmetric QKV for efficiency
440
+ if self.attention_mask is not None:
441
+ self.use_asymetric_qkv = True
442
+
443
+ # Build CASA windows
444
+ assert casa_windows_info is not None
445
+ text_sample_lengths, full_sample_lengths = get_sample_lengths_from_insertion_points(
446
+ image_embeds_insertion_points=image_embeds_insertion_points,
447
+ image_embeds=image_embeds,
448
+ total_seq_len=inputs_embeds.shape[1],
449
+ attention_mask=self.attention_mask,
450
+ **casa_windows_info, # pyright: ignore
451
+ )
452
+
453
+ # Sanity checks on the sample lengths
454
+ self.text_sample_lengths = [(int(s), eob) for s, eob in text_sample_lengths if s > 0]
455
+ self.full_sample_lengths = [int(s) for s in full_sample_lengths if s > 0]
456
+
457
+ assert len(self.text_sample_lengths) == len(self.full_sample_lengths), (
458
+ f"Sanity check failed; text sample lengths {len(self.text_sample_lengths)}"
459
+ f" != full sample lengths {len(self.full_sample_lengths)}"
460
+ )
461
+ if self.attention_mask is None:
462
+ num_unpadded_text_tokens = inputs_embeds.shape[0] * inputs_embeds.shape[1]
463
+ else:
464
+ num_unpadded_text_tokens = int(
465
+ torch.sum(type_cast(torch.Tensor, attention_mask)).item()
466
+ )
467
+ assert sum(_x[0] for _x in self.text_sample_lengths) == num_unpadded_text_tokens, (
468
+ f"Sanity check failed; sample lengths {sum(self.full_sample_lengths)} != {full_inputs_embeds.shape[0]}"
469
+ )
470
+ assert sum(self.full_sample_lengths) == full_inputs_embeds.shape[0], (
471
+ f"Sanity check failed; sample lengths {sum(self.full_sample_lengths)} != {full_inputs_embeds.shape[0]}"
472
+ )
473
+
474
+ # Finally we can compute cu_seqlen based on sample lengths
475
+ self.max_seqlen_q = max(self.text_sample_lengths)[0]
476
+ self.cu_seqlens_q = self.get_cu_seqlens(
477
+ [x[0] for x in self.text_sample_lengths], device=inputs_embeds.device
478
+ )
479
+
480
+ self.max_seqlen_kv = max(self.full_sample_lengths)
481
+ self.cu_seqlens_kv = self.get_cu_seqlens(
482
+ self.full_sample_lengths, device=inputs_embeds.device
483
+ )
484
+
485
+ # For inference: We save the length of the current document
486
+ # to trim the KV cache appropriately
487
+ self.current_doc_lengths = self.full_sample_lengths
488
+
489
+ # Precompute position embeddings
490
+ self.position_embeds = None
491
+ self.rope_fn = rope_fn
492
+ if self.rope_fn is not None:
493
+ self.position_embeds = self.compute_position_embeddings(
494
+ self.rope_fn, full_sample_lengths, dummy_for_dtype_and_device=full_inputs_embeds
495
+ )
496
+
497
+ @property
498
+ def batch_lengths(self) -> list[int]:
499
+ """Return a (batch_size,) list of integers containing the
500
+ number of (non-padded) text tokens for each sample in the batch"""
501
+ bls = [0]
502
+ for ln, eob in self.text_sample_lengths:
503
+ bls[-1] += ln
504
+ if eob:
505
+ bls.append(0)
506
+ return bls[:-1]
507
+
508
+ @property
509
+ def full_batch_lengths(self) -> list[int]:
510
+ """Same as batch_lengths for text+image tokens"""
511
+ bls = [0]
512
+ for (_, eob), ln in zip(self.text_sample_lengths, self.full_sample_lengths):
513
+ bls[-1] += ln
514
+ if eob:
515
+ bls.append(0)
516
+ return bls[:-1]
517
+
518
+ def get_cu_seqlens(
519
+ self, sample_lengths: list[int], device: torch.device | None
520
+ ) -> torch.Tensor:
521
+ """Update cu_seqlengths according to the given sample_lengths"""
522
+ return torch.Tensor(list(accumulate(sample_lengths, initial=0))).to(
523
+ dtype=torch.int32, device=device
524
+ )
525
+
526
+ def compute_position_embeddings(
527
+ self,
528
+ rope_fn: Callable,
529
+ sample_lengths: list[int],
530
+ dummy_for_dtype_and_device: torch.Tensor,
531
+ ) -> tuple[torch.Tensor, torch.Tensor]:
532
+ """Compute info required for position embeddings. Can be override e.g. for Qwen"""
533
+ # option 1: Standard range
534
+ # position_ids = torch.arange(0, full_inputs_embeds.shape[0])
535
+ # option 2: Follows document boundary
536
+ position_ids = torch.cat([torch.arange(0, lg) for lg in sample_lengths], dim=0)
537
+ return rope_fn(
538
+ dummy_for_dtype_and_device,
539
+ position_ids.to(dummy_for_dtype_and_device.device)[None, ...],
540
+ )
541
+
542
+ def get_position_embedding(
543
+ self,
544
+ key: Literal["q", "kv"],
545
+ num_queries: int = 0,
546
+ ) -> tuple[torch.Tensor, torch.Tensor] | None:
547
+ if self.position_embeds is None:
548
+ return None
549
+ cos, sin = self.position_embeds
550
+ bls = self.full_batch_lengths
551
+ # For Q, we only want the text-only posembeds
552
+ if key == "q" and self.use_asymetric_qkv:
553
+ bls = self.batch_lengths
554
+ cos, sin = cos[:, ~self.image_tokens_mask[:, 0]], sin[:, ~self.image_tokens_mask[:, 0]]
555
+ elif key not in {"q", "kv"}:
556
+ raise ValueError(f"Unknow for position embedding {key}")
557
+
558
+ # Easy case: training or first step at inference: we use all the posembeds
559
+ if num_queries == 0:
560
+ return cos, sin
561
+ # If num queries is given, we need to trim for *every sample in the batch*
562
+ cos = [x[:, -num_queries:] for x in torch.split(cos, bls, dim=1)]
563
+ sin = [x[:, -num_queries:] for x in torch.split(sin, bls, dim=1)]
564
+ return torch.cat(cos, dim=1), torch.cat(sin, dim=1)
565
+
566
+ def get_full_embeds(
567
+ self, hidden_states: torch.Tensor, norm_fn: Callable | None
568
+ ) -> torch.Tensor:
569
+ """Update attended hidden states in the current query buffer
570
+
571
+ :param hidden_states: (b, s, d) Tensor input to the CASA attention layer
572
+ """
573
+ assert self.image_embeds is not None
574
+ return insert_image_tokens(
575
+ inputs_embeds=hidden_states,
576
+ image_embeds=self.image_embeds
577
+ if norm_fn is None
578
+ else norm_fn(self.image_embeds)
579
+ if isinstance(self.image_embeds, torch.Tensor)
580
+ else [norm_fn(_x) for _x in self.image_embeds],
581
+ image_embeds_insertion_points=self.image_embeds_insertion_points,
582
+ attention_mask=self.attention_mask,
583
+ recover_batch_dim=False,
584
+ keep_only_attended=self.attention_mask is not None,
585
+ )[0][None, :, :]
586
+
587
+ def recover_text_embeds(
588
+ self,
589
+ hidden_states_out: torch.Tensor,
590
+ hidden_states_in: torch.Tensor,
591
+ update_image_embeddings: bool = False,
592
+ ) -> torch.Tensor:
593
+ """Returns text embeddings from the query buffer, including non-attended tokens at inference"""
594
+ if update_image_embeddings and not self.use_asymetric_qkv:
595
+ raise NotImplementedError("Implement image embeddings updates for asymetric QKV")
596
+ # Remove image tokens in the symmetric case
597
+ if not self.use_asymetric_qkv:
598
+ hidden_states_out = hidden_states_out[~self.image_tokens_mask[:, 0]]
599
+
600
+ # if there's no attention mask, we are in the right-padded case
601
+ # (keep_only_attended = False) we can directly return the query
602
+ # outputs (which don't contain the image)
603
+ if self.attention_mask is None:
604
+ return hidden_states_out
605
+
606
+ # Otherwise, we need to "scatter" back only the text-attended tokens to the original
607
+ # hidden states, which contain the paddings
608
+ num_queries = hidden_states_in.shape[1]
609
+
610
+ # Case 1: the padded hidden_states_in is larger than hidden_states_out
611
+ # we rebatch+pad hidden_state_out before doing the scattering
612
+ if hidden_states_out.shape[0] != hidden_states_in.shape[0] * hidden_states_in.shape[1]:
613
+ s = torch.split(hidden_states_out, self.batch_lengths, dim=0)
614
+ assert max(_s.shape[0] for _s in s) <= num_queries # sanity check
615
+ s = [
616
+ torch.nn.functional.pad(_s, (0, 0, num_queries - _s.shape[0], 0), value=0)
617
+ for _s in s
618
+ ]
619
+ return torch.where(
620
+ self.attention_mask[:, -num_queries:, None],
621
+ torch.stack(s),
622
+ hidden_states_in,
623
+ )
624
+ # If both have the same shape, it means hidden_states_in contained no padding
625
+ # so we can directly return hidden states out
626
+ return hidden_states_out
627
+
628
+ def extend(self, num_tokens: int, offset: int = 0):
629
+ """Extend all necessary values of the Handler for infenrece
630
+ Note: this implementation currently assumes a single conversation at a time
631
+ (otherwise image tokens mask would have to change) and that tokens added are
632
+ attended to"""
633
+ # image embeds are inserted in the first step and stored in the KV cache
634
+ self.image_embeds = None
635
+
636
+ # Update attention mask (non-flattened) (assumes all new tokens are attended to)
637
+ if self.attention_mask is not None:
638
+ self.attention_mask = torch.nn.functional.pad(
639
+ self.attention_mask, (0, num_tokens), value=1
640
+ )
641
+
642
+ # Update image token mask (assumes only one image/conversation
643
+ # is started at once so that we always extend by zero)
644
+ # Note that the mask is stored flattened to avoid padding so we have to
645
+ # do something a bit ugly and inefficient here
646
+ imtokmask = torch.split(self.image_tokens_mask, self.full_batch_lengths, dim=0)
647
+ imtokmask = [torch.nn.functional.pad(x, (0, 0, 0, num_tokens), value=0) for x in imtokmask]
648
+ self.image_tokens_mask = torch.cat(imtokmask, dim=0)
649
+
650
+ # Recompute cumulative document lengths after assigning the new
651
+ # number of tokens to each sample in the batch
652
+ for idx, (ln, is_eob) in enumerate(self.text_sample_lengths):
653
+ if is_eob:
654
+ self.text_sample_lengths[idx] = (num_tokens + ln, is_eob)
655
+ self.full_sample_lengths[idx] += num_tokens
656
+
657
+ # Recompute cu_seqlens
658
+ # First step: Technically this never occurs, but we keep it for completeness
659
+ if offset == 0:
660
+ self.max_seqlen_q = max(self.text_sample_lengths)[0]
661
+ self.cu_seqlens_q = self.get_cu_seqlens(
662
+ [x[0] for x in self.text_sample_lengths], device=self.cu_seqlens_q.device
663
+ )
664
+
665
+ self.max_seqlen_kv = max(self.full_sample_lengths)
666
+ self.cu_seqlens_kv = self.get_cu_seqlens(
667
+ self.full_sample_lengths, device=self.cu_seqlens_kv.device
668
+ )
669
+ # Step > 0: the annoying part is since flashattn_varlen does not accept
670
+ # 0-len documents, we need to remove documents from the KV Cache when they're past
671
+ # their windows. In our current setting, this means we only want to keep the latest
672
+ # documents
673
+ else:
674
+ self.max_seqlen_q = num_tokens
675
+ self.cu_seqlens_q = self.get_cu_seqlens(
676
+ [num_tokens for (_, eob) in self.text_sample_lengths if eob],
677
+ device=self.cu_seqlens_q.device,
678
+ )
679
+
680
+ final_doc_lengths = [
681
+ ln
682
+ for (_, eob), ln in zip(self.text_sample_lengths, self.full_sample_lengths)
683
+ if eob
684
+ ]
685
+ self.current_doc_lengths = final_doc_lengths
686
+ self.max_seqlen_kv = max(self.current_doc_lengths)
687
+ self.cu_seqlens_kv = self.get_cu_seqlens(
688
+ final_doc_lengths,
689
+ device=self.cu_seqlens_kv.device,
690
+ )
691
+ # Update position embeddings
692
+ if self.rope_fn is not None and self.position_embeds is not None:
693
+ self.position_embeds = self.compute_position_embeddings(
694
+ self.rope_fn,
695
+ self.full_sample_lengths,
696
+ dummy_for_dtype_and_device=self.position_embeds[0],
697
+ )
698
+
699
+
700
+ @dataclass
701
+ class CASAAttentionStreamingState(StreamingState):
702
+ """Streaming State for CASA Atention module. Keep the hidden"""
703
+
704
+ k: torch.Tensor = None # pyright: ignore[reportAssignmentType]
705
+ v: torch.Tensor = None # pyright: ignore[reportAssignmentType]
706
+ recover_batched_trims: list[int] = None # pyright: ignore[reportAssignmentType]
707
+ casa_handler: CASAAttentionHandler = None # pyright: ignore[reportAssignmentType]
708
+
709
+ def maybe_get_casa_handler(
710
+ self,
711
+ casa_handler: CASAAttentionHandler | None,
712
+ is_first_casa_layer: bool = False,
713
+ num_queries: int = -1,
714
+ ) -> CASAAttentionHandler | None:
715
+ # Set given Casa Handler the first time we reach this
716
+ if self.casa_handler is None:
717
+ self.casa_handler = casa_handler # pyright: ignore
718
+ # subsequent calls: we need to extend shapes to accommodate the new tokens
719
+ # however because CASA handler is shared across layers, we only need to do it once
720
+ if self.casa_handler is not None and self.offset > 0 and is_first_casa_layer:
721
+ # since CasaHandler is shared, we only use its extend step once
722
+ self.casa_handler.extend(num_queries, offset=self.offset)
723
+ return self.casa_handler
724
+
725
+ def __recover_batched_kv__(self, states: torch.Tensor) -> torch.Tensor:
726
+ """Recover batched key/value states with left padding"""
727
+ s = torch.split(states, self.casa_handler.full_batch_lengths, dim=1)
728
+ mlen = max(_s.shape[1] for _s in s)
729
+ # Remember the added padding so that we can re-flatten KV later
730
+ if self.recover_batched_trims is None:
731
+ self.recover_batched_trims = [mlen - _s.shape[1] for _s in s]
732
+ s = [torch.nn.functional.pad(_s, (0, 0, 0, 0, mlen - _s.shape[1], 0), value=0) for _s in s]
733
+ return torch.cat(s, dim=0)
734
+
735
+ def __get_flattened_kv__(
736
+ self, k: torch.Tensor | None = None, v: torch.Tensor | None = None
737
+ ) -> tuple[torch.Tensor, torch.Tensor]:
738
+ """
739
+ Flatten and remove padding for use with flash_attn_varlen_func
740
+ """
741
+ k = self.k if k is None else k
742
+ v = self.v if v is None else v
743
+ assert k is not None and v is not None
744
+
745
+ # Since every batch at least contributes one document,
746
+ # we can use this to check whether we are in streaming mode with dropped docs.
747
+ # If so, we should trim the kv cache accordingly
748
+ if len(self.casa_handler.current_doc_lengths) == len(k):
749
+ k = torch.cat(
750
+ [
751
+ _k[self.recover_batched_trims[idx] :][-doc_len:]
752
+ for idx, _k, doc_len in zip(
753
+ range(len(k)), k, self.casa_handler.current_doc_lengths
754
+ )
755
+ ]
756
+ )
757
+ v = torch.cat(
758
+ [
759
+ _v[self.recover_batched_trims[idx] :][-doc_len:]
760
+ for idx, _v, doc_len in zip(
761
+ range(len(k)), v, self.casa_handler.current_doc_lengths
762
+ )
763
+ ]
764
+ )
765
+ return k[None, ...], v[None, ...]
766
+
767
+ k = torch.cat([_k[self.recover_batched_trims[idx] :] for idx, _k in enumerate(k)])
768
+ v = torch.cat([_v[self.recover_batched_trims[idx] :] for idx, _v in enumerate(v)])
769
+ return k[None, ...], v[None, ...]
770
+
771
+ def extend_kv(
772
+ self, key_states: torch.Tensor, value_states: torch.Tensor
773
+ ) -> tuple[torch.Tensor, torch.Tensor]:
774
+ """
775
+ Extend the KV cache with the new key/value states and return it flattened (without padding).
776
+ """
777
+ assert self.casa_handler is not None
778
+ if self.k is None and self.v is None:
779
+ # Init with batch-padded key and value states
780
+ self.k = self.__recover_batched_kv__(key_states)
781
+ self.v = self.__recover_batched_kv__(value_states)
782
+ return self.__get_flattened_kv__()
783
+ if self.k is not None and self.v is not None:
784
+ # this is during generation; normally there is no padding at this stage
785
+ # so we can directly reshape the flattened key states
786
+ rshp = (self.k.shape[0], -1, self.k.shape[2], self.k.shape[3])
787
+ self.k = torch.cat([self.k, key_states.reshape(rshp)], dim=1)
788
+ self.v = torch.cat([self.v, value_states.reshape(rshp)], dim=1)
789
+ return self.__get_flattened_kv__()
790
+
791
+ raise ValueError("Impossible configuration (k and v updates are desynchronized )")
792
+
793
+
794
+ class CASAAttention(StreamingModule[CASAAttentionStreamingState]):
795
+ def __init__(
796
+ self,
797
+ config: "PretrainedConfig",
798
+ layer_idx: int | None,
799
+ self_attn: torch.nn.Module | None = None,
800
+ input_layernorm_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
801
+ ):
802
+ super().__init__(CASAAttentionStreamingState)
803
+ self.head_dim = config.head_dim
804
+ self.config = config
805
+
806
+ self.is_first_casa_layer = layer_idx == (min(config.xa_layers) if config.xa_layers else 0)
807
+ self.use_delta_w = config.casa_delta_w
808
+
809
+ self.q_proj_casa = self.init_from_config_proj("q", config)
810
+ self.k_proj_casa = self.init_from_config_proj("k", config)
811
+ self.v_proj_casa = self.init_from_config_proj("v", config)
812
+ self.o_proj_casa = self.init_from_config_proj("o", config)
813
+
814
+ # Delta_w
815
+ self.override_q_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
816
+ self.override_k_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
817
+ self.override_v_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
818
+ self.override_o_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
819
+
820
+ if config.casa_delta_w:
821
+ assert self_attn is not None
822
+ self.set_delta_w(self_attn)
823
+
824
+ # Layer norm
825
+ self.norm_fn: Callable | None = None
826
+ if config.xa_norm_on_images:
827
+ assert input_layernorm_fn is not None
828
+ self.norm_fn = input_layernorm_fn
829
+
830
+ def init_from_mha(self, self_attn: torch.nn.Module):
831
+ assert self_attn is not None
832
+ with torch.no_grad():
833
+ assert hasattr(self_attn, "q_proj")
834
+ for key in ["q", "k", "v", "o"]:
835
+ src = type_cast(torch.nn.Linear, getattr(self_attn, f"{key}_proj"))
836
+ tgt = type_cast(torch.nn.Linear, getattr(self, f"{key}_proj_casa"))
837
+ tgt.weight.copy_(src.weight)
838
+ if tgt.bias is not None and src.bias is not None:
839
+ tgt.bias.copy_(src.bias)
840
+
841
+ def set_delta_w(self, self_attn: torch.nn.Module):
842
+ """Delta w setup"""
843
+ self.override_q_proj = delta_w_factory(
844
+ self.q_proj_casa, type_cast(torch.nn.Linear, self_attn.q_proj)
845
+ )
846
+ self.override_k_proj = delta_w_factory(
847
+ self.k_proj_casa, type_cast(torch.nn.Linear, self_attn.k_proj)
848
+ )
849
+ self.override_v_proj = delta_w_factory(
850
+ self.v_proj_casa, type_cast(torch.nn.Linear, self_attn.v_proj)
851
+ )
852
+ self.override_o_proj = delta_w_factory(
853
+ self.o_proj_casa, type_cast(torch.nn.Linear, self_attn.o_proj)
854
+ )
855
+
856
+ with torch.no_grad():
857
+ torch.nn.init.zeros_(self.q_proj_casa.weight)
858
+ torch.nn.init.zeros_(self.k_proj_casa.weight)
859
+ torch.nn.init.zeros_(self.v_proj_casa.weight)
860
+ torch.nn.init.zeros_(self.o_proj_casa.weight)
861
+ if self.q_proj_casa.bias is not None:
862
+ torch.nn.init.zeros_(self.q_proj_casa.bias)
863
+ if self.k_proj_casa.bias is not None:
864
+ torch.nn.init.zeros_(self.k_proj_casa.bias)
865
+ if self.v_proj_casa.bias is not None:
866
+ torch.nn.init.zeros_(self.v_proj_casa.bias)
867
+ if self.o_proj_casa.bias is not None:
868
+ torch.nn.init.zeros_(self.o_proj_casa.bias)
869
+
870
+ def init_from_config_proj(
871
+ self, key: Literal["q", "o", "k", "v"], config: PretrainedConfig
872
+ ) -> torch.nn.Linear:
873
+ """Initialize the Linear proj in this module"""
874
+ raise NotImplementedError("Abastract class.")
875
+
876
+ def apply_position_embeddings(
877
+ self,
878
+ key: Literal["q", "kv"],
879
+ x: torch.Tensor, # (batch, seq_len, num_heads, head_dim)
880
+ casa_handler: CASAAttentionHandler | None,
881
+ num_queries: int = 0,
882
+ unsqueeze_dim: int = 1,
883
+ ) -> torch.Tensor: # (batch, seq_len, num_heads, head_dim)
884
+ """Apply position embeddings to query and key states"""
885
+ raise NotImplementedError("Abastract class.")
886
+
887
+ def forward(
888
+ self,
889
+ hidden_states: torch.Tensor,
890
+ casa_handler: CASAAttentionHandler | None,
891
+ ) -> torch.Tensor | None:
892
+ """Generic forward for CASA uses for instance in `helium1_attention`"""
893
+ og_dtype = hidden_states.dtype
894
+ if self.is_streaming:
895
+ casa_handler = self.streaming_state.maybe_get_casa_handler(
896
+ casa_handler,
897
+ is_first_casa_layer=self.is_first_casa_layer,
898
+ num_queries=hidden_states.shape[1],
899
+ )
900
+
901
+ # Case of text-only samples at training (or inference when no handler was cached)
902
+ # in this case we just skip CASA so we return None (no casa_update)
903
+ if casa_handler is None:
904
+ return None
905
+
906
+ if self.is_streaming:
907
+ assert casa_handler.use_asymetric_qkv, (
908
+ "You should set `use_asymetric_qkv` to True during inference"
909
+ )
910
+
911
+ og_shape = hidden_states.shape
912
+
913
+ # Build Q inputs
914
+ if casa_handler.use_asymetric_qkv:
915
+ q_inputs = hidden_states.flatten(0, 1)[None, ...]
916
+ if casa_handler.attention_mask is not None:
917
+ q_inputs = q_inputs[:, casa_handler.attention_mask[:, -og_shape[1] :].flatten()]
918
+ else:
919
+ q_inputs = casa_handler.get_full_embeds(hidden_states, norm_fn=self.norm_fn)
920
+
921
+ # Case 1: Training or first inference step
922
+ if not self.is_streaming or self.streaming_state.offset == 0:
923
+ kv_inputs = casa_handler.get_full_embeds(hidden_states, norm_fn=self.norm_fn)
924
+ else:
925
+ # during streaming, the KV cache including image embeddings
926
+ # will be inserted later so for now we only update the incoming queries
927
+ kv_inputs = q_inputs
928
+
929
+ # Compute QKV for the blockwise attention
930
+ bs, total_seq_len = kv_inputs.shape[:2]
931
+ hidden_shape_q = (bs, q_inputs.shape[1], -1, self.head_dim)
932
+ hidden_shape_kv = (bs, total_seq_len, -1, self.head_dim)
933
+
934
+ if self.override_q_proj is None:
935
+ query_states = self.q_proj_casa(q_inputs).view(*hidden_shape_q)
936
+ else:
937
+ query_states = self.override_q_proj(q_inputs).view(*hidden_shape_q)
938
+
939
+ if self.override_k_proj is None:
940
+ key_states = self.k_proj_casa(kv_inputs).view(*hidden_shape_kv)
941
+ else:
942
+ key_states = self.override_k_proj(kv_inputs).view(*hidden_shape_kv)
943
+
944
+ if self.override_v_proj is None:
945
+ value_states = self.v_proj_casa(kv_inputs).view(*hidden_shape_kv)
946
+ else:
947
+ value_states = self.override_v_proj(kv_inputs).view(*hidden_shape_kv)
948
+
949
+ # Apply position embedding at the right offset
950
+ num_queries = 0
951
+ if self.streaming and self.streaming_state.offset > 0:
952
+ num_queries = og_shape[1]
953
+
954
+ query_states = self.apply_position_embeddings(
955
+ "q", query_states, num_queries=num_queries, casa_handler=casa_handler
956
+ )
957
+ key_states = self.apply_position_embeddings(
958
+ "kv", key_states, num_queries=num_queries, casa_handler=casa_handler
959
+ )
960
+ assert flash_attn_varlen_func is not None, (
961
+ "flash_attention is not installed but required for block-wise attention"
962
+ )
963
+
964
+ # FlashAttention has a different efficient implementation for streaming
965
+ # In that case, the KV cache has to be batched and has been extended
966
+ # to accommodate the shape of the new updates
967
+ if self.is_streaming:
968
+ key_states, value_states = self.streaming_state.extend_kv(
969
+ key_states=key_states, value_states=value_states
970
+ )
971
+ if casa_handler.use_asymetric_qkv:
972
+ cu_seqlens_q = casa_handler.cu_seqlens_q
973
+ max_seqlen_q = casa_handler.max_seqlen_q
974
+ else:
975
+ cu_seqlens_q = casa_handler.cu_seqlens_kv
976
+ max_seqlen_q = casa_handler.max_seqlen_kv
977
+ assert cu_seqlens_q[-1] == query_states.shape[1], (
978
+ f"{cu_seqlens_q[-1]} != {query_states.shape[1]}"
979
+ )
980
+ assert casa_handler.cu_seqlens_kv[-1] == key_states.shape[1], (
981
+ f"{casa_handler.cu_seqlens_kv[-1]} != {key_states.shape[1]}"
982
+ )
983
+ # Blockwise (varlen) attention over the CASA windows
984
+ attn_output: torch.Tensor = flash_attn_varlen_func(
985
+ query_states[0].to(torch.bfloat16),
986
+ key_states[0].to(torch.bfloat16),
987
+ value_states[0].to(torch.bfloat16),
988
+ cu_seqlens_q=cu_seqlens_q,
989
+ cu_seqlens_k=casa_handler.cu_seqlens_kv,
990
+ max_seqlen_q=max_seqlen_q,
991
+ max_seqlen_k=casa_handler.max_seqlen_kv,
992
+ dropout_p=0.0,
993
+ # softmax_scale=None, # defaults to 1/sqrt(d)
994
+ causal=True,
995
+ ).to(og_dtype)
996
+
997
+ attn_output = attn_output.reshape(hidden_shape_q[1], -1).contiguous()
998
+ if self.override_o_proj is None:
999
+ attn_output = self.o_proj_casa(attn_output)
1000
+ else:
1001
+ attn_output = self.override_o_proj(attn_output)
1002
+
1003
+ attn_output = casa_handler.recover_text_embeds(
1004
+ attn_output, hidden_states, update_image_embeddings=self.config.xa_update_image_embeds
1005
+ )
1006
+ attn_output = attn_output.reshape(og_shape)
1007
+
1008
+ if self.is_streaming:
1009
+ self.streaming_state.offset += attn_output.shape[1]
1010
+ return attn_output
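For intuition, here is a small self-contained sketch of the cu_seqlens bookkeeping that `CASAAttentionHandler` feeds to `flash_attn_varlen_func` (hypothetical window lengths; it mirrors `get_cu_seqlens` rather than calling the classes above):

```python
# Illustrative only: per-window prefix sums, as used for blockwise (varlen) attention.
from itertools import accumulate

import torch

text_sample_lengths = [12, 7, 20]        # query tokens (text only) per CASA window
image_tokens_per_window = [64, 0, 128]   # image tokens inserted into each window
full_sample_lengths = [t + i for t, i in zip(text_sample_lengths, image_tokens_per_window)]

cu_seqlens_q = torch.tensor(list(accumulate(text_sample_lengths, initial=0)), dtype=torch.int32)
cu_seqlens_kv = torch.tensor(list(accumulate(full_sample_lengths, initial=0)), dtype=torch.int32)

print(cu_seqlens_q.tolist())   # [0, 12, 19, 39]
print(cu_seqlens_kv.tolist())  # [0, 76, 83, 231]
# flash_attn_varlen_func then restricts each window's queries to that window's
# keys/values, with max_seqlen_q = max(text_sample_lengths) and
# max_seqlen_k = max(full_sample_lengths).
```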
config.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "attention_dropout": 0.0,
3
+ "auto_map": {
4
+ "AutoConfig": "configuration_qwen2_5vl_casa.Qwen2_5_VLCASAConfig",
5
+ "AutoModel": "modeling_qwen2_5vl_casa.V2Qwen2_5VL"
6
+ },
7
+ "bos_token_id": 151643,
8
+ "casa_attention": true,
9
+ "casa_delta_w": true,
10
+ "casa_use_asymetric_qkv": true,
11
+ "casa_windows": "images",
12
+ "eos_token_id": 151645,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "image_token_id": 151655,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 11008,
19
+ "max_position_embeddings": 128000,
20
+ "max_window_layers": 70,
21
+ "model_type": "CASA_Qwen_2_5_VL_3B_LiveCC",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 36,
24
+ "num_key_value_heads": 2,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_scaling": {
27
+ "mrope_section": [
28
+ 16,
29
+ 24,
30
+ 24
31
+ ],
32
+ "rope_type": "default",
33
+ "type": "default"
34
+ },
35
+ "rope_theta": 1000000.0,
36
+ "sliding_window": 32768,
37
+ "tie_word_embeddings": true,
38
+ "torch_dtype": "bfloat16",
39
+ "transformers_version": "4.51.3",
40
+ "use_cache": true,
41
+ "use_sliding_window": false,
42
+ "video_token_id": 151656,
43
+ "vision_config": {
44
+ "depth": 32,
45
+ "fullatt_block_indexes": [
46
+ 7,
47
+ 15,
48
+ 23,
49
+ 31
50
+ ],
51
+ "hidden_act": "silu",
52
+ "hidden_size": 1280,
53
+ "image_mean": [
54
+ 0.48145466,
55
+ 0.4578275,
56
+ 0.40821073
57
+ ],
58
+ "image_std": [
59
+ 0.26862954,
60
+ 0.26130258,
61
+ 0.27577711
62
+ ],
63
+ "in_channels": 3,
64
+ "in_chans": 3,
65
+ "intermediate_size": 3420,
66
+ "model_type": "qwen2_5_vl",
67
+ "num_heads": 16,
68
+ "out_dim": 2048,
69
+ "out_hidden_size": 2048,
70
+ "patch_size": 14,
71
+ "spatial_merge_size": 2,
72
+ "spatial_patch_size": 14,
73
+ "temporal_patch_size": 1,
74
+ "tokens_per_second": 2,
75
+ "window_size": 112
76
+ },
77
+ "vision_end_token_id": 151653,
78
+ "vision_start_token_id": 151652,
79
+ "vision_token_id": 151654,
80
+ "vocab_size": 151936,
81
+ "xa_layers": [
82
+ 0,
83
+ 1,
84
+ 2,
85
+ 3,
86
+ 4,
87
+ 5,
88
+ 6,
89
+ 7,
90
+ 8,
91
+ 9,
92
+ 10,
93
+ 11,
94
+ 12,
95
+ 13,
96
+ 14,
97
+ 15,
98
+ 16,
99
+ 17,
100
+ 18,
101
+ 19,
102
+ 20,
103
+ 21,
104
+ 22,
105
+ 23,
106
+ 24,
107
+ 25,
108
+ 26,
109
+ 27,
110
+ 28,
111
+ 29,
112
+ 30,
113
+ 31,
114
+ 32,
115
+ 33,
116
+ 34,
117
+ 35
118
+ ],
119
+ "xa_norm_on_images": true,
120
+ "xa_order": "parallel",
121
+ "xa_update_image_embeds": false
122
+ }
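A quick way to sanity-check the CASA-specific fields above (a sketch; assumes `config.json` has been downloaded to the working directory):

```python
# Inspect the CASA / cross-attention fields of this config with the standard library only.
import json

with open("config.json") as f:
    cfg = json.load(f)

casa_fields = {k: v for k, v in cfg.items() if k.startswith(("casa_", "xa_"))}
print(casa_fields["casa_windows"], casa_fields["casa_delta_w"])   # "images", True
print(len(casa_fields["xa_layers"]), cfg["num_hidden_layers"])    # 36, 36 -> CASA on every decoder layer
```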
configuration_qwen2_5vl_casa.py ADDED
@@ -0,0 +1,36 @@
1
+ from typing import Any, Literal
2
+
3
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
4
+
5
+
6
+ class Qwen2_5_VLCASAConfig(Qwen2_5_VLConfig):
7
+ """Qwen config augmented with CASA options"""
8
+
9
+ model_type = "qwen2_5vl_casa"
10
+
11
+ def __init__(
12
+ self,
13
+ *args: Any,
14
+ # Common to all fusion mechanisms
15
+ xa_layers: None | tuple = None,
16
+ xa_order: Literal["ca_first", "parallel", "instead"] = "ca_first",
17
+ xa_norm_on_images: bool = False,
18
+ xa_update_image_embeds: bool = False,
19
+ # CASA
20
+ casa_attention: bool = False,
21
+ casa_delta_w: bool = False,
22
+ casa_windows: Literal["batch", "squashed", "images", "turn_based"] = "batch",
23
+ casa_use_asymetric_qkv: bool = True,
24
+ **kwargs: Any,
25
+ ):
26
+ super().__init__(*args, **kwargs)
27
+ self.head_dim = self.hidden_size // self.num_attention_heads
28
+ self.xa_layers = xa_layers
29
+ self.xa_order: Literal["ca_first", "parallel", "instead"] = xa_order
30
+ self.xa_norm_on_images = xa_norm_on_images
31
+ self.xa_update_image_embeds = xa_update_image_embeds
32
+ # CASA config
33
+ self.casa_attention = casa_attention
34
+ self.casa_delta_w = casa_delta_w
35
+ self.casa_windows: Literal["batch", "squashed", "images", "turn_based"] = casa_windows
36
+ self.casa_use_asymetric_qkv = casa_use_asymetric_qkv
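A minimal construction sketch for the config class above (assumes the file is importable from a local checkout, with a transformers version that still exposes the text fields directly on `Qwen2_5_VLConfig`, e.g. 4.51.3 as pinned in `config.json`):

```python
# Build a CASA config with explicit CASA options; remaining fields fall back to Qwen2.5-VL defaults.
from configuration_qwen2_5vl_casa import Qwen2_5_VLCASAConfig

config = Qwen2_5_VLCASAConfig(
    casa_attention=True,
    casa_delta_w=True,
    casa_windows="images",
    xa_layers=tuple(range(36)),   # CASA layers on every decoder block, as in config.json
    xa_order="parallel",
    xa_norm_on_images=True,
)
print(config.model_type, config.head_dim)  # "qwen2_5vl_casa", hidden_size // num_attention_heads
```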
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 715,
5
+ "transformers_version": "4.51.3"
6
+ }
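Note that the generation-level `eos_token_id` (715) differs from the model-level one (151645) in `config.json`; the shipped defaults can be inspected with the standard API (repo id assumed):

```python
# Load the generation defaults shipped with this repo.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("kyutai/CASA-Qwen2_5-VL-3B-LiveCC")  # assumed repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)  # 151643, 715
```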
image_encoder.py ADDED
@@ -0,0 +1,57 @@
1
+ """Qwen2.5VL encoder with delayed normalization"""
2
+
3
+ import torch
4
+ from einops import rearrange
5
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
6
+ Qwen2_5_VisionTransformerPretrainedModel,
7
+ )
8
+
9
+
10
+ def prepare_for_qwen_encoder(
11
+ x: torch.Tensor | list[torch.Tensor], mean: torch.Tensor, std: torch.Tensor
12
+ ) -> tuple[torch.Tensor, torch.Tensor]:
13
+ """
14
+ Preprocessing for Qwen encoder
15
+ Image mean and std come from processor.image_processor.image_mean and image_std
16
+ """
17
+ grid_thw = torch.Tensor([[1, img.shape[0], img.shape[1]] for img in x]).to(x[0].device)
18
+ hws_flatten_shape = torch.prod(grid_thw, dim=-1)
19
+ x = torch.cat(
20
+ [img.reshape((int(hws_flatten_shape[idx].item()), -1)) for idx, img in enumerate(x)],
21
+ dim=0,
22
+ )
23
+ assert x.min() >= 0.0 and x.max() <= 1.0
24
+ og_shape = x.shape
25
+ x = rearrange(x, "L (c d) -> L c d", c=3)
26
+ x = (x - mean) / std
27
+ x = x.view(og_shape).to(torch.bfloat16)
28
+ return x, grid_thw
29
+
30
+
31
+ class Qwen25VLEncoder(torch.nn.Module):
32
+ """Qwen2.5 VL encoder with pre/post processing to be compatible for
33
+ our CASA attention implementation"""
34
+
35
+ def __init__(
36
+ self,
37
+ visual: "Qwen2_5_VisionTransformerPretrainedModel",
38
+ ):
39
+ super().__init__()
40
+ self.visual = visual
41
+ self.image_mean = torch.tensor(self.visual.config.image_mean).view(1, 3, 1)
42
+ self.image_std = torch.tensor(self.visual.config.image_std).view(1, 3, 1)
43
+
44
+ def forward(
45
+ self, x: torch.Tensor | list[torch.Tensor]
46
+ ) -> dict[str, torch.Tensor | list[torch.Tensor]]:
47
+ x, grid_thw = prepare_for_qwen_encoder(
48
+ x, mean=self.image_mean.to(x[0].device), std=self.image_std.to(x[0].device)
49
+ )
50
+
51
+ grid_thw = grid_thw.type(torch.int)
52
+ assert len(x) == grid_thw.prod(dim=1).sum()
53
+ out = self.visual(x, grid_thw=grid_thw)
54
+
55
+ split_sizes = (grid_thw.prod(dim=-1) // self.visual.spatial_merge_size**2).tolist()
56
+ embeds = list(torch.split(out, split_sizes, dim=0)) # Ni * (seq, C)
57
+ return {"image_embeds": embeds, "grid_thw": grid_thw}
language_qwen2_5vl_casa.py ADDED
@@ -0,0 +1,276 @@
1
+ from functools import partial
2
+ from typing import Any, Callable, Literal, Optional
3
+
4
+ import torch
5
+ from transformers.cache_utils import Cache
6
+ from transformers.configuration_utils import PretrainedConfig
7
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
8
+ Qwen2_5_VLDecoderLayer,
9
+ Qwen2_5_VLFlashAttention2,
10
+ rotate_half,
11
+ )
12
+
13
+ from .casa_attention import CASAAttention, CASAAttentionHandler
14
+ from .configuration_qwen2_5vl_casa import Qwen2_5_VLCASAConfig
15
+
16
+
17
+ class QwenCASAAttentionHandler(CASAAttentionHandler):
18
+ """Overrides CASAAttention with the right pos embedding computation for Qwen"""
19
+
20
+ def __init__(
21
+ self,
22
+ *args: Any,
23
+ get_rope_index: Callable | None = None,
24
+ grid_thw: torch.Tensor | None = None,
25
+ position_ids_offset: int = 0,
26
+ **kwargs: Any,
27
+ ):
28
+ assert get_rope_index is not None, "get_rope_index should be given for QwenCASA"
29
+ self.get_rope_index = partial(get_rope_index, image_grid_thw=grid_thw)
30
+ self.position_ids_offset = position_ids_offset
31
+ super().__init__(*args, **kwargs)
32
+
33
+ def compute_position_embeddings(
34
+ self,
35
+ rope_fn: Callable,
36
+ sample_lengths: list[int],
37
+ dummy_for_dtype_and_device: torch.Tensor,
38
+ ) -> tuple[torch.Tensor, torch.Tensor]:
39
+ """Compute info required for position embeddings. Can be overriden e.g. for Qwen"""
40
+ # Here rope_fn is the "get_rope_index" function from the original mode
41
+ dummy_input_ids = torch.zeros(
42
+ (int(sum(sample_lengths)),), device=dummy_for_dtype_and_device.device, dtype=torch.long
43
+ )
44
+ # Set image token ids
45
+ dummy_input_ids[self.image_tokens_mask[:, 0]] = 151655
46
+
47
+ # required for the weird start of image tokens
48
+ # Highly recommended to use pre and post image tokens with Qwen
49
+ # Add vision start token ids (wherever a 151655 follows a 0)
50
+ start_of_images = torch.logical_and(
51
+ dummy_input_ids == 0,
52
+ torch.nn.functional.pad(dummy_input_ids[1:] == 151655, (0, 1), value=0),
53
+ )
54
+ dummy_input_ids[start_of_images] = 151652
55
+
56
+ # rebatch dummy input ids
57
+ padding_side = "left" if self.attention_mask is not None else "right"
58
+ s = list(torch.split(dummy_input_ids, self.full_batch_lengths))
59
+ mlen = max(_s.shape[0] for _s in s)
60
+ trims = [mlen - _s.shape[0] for _s in s]
61
+ dummy_input_ids = torch.stack(
62
+ [
63
+ torch.nn.functional.pad(
64
+ _s,
65
+ (
66
+ trims[i] if padding_side == "left" else 0,
67
+ trims[i] if padding_side == "right" else 0,
68
+ ),
69
+ value=-1,
70
+ )
71
+ for i, _s in enumerate(s)
72
+ ],
73
+ dim=0,
74
+ )
75
+
76
+ # We need to pass the attention mask to get_rope_index when using left padding
77
+ attention_mask = torch.ones_like(dummy_input_ids)
78
+ for i, t in enumerate(trims):
79
+ if padding_side == "right":
80
+ attention_mask[i, attention_mask.shape[-1] - t :] = 0
81
+ else:
82
+ attention_mask[i, :t] = 0
83
+
84
+ # compute pos embeds shape (3, bs, seq)
85
+ position_ids = (
86
+ self.get_rope_index(dummy_input_ids, attention_mask=attention_mask)[0]
87
+ + self.position_ids_offset
88
+ )
89
+
90
+ # Compute position embeddings and recover the flattened, unpadded shape
91
+ cos, sin = rope_fn(dummy_for_dtype_and_device, position_ids)
92
+ # reflatten seq
93
+ if padding_side == "right":
94
+ cos = torch.cat(
95
+ [cos[:, i : i + 1, : cos.shape[2] - t, :] for i, t in enumerate(trims)], dim=2
96
+ )
97
+ sin = torch.cat(
98
+ [sin[:, i : i + 1, : sin.shape[2] - t, :] for i, t in enumerate(trims)], dim=2
99
+ )
100
+ else:
101
+ cos = torch.cat([cos[:, i : i + 1, t:, :] for i, t in enumerate(trims)], dim=2)
102
+ sin = torch.cat([sin[:, i : i + 1, t:, :] for i, t in enumerate(trims)], dim=2)
103
+ return cos, sin
104
+
105
+ def get_position_embedding(
106
+ self,
107
+ key: Literal["q", "kv"],
108
+ num_queries: int = 0,
109
+ ) -> tuple[torch.Tensor, torch.Tensor] | None:
110
+ if self.position_embeds is None:
111
+ return None
112
+ cos, sin = self.position_embeds
113
+ # For Q, we only want the text-only posembeds
114
+ if key == "q":
115
+ cos, sin = (
116
+ cos[:, :, ~self.image_tokens_mask[:, 0]],
117
+ sin[:, :, ~self.image_tokens_mask[:, 0]],
118
+ )
119
+ elif key != "kv":
120
+ raise ValueError(f"Unknown key for position embedding {key}")
121
+
122
+ # Easy case (training, or the first inference step): use all the position embeddings
123
+ if num_queries == 0:
124
+ return cos, sin
125
+ # If num queries is given, we need to trim for *every sample in the batch*
126
+ bls = self.full_batch_lengths if key == "kv" else self.batch_lengths
127
+ cos = [x[:, :, -num_queries:] for x in torch.split(cos, bls, dim=2)]
128
+ sin = [x[:, :, -num_queries:] for x in torch.split(sin, bls, dim=2)]
129
+ return torch.cat(cos, dim=2), torch.cat(sin, dim=2)
130
+
131
+
132
+ class QwenCASAAttention(CASAAttention):
133
+ """A CASA Attention layer compatible with Qwen"""
134
+
135
+ def __init__(
136
+ self,
137
+ config: Qwen2_5_VLCASAConfig,
138
+ layer_idx: int | None,
139
+ self_attn: torch.nn.Module | None = None,
140
+ input_layernorm_fn: Callable | None = None,
141
+ ):
142
+ # This init is only added for typing purposes on the config
143
+ super().__init__(config, layer_idx, self_attn, input_layernorm_fn) # pyright: ignore[reportArgumentType]
144
+ assert config.rope_scaling is not None
145
+ self.mrope_section = config.rope_scaling["mrope_section"] * 2
146
+
147
+ def apply_position_embeddings(
148
+ self,
149
+ key: Literal["q", "kv"],
150
+ x: torch.Tensor, # (batch, seq_len, num_heads, head_dim)
151
+ casa_handler: CASAAttentionHandler | None,
152
+ num_queries: int = 0,
153
+ unsqueeze_dim: int = 1,
154
+ ) -> torch.Tensor: # (batch, seq_len, num_heads, head_dim)
155
+ """Apply position embeddings to query and key states"""
156
+ if casa_handler is not None:
157
+ posemb = casa_handler.get_position_embedding(key, num_queries=num_queries)
158
+
159
+ if posemb is not None:
160
+ x = x.transpose(1, 2).to(torch.float32)
161
+ cos, sin = posemb
162
+ cos = torch.cat(
163
+ [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], dim=-1
164
+ ).unsqueeze(unsqueeze_dim)
165
+
166
+ sin = torch.cat(
167
+ [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], dim=-1
168
+ ).unsqueeze(unsqueeze_dim)
169
+
170
+ x = (x * cos) + (rotate_half(x) * sin)
171
+ return x.transpose(1, 2)
172
+ return x
173
+
174
+ def init_from_config_proj(
175
+ self, key: Literal["q", "o", "k", "v"], config: PretrainedConfig
176
+ ) -> torch.nn.Linear:
177
+ """Follows modeling_qwen2_5_vl.py initialization"""
178
+ head_dim = config.hidden_size // config.num_attention_heads
179
+ if key == "q":
180
+ return torch.nn.Linear(
181
+ config.hidden_size, config.num_attention_heads * head_dim, bias=True
182
+ )
183
+ if key in {"k", "v"}:
184
+ return torch.nn.Linear(
185
+ config.hidden_size, config.num_key_value_heads * head_dim, bias=True
186
+ )
187
+ if key == "o":
188
+ return torch.nn.Linear(
189
+ config.num_attention_heads * config.head_dim, config.hidden_size, bias=False
190
+ )
191
+ raise NotImplementedError(f"Unknown key {key}")
192
+
193
+
194
+ class Qwen2_5_VLAttention_CASA(Qwen2_5_VLFlashAttention2):
195
+ """
196
+ Qwen Attention with extra CASA Attention layer
197
+ """
198
+
199
+ def __init__(
200
+ self,
201
+ config: Qwen2_5_VLCASAConfig,
202
+ layer_idx: Optional[int] = None,
203
+ input_layernorm: torch.nn.Module | None = None,
204
+ ):
205
+ super().__init__(config, layer_idx) # pyright: ignore[reportArgumentType]
206
+ self.casa_attn = QwenCASAAttention(
207
+ config,
208
+ layer_idx=layer_idx,
209
+ self_attn=self,
210
+ input_layernorm_fn=input_layernorm.forward if input_layernorm is not None else None,
211
+ )
212
+ self.casa_attention_handler: CASAAttentionHandler | None = None
213
+
214
+ @classmethod
215
+ def from_qwen2_5_vl_attention(
216
+ cls, attention: Qwen2_5_VLFlashAttention2, input_layernorm: torch.nn.Module | None
217
+ ):
218
+ """Init this layer from Qwen Attention layer"""
219
+ layer_idx = attention.layer_idx
220
+ assert layer_idx is not None
221
+ new_attention = cls(attention.config, layer_idx=layer_idx, input_layernorm=input_layernorm) # pyright: ignore
222
+ new_attention.load_state_dict(attention.state_dict(), strict=False)
223
+ return new_attention
224
+
225
+ def forward( # pyright: ignore[reportIncompatibleMethodOverride]
226
+ self,
227
+ hidden_states: torch.Tensor,
228
+ attention_mask: Optional[torch.Tensor] = None,
229
+ position_ids: Optional[torch.LongTensor] = None,
230
+ past_key_value: Optional[Cache] = None,
231
+ output_attentions: bool = False,
232
+ use_cache: bool = False,
233
+ cache_position: Optional[torch.LongTensor] = None,
234
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
235
+ ):
236
+ casa_out: None | torch.Tensor = None
237
+ if self.casa_attn is not None and self.config.xa_order in {
238
+ "parallel",
239
+ "ca_first",
240
+ "instead",
241
+ }:
242
+ casa_out = self.casa_attn(
243
+ hidden_states=hidden_states,
244
+ casa_handler=self.casa_attention_handler,
245
+ )
246
+
247
+ if self.config.xa_order == "instead":
248
+ return casa_out, None, None
249
+
250
+ if self.config.xa_order == "ca_first" and casa_out is not None:
251
+ hidden_states, casa_out = casa_out, None
252
+
253
+ attn_output, attn_weights, past_key_values = super().forward(
254
+ hidden_states,
255
+ attention_mask,
256
+ position_ids,
257
+ past_key_value,
258
+ output_attentions,
259
+ use_cache,
260
+ cache_position,
261
+ position_embeddings,
262
+ )
263
+ if self.config.xa_order == "parallel" and casa_out is not None:
264
+ attn_output = casa_out + attn_output
265
+ return attn_output, attn_weights, past_key_values
266
+
267
+
268
+ def add_casa_layers(m: torch.nn.Module, xa_layers: tuple[int, ...] | None):
269
+ """Replace Attention layer by CASA Attention layer as needed"""
270
+ if isinstance(m, Qwen2_5_VLDecoderLayer):
271
+ layer_idx = m.self_attn.layer_idx
272
+ assert layer_idx is not None
273
+ if xa_layers is None or len(xa_layers) == 0 or layer_idx in xa_layers:
274
+ m.self_attn = Qwen2_5_VLAttention_CASA.from_qwen2_5_vl_attention(
275
+ m.self_attn, input_layernorm=m.input_layernorm
276
+ )
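A minimal usage sketch of `add_casa_layers` (hypothetical: the checkpoint name and layer indices are placeholders, and it assumes a transformers release that ships Qwen2.5-VL and a config that already carries the extra CASA fields such as `xa_order`):

```python
from functools import partial

from transformers import Qwen2_5_VLForConditionalGeneration

# Hypothetical checkpoint; any Qwen2.5-VL model whose config includes the CASA fields
model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Swap the self-attention of a subset of decoder layers (arbitrary example indices)
# for Qwen2_5_VLAttention_CASA; xa_layers=None or () would convert every layer
model.apply(partial(add_casa_layers, xa_layers=(0, 8, 16, 24)))
```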
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:959dc60fd4e5974349a1e23b79d33edd765bbcb911ea29ae594e4ba1d2872188
3
+ size 4961226720
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab471816213f5a9cf2b2214de75a7e1d17d9b9371d75e12a8481066db458f18
3
+ size 4993905448
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32abd001c7514589845599d3cc64d739bf9c57acb548c4099cbeab7be9e57f5a
3
+ size 4994485840
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8943ad7ef101666fb28d9b0ac5b27cd1c8381c2d755ce0ddac1f9c6dea33de2d
3
+ size 1425314512
model.safetensors.index.json ADDED
@@ -0,0 +1,1083 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16374804480
4
+ },
5
+ "weight_map": {
6
+ "image_prefix.visual.blocks.0.attn.proj.bias": "model-00003-of-00004.safetensors",
7
+ "image_prefix.visual.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
8
+ "image_prefix.visual.blocks.0.attn.qkv.bias": "model-00003-of-00004.safetensors",
9
+ "image_prefix.visual.blocks.0.attn.qkv.weight": "model-00003-of-00004.safetensors",
10
+ "image_prefix.visual.blocks.0.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
11
+ "image_prefix.visual.blocks.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
12
+ "image_prefix.visual.blocks.0.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
13
+ "image_prefix.visual.blocks.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
14
+ "image_prefix.visual.blocks.0.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
15
+ "image_prefix.visual.blocks.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
16
+ "image_prefix.visual.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
17
+ "image_prefix.visual.blocks.0.norm2.weight": "model-00003-of-00004.safetensors",
18
+ "image_prefix.visual.blocks.1.attn.proj.bias": "model-00003-of-00004.safetensors",
19
+ "image_prefix.visual.blocks.1.attn.proj.weight": "model-00003-of-00004.safetensors",
20
+ "image_prefix.visual.blocks.1.attn.qkv.bias": "model-00003-of-00004.safetensors",
21
+ "image_prefix.visual.blocks.1.attn.qkv.weight": "model-00003-of-00004.safetensors",
22
+ "image_prefix.visual.blocks.1.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
23
+ "image_prefix.visual.blocks.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
24
+ "image_prefix.visual.blocks.1.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
25
+ "image_prefix.visual.blocks.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
26
+ "image_prefix.visual.blocks.1.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
27
+ "image_prefix.visual.blocks.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
28
+ "image_prefix.visual.blocks.1.norm1.weight": "model-00003-of-00004.safetensors",
29
+ "image_prefix.visual.blocks.1.norm2.weight": "model-00003-of-00004.safetensors",
30
+ "image_prefix.visual.blocks.10.attn.proj.bias": "model-00003-of-00004.safetensors",
31
+ "image_prefix.visual.blocks.10.attn.proj.weight": "model-00003-of-00004.safetensors",
32
+ "image_prefix.visual.blocks.10.attn.qkv.bias": "model-00003-of-00004.safetensors",
33
+ "image_prefix.visual.blocks.10.attn.qkv.weight": "model-00003-of-00004.safetensors",
34
+ "image_prefix.visual.blocks.10.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
35
+ "image_prefix.visual.blocks.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
36
+ "image_prefix.visual.blocks.10.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
37
+ "image_prefix.visual.blocks.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
38
+ "image_prefix.visual.blocks.10.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
39
+ "image_prefix.visual.blocks.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
40
+ "image_prefix.visual.blocks.10.norm1.weight": "model-00003-of-00004.safetensors",
41
+ "image_prefix.visual.blocks.10.norm2.weight": "model-00003-of-00004.safetensors",
42
+ "image_prefix.visual.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
43
+ "image_prefix.visual.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
44
+ "image_prefix.visual.blocks.11.attn.qkv.bias": "model-00003-of-00004.safetensors",
45
+ "image_prefix.visual.blocks.11.attn.qkv.weight": "model-00003-of-00004.safetensors",
46
+ "image_prefix.visual.blocks.11.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
47
+ "image_prefix.visual.blocks.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
48
+ "image_prefix.visual.blocks.11.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
49
+ "image_prefix.visual.blocks.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
50
+ "image_prefix.visual.blocks.11.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
51
+ "image_prefix.visual.blocks.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
52
+ "image_prefix.visual.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
53
+ "image_prefix.visual.blocks.11.norm2.weight": "model-00003-of-00004.safetensors",
54
+ "image_prefix.visual.blocks.12.attn.proj.bias": "model-00003-of-00004.safetensors",
55
+ "image_prefix.visual.blocks.12.attn.proj.weight": "model-00003-of-00004.safetensors",
56
+ "image_prefix.visual.blocks.12.attn.qkv.bias": "model-00003-of-00004.safetensors",
57
+ "image_prefix.visual.blocks.12.attn.qkv.weight": "model-00003-of-00004.safetensors",
58
+ "image_prefix.visual.blocks.12.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
59
+ "image_prefix.visual.blocks.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
60
+ "image_prefix.visual.blocks.12.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
61
+ "image_prefix.visual.blocks.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
62
+ "image_prefix.visual.blocks.12.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
63
+ "image_prefix.visual.blocks.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
64
+ "image_prefix.visual.blocks.12.norm1.weight": "model-00003-of-00004.safetensors",
65
+ "image_prefix.visual.blocks.12.norm2.weight": "model-00003-of-00004.safetensors",
66
+ "image_prefix.visual.blocks.13.attn.proj.bias": "model-00003-of-00004.safetensors",
67
+ "image_prefix.visual.blocks.13.attn.proj.weight": "model-00003-of-00004.safetensors",
68
+ "image_prefix.visual.blocks.13.attn.qkv.bias": "model-00003-of-00004.safetensors",
69
+ "image_prefix.visual.blocks.13.attn.qkv.weight": "model-00003-of-00004.safetensors",
70
+ "image_prefix.visual.blocks.13.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
71
+ "image_prefix.visual.blocks.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
72
+ "image_prefix.visual.blocks.13.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
73
+ "image_prefix.visual.blocks.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
74
+ "image_prefix.visual.blocks.13.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
75
+ "image_prefix.visual.blocks.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
76
+ "image_prefix.visual.blocks.13.norm1.weight": "model-00003-of-00004.safetensors",
77
+ "image_prefix.visual.blocks.13.norm2.weight": "model-00003-of-00004.safetensors",
78
+ "image_prefix.visual.blocks.14.attn.proj.bias": "model-00003-of-00004.safetensors",
79
+ "image_prefix.visual.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
80
+ "image_prefix.visual.blocks.14.attn.qkv.bias": "model-00003-of-00004.safetensors",
81
+ "image_prefix.visual.blocks.14.attn.qkv.weight": "model-00003-of-00004.safetensors",
82
+ "image_prefix.visual.blocks.14.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
83
+ "image_prefix.visual.blocks.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
84
+ "image_prefix.visual.blocks.14.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
85
+ "image_prefix.visual.blocks.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
86
+ "image_prefix.visual.blocks.14.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
87
+ "image_prefix.visual.blocks.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
88
+ "image_prefix.visual.blocks.14.norm1.weight": "model-00003-of-00004.safetensors",
89
+ "image_prefix.visual.blocks.14.norm2.weight": "model-00003-of-00004.safetensors",
90
+ "image_prefix.visual.blocks.15.attn.proj.bias": "model-00003-of-00004.safetensors",
91
+ "image_prefix.visual.blocks.15.attn.proj.weight": "model-00003-of-00004.safetensors",
92
+ "image_prefix.visual.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
93
+ "image_prefix.visual.blocks.15.attn.qkv.weight": "model-00003-of-00004.safetensors",
94
+ "image_prefix.visual.blocks.15.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
95
+ "image_prefix.visual.blocks.15.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
96
+ "image_prefix.visual.blocks.15.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
97
+ "image_prefix.visual.blocks.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
98
+ "image_prefix.visual.blocks.15.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
99
+ "image_prefix.visual.blocks.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
100
+ "image_prefix.visual.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
101
+ "image_prefix.visual.blocks.15.norm2.weight": "model-00003-of-00004.safetensors",
102
+ "image_prefix.visual.blocks.16.attn.proj.bias": "model-00004-of-00004.safetensors",
103
+ "image_prefix.visual.blocks.16.attn.proj.weight": "model-00004-of-00004.safetensors",
104
+ "image_prefix.visual.blocks.16.attn.qkv.bias": "model-00004-of-00004.safetensors",
105
+ "image_prefix.visual.blocks.16.attn.qkv.weight": "model-00004-of-00004.safetensors",
106
+ "image_prefix.visual.blocks.16.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
107
+ "image_prefix.visual.blocks.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
108
+ "image_prefix.visual.blocks.16.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
109
+ "image_prefix.visual.blocks.16.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
110
+ "image_prefix.visual.blocks.16.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
111
+ "image_prefix.visual.blocks.16.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
112
+ "image_prefix.visual.blocks.16.norm1.weight": "model-00004-of-00004.safetensors",
113
+ "image_prefix.visual.blocks.16.norm2.weight": "model-00004-of-00004.safetensors",
114
+ "image_prefix.visual.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
115
+ "image_prefix.visual.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
116
+ "image_prefix.visual.blocks.17.attn.qkv.bias": "model-00004-of-00004.safetensors",
117
+ "image_prefix.visual.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
118
+ "image_prefix.visual.blocks.17.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
119
+ "image_prefix.visual.blocks.17.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
120
+ "image_prefix.visual.blocks.17.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
121
+ "image_prefix.visual.blocks.17.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
122
+ "image_prefix.visual.blocks.17.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
123
+ "image_prefix.visual.blocks.17.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
124
+ "image_prefix.visual.blocks.17.norm1.weight": "model-00004-of-00004.safetensors",
125
+ "image_prefix.visual.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
126
+ "image_prefix.visual.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
127
+ "image_prefix.visual.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
128
+ "image_prefix.visual.blocks.18.attn.qkv.bias": "model-00004-of-00004.safetensors",
129
+ "image_prefix.visual.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
130
+ "image_prefix.visual.blocks.18.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
131
+ "image_prefix.visual.blocks.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
132
+ "image_prefix.visual.blocks.18.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
133
+ "image_prefix.visual.blocks.18.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
134
+ "image_prefix.visual.blocks.18.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
135
+ "image_prefix.visual.blocks.18.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
136
+ "image_prefix.visual.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
137
+ "image_prefix.visual.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
138
+ "image_prefix.visual.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
139
+ "image_prefix.visual.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
140
+ "image_prefix.visual.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
141
+ "image_prefix.visual.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
142
+ "image_prefix.visual.blocks.19.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
143
+ "image_prefix.visual.blocks.19.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
144
+ "image_prefix.visual.blocks.19.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
145
+ "image_prefix.visual.blocks.19.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
146
+ "image_prefix.visual.blocks.19.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
147
+ "image_prefix.visual.blocks.19.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
148
+ "image_prefix.visual.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
149
+ "image_prefix.visual.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
150
+ "image_prefix.visual.blocks.2.attn.proj.bias": "model-00003-of-00004.safetensors",
151
+ "image_prefix.visual.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
152
+ "image_prefix.visual.blocks.2.attn.qkv.bias": "model-00003-of-00004.safetensors",
153
+ "image_prefix.visual.blocks.2.attn.qkv.weight": "model-00003-of-00004.safetensors",
154
+ "image_prefix.visual.blocks.2.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
155
+ "image_prefix.visual.blocks.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
156
+ "image_prefix.visual.blocks.2.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
157
+ "image_prefix.visual.blocks.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
158
+ "image_prefix.visual.blocks.2.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
159
+ "image_prefix.visual.blocks.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
160
+ "image_prefix.visual.blocks.2.norm1.weight": "model-00003-of-00004.safetensors",
161
+ "image_prefix.visual.blocks.2.norm2.weight": "model-00003-of-00004.safetensors",
162
+ "image_prefix.visual.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
163
+ "image_prefix.visual.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
164
+ "image_prefix.visual.blocks.20.attn.qkv.bias": "model-00004-of-00004.safetensors",
165
+ "image_prefix.visual.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
166
+ "image_prefix.visual.blocks.20.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
167
+ "image_prefix.visual.blocks.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
168
+ "image_prefix.visual.blocks.20.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
169
+ "image_prefix.visual.blocks.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
170
+ "image_prefix.visual.blocks.20.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
171
+ "image_prefix.visual.blocks.20.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
172
+ "image_prefix.visual.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
173
+ "image_prefix.visual.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
174
+ "image_prefix.visual.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
175
+ "image_prefix.visual.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
176
+ "image_prefix.visual.blocks.21.attn.qkv.bias": "model-00004-of-00004.safetensors",
177
+ "image_prefix.visual.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
178
+ "image_prefix.visual.blocks.21.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
179
+ "image_prefix.visual.blocks.21.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
180
+ "image_prefix.visual.blocks.21.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
181
+ "image_prefix.visual.blocks.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
182
+ "image_prefix.visual.blocks.21.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
183
+ "image_prefix.visual.blocks.21.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
184
+ "image_prefix.visual.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
185
+ "image_prefix.visual.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
186
+ "image_prefix.visual.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
187
+ "image_prefix.visual.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
188
+ "image_prefix.visual.blocks.22.attn.qkv.bias": "model-00004-of-00004.safetensors",
189
+ "image_prefix.visual.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
190
+ "image_prefix.visual.blocks.22.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
191
+ "image_prefix.visual.blocks.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
192
+ "image_prefix.visual.blocks.22.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
193
+ "image_prefix.visual.blocks.22.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
194
+ "image_prefix.visual.blocks.22.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
195
+ "image_prefix.visual.blocks.22.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
196
+ "image_prefix.visual.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
197
+ "image_prefix.visual.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
198
+ "image_prefix.visual.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors",
199
+ "image_prefix.visual.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
200
+ "image_prefix.visual.blocks.23.attn.qkv.bias": "model-00004-of-00004.safetensors",
201
+ "image_prefix.visual.blocks.23.attn.qkv.weight": "model-00004-of-00004.safetensors",
202
+ "image_prefix.visual.blocks.23.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
203
+ "image_prefix.visual.blocks.23.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
204
+ "image_prefix.visual.blocks.23.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
205
+ "image_prefix.visual.blocks.23.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
206
+ "image_prefix.visual.blocks.23.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
207
+ "image_prefix.visual.blocks.23.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
208
+ "image_prefix.visual.blocks.23.norm1.weight": "model-00004-of-00004.safetensors",
209
+ "image_prefix.visual.blocks.23.norm2.weight": "model-00004-of-00004.safetensors",
210
+ "image_prefix.visual.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors",
211
+ "image_prefix.visual.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
212
+ "image_prefix.visual.blocks.24.attn.qkv.bias": "model-00004-of-00004.safetensors",
213
+ "image_prefix.visual.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors",
214
+ "image_prefix.visual.blocks.24.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
215
+ "image_prefix.visual.blocks.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
216
+ "image_prefix.visual.blocks.24.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
217
+ "image_prefix.visual.blocks.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
218
+ "image_prefix.visual.blocks.24.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
219
+ "image_prefix.visual.blocks.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
220
+ "image_prefix.visual.blocks.24.norm1.weight": "model-00004-of-00004.safetensors",
221
+ "image_prefix.visual.blocks.24.norm2.weight": "model-00004-of-00004.safetensors",
222
+ "image_prefix.visual.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors",
223
+ "image_prefix.visual.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
224
+ "image_prefix.visual.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
225
+ "image_prefix.visual.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors",
226
+ "image_prefix.visual.blocks.25.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
227
+ "image_prefix.visual.blocks.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
228
+ "image_prefix.visual.blocks.25.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
229
+ "image_prefix.visual.blocks.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
230
+ "image_prefix.visual.blocks.25.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
231
+ "image_prefix.visual.blocks.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
232
+ "image_prefix.visual.blocks.25.norm1.weight": "model-00004-of-00004.safetensors",
233
+ "image_prefix.visual.blocks.25.norm2.weight": "model-00004-of-00004.safetensors",
234
+ "image_prefix.visual.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors",
235
+ "image_prefix.visual.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors",
236
+ "image_prefix.visual.blocks.26.attn.qkv.bias": "model-00004-of-00004.safetensors",
237
+ "image_prefix.visual.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors",
238
+ "image_prefix.visual.blocks.26.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
239
+ "image_prefix.visual.blocks.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
240
+ "image_prefix.visual.blocks.26.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
241
+ "image_prefix.visual.blocks.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
242
+ "image_prefix.visual.blocks.26.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
243
+ "image_prefix.visual.blocks.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
244
+ "image_prefix.visual.blocks.26.norm1.weight": "model-00004-of-00004.safetensors",
245
+ "image_prefix.visual.blocks.26.norm2.weight": "model-00004-of-00004.safetensors",
246
+ "image_prefix.visual.blocks.27.attn.proj.bias": "model-00004-of-00004.safetensors",
247
+ "image_prefix.visual.blocks.27.attn.proj.weight": "model-00004-of-00004.safetensors",
248
+ "image_prefix.visual.blocks.27.attn.qkv.bias": "model-00004-of-00004.safetensors",
249
+ "image_prefix.visual.blocks.27.attn.qkv.weight": "model-00004-of-00004.safetensors",
250
+ "image_prefix.visual.blocks.27.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
251
+ "image_prefix.visual.blocks.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
252
+ "image_prefix.visual.blocks.27.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
253
+ "image_prefix.visual.blocks.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
254
+ "image_prefix.visual.blocks.27.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
255
+ "image_prefix.visual.blocks.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
256
+ "image_prefix.visual.blocks.27.norm1.weight": "model-00004-of-00004.safetensors",
257
+ "image_prefix.visual.blocks.27.norm2.weight": "model-00004-of-00004.safetensors",
258
+ "image_prefix.visual.blocks.28.attn.proj.bias": "model-00004-of-00004.safetensors",
259
+ "image_prefix.visual.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
260
+ "image_prefix.visual.blocks.28.attn.qkv.bias": "model-00004-of-00004.safetensors",
261
+ "image_prefix.visual.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
262
+ "image_prefix.visual.blocks.28.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
263
+ "image_prefix.visual.blocks.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
264
+ "image_prefix.visual.blocks.28.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
265
+ "image_prefix.visual.blocks.28.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
266
+ "image_prefix.visual.blocks.28.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
267
+ "image_prefix.visual.blocks.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
268
+ "image_prefix.visual.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
269
+ "image_prefix.visual.blocks.28.norm2.weight": "model-00004-of-00004.safetensors",
270
+ "image_prefix.visual.blocks.29.attn.proj.bias": "model-00004-of-00004.safetensors",
271
+ "image_prefix.visual.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
272
+ "image_prefix.visual.blocks.29.attn.qkv.bias": "model-00004-of-00004.safetensors",
273
+ "image_prefix.visual.blocks.29.attn.qkv.weight": "model-00004-of-00004.safetensors",
274
+ "image_prefix.visual.blocks.29.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
275
+ "image_prefix.visual.blocks.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
276
+ "image_prefix.visual.blocks.29.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
277
+ "image_prefix.visual.blocks.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
278
+ "image_prefix.visual.blocks.29.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
279
+ "image_prefix.visual.blocks.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
280
+ "image_prefix.visual.blocks.29.norm1.weight": "model-00004-of-00004.safetensors",
281
+ "image_prefix.visual.blocks.29.norm2.weight": "model-00004-of-00004.safetensors",
282
+ "image_prefix.visual.blocks.3.attn.proj.bias": "model-00003-of-00004.safetensors",
283
+ "image_prefix.visual.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
284
+ "image_prefix.visual.blocks.3.attn.qkv.bias": "model-00003-of-00004.safetensors",
285
+ "image_prefix.visual.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
286
+ "image_prefix.visual.blocks.3.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
287
+ "image_prefix.visual.blocks.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
288
+ "image_prefix.visual.blocks.3.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
289
+ "image_prefix.visual.blocks.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
290
+ "image_prefix.visual.blocks.3.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
291
+ "image_prefix.visual.blocks.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
292
+ "image_prefix.visual.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
293
+ "image_prefix.visual.blocks.3.norm2.weight": "model-00003-of-00004.safetensors",
294
+ "image_prefix.visual.blocks.30.attn.proj.bias": "model-00004-of-00004.safetensors",
295
+ "image_prefix.visual.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
296
+ "image_prefix.visual.blocks.30.attn.qkv.bias": "model-00004-of-00004.safetensors",
297
+ "image_prefix.visual.blocks.30.attn.qkv.weight": "model-00004-of-00004.safetensors",
298
+ "image_prefix.visual.blocks.30.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
299
+ "image_prefix.visual.blocks.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
300
+ "image_prefix.visual.blocks.30.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
301
+ "image_prefix.visual.blocks.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
302
+ "image_prefix.visual.blocks.30.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
303
+ "image_prefix.visual.blocks.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
304
+ "image_prefix.visual.blocks.30.norm1.weight": "model-00004-of-00004.safetensors",
305
+ "image_prefix.visual.blocks.30.norm2.weight": "model-00004-of-00004.safetensors",
306
+ "image_prefix.visual.blocks.31.attn.proj.bias": "model-00004-of-00004.safetensors",
307
+ "image_prefix.visual.blocks.31.attn.proj.weight": "model-00004-of-00004.safetensors",
308
+ "image_prefix.visual.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
309
+ "image_prefix.visual.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
310
+ "image_prefix.visual.blocks.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
311
+ "image_prefix.visual.blocks.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
312
+ "image_prefix.visual.blocks.31.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
313
+ "image_prefix.visual.blocks.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
314
+ "image_prefix.visual.blocks.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
315
+ "image_prefix.visual.blocks.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
316
+ "image_prefix.visual.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
317
+ "image_prefix.visual.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
318
+ "image_prefix.visual.blocks.4.attn.proj.bias": "model-00003-of-00004.safetensors",
319
+ "image_prefix.visual.blocks.4.attn.proj.weight": "model-00003-of-00004.safetensors",
320
+ "image_prefix.visual.blocks.4.attn.qkv.bias": "model-00003-of-00004.safetensors",
321
+ "image_prefix.visual.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
322
+ "image_prefix.visual.blocks.4.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
323
+ "image_prefix.visual.blocks.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
324
+ "image_prefix.visual.blocks.4.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
325
+ "image_prefix.visual.blocks.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
326
+ "image_prefix.visual.blocks.4.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
327
+ "image_prefix.visual.blocks.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
328
+ "image_prefix.visual.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
329
+ "image_prefix.visual.blocks.4.norm2.weight": "model-00003-of-00004.safetensors",
330
+ "image_prefix.visual.blocks.5.attn.proj.bias": "model-00003-of-00004.safetensors",
331
+ "image_prefix.visual.blocks.5.attn.proj.weight": "model-00003-of-00004.safetensors",
332
+ "image_prefix.visual.blocks.5.attn.qkv.bias": "model-00003-of-00004.safetensors",
333
+ "image_prefix.visual.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
334
+ "image_prefix.visual.blocks.5.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
335
+ "image_prefix.visual.blocks.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
336
+ "image_prefix.visual.blocks.5.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
337
+ "image_prefix.visual.blocks.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
338
+ "image_prefix.visual.blocks.5.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
339
+ "image_prefix.visual.blocks.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
340
+ "image_prefix.visual.blocks.5.norm1.weight": "model-00003-of-00004.safetensors",
341
+ "image_prefix.visual.blocks.5.norm2.weight": "model-00003-of-00004.safetensors",
342
+ "image_prefix.visual.blocks.6.attn.proj.bias": "model-00003-of-00004.safetensors",
343
+ "image_prefix.visual.blocks.6.attn.proj.weight": "model-00003-of-00004.safetensors",
344
+ "image_prefix.visual.blocks.6.attn.qkv.bias": "model-00003-of-00004.safetensors",
345
+ "image_prefix.visual.blocks.6.attn.qkv.weight": "model-00003-of-00004.safetensors",
346
+ "image_prefix.visual.blocks.6.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
347
+ "image_prefix.visual.blocks.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
348
+ "image_prefix.visual.blocks.6.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
349
+ "image_prefix.visual.blocks.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
350
+ "image_prefix.visual.blocks.6.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
351
+ "image_prefix.visual.blocks.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
352
+ "image_prefix.visual.blocks.6.norm1.weight": "model-00003-of-00004.safetensors",
353
+ "image_prefix.visual.blocks.6.norm2.weight": "model-00003-of-00004.safetensors",
354
+ "image_prefix.visual.blocks.7.attn.proj.bias": "model-00003-of-00004.safetensors",
355
+ "image_prefix.visual.blocks.7.attn.proj.weight": "model-00003-of-00004.safetensors",
356
+ "image_prefix.visual.blocks.7.attn.qkv.bias": "model-00003-of-00004.safetensors",
357
+ "image_prefix.visual.blocks.7.attn.qkv.weight": "model-00003-of-00004.safetensors",
358
+ "image_prefix.visual.blocks.7.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
359
+ "image_prefix.visual.blocks.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
360
+ "image_prefix.visual.blocks.7.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
361
+ "image_prefix.visual.blocks.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
362
+ "image_prefix.visual.blocks.7.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
363
+ "image_prefix.visual.blocks.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
364
+ "image_prefix.visual.blocks.7.norm1.weight": "model-00003-of-00004.safetensors",
365
+ "image_prefix.visual.blocks.7.norm2.weight": "model-00003-of-00004.safetensors",
366
+ "image_prefix.visual.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
367
+ "image_prefix.visual.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
368
+ "image_prefix.visual.blocks.8.attn.qkv.bias": "model-00003-of-00004.safetensors",
369
+ "image_prefix.visual.blocks.8.attn.qkv.weight": "model-00003-of-00004.safetensors",
370
+ "image_prefix.visual.blocks.8.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
371
+ "image_prefix.visual.blocks.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
372
+ "image_prefix.visual.blocks.8.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
373
+ "image_prefix.visual.blocks.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
374
+ "image_prefix.visual.blocks.8.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
375
+ "image_prefix.visual.blocks.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
376
+ "image_prefix.visual.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
377
+ "image_prefix.visual.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
378
+ "image_prefix.visual.blocks.9.attn.proj.bias": "model-00003-of-00004.safetensors",
379
+ "image_prefix.visual.blocks.9.attn.proj.weight": "model-00003-of-00004.safetensors",
380
+ "image_prefix.visual.blocks.9.attn.qkv.bias": "model-00003-of-00004.safetensors",
381
+ "image_prefix.visual.blocks.9.attn.qkv.weight": "model-00003-of-00004.safetensors",
382
+ "image_prefix.visual.blocks.9.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
383
+ "image_prefix.visual.blocks.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
384
+ "image_prefix.visual.blocks.9.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
385
+ "image_prefix.visual.blocks.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
386
+ "image_prefix.visual.blocks.9.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
387
+ "image_prefix.visual.blocks.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
388
+ "image_prefix.visual.blocks.9.norm1.weight": "model-00003-of-00004.safetensors",
389
+ "image_prefix.visual.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
390
+ "image_prefix.visual.merger.ln_q.weight": "model-00004-of-00004.safetensors",
391
+ "image_prefix.visual.merger.mlp.0.bias": "model-00004-of-00004.safetensors",
392
+ "image_prefix.visual.merger.mlp.0.weight": "model-00004-of-00004.safetensors",
393
+ "image_prefix.visual.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
394
+ "image_prefix.visual.merger.mlp.2.weight": "model-00004-of-00004.safetensors",
395
+ "image_prefix.visual.patch_embed.proj.weight": "model-00003-of-00004.safetensors",
396
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
397
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
398
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
399
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
400
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
401
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
402
+ "model.layers.0.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
403
+ "model.layers.0.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
404
+ "model.layers.0.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
405
+ "model.layers.0.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
406
+ "model.layers.0.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
407
+ "model.layers.0.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
408
+ "model.layers.0.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
409
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
410
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
411
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
412
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
413
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
414
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
415
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
416
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
417
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
418
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
419
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
420
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
421
+ "model.layers.1.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
422
+ "model.layers.1.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
423
+ "model.layers.1.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
424
+ "model.layers.1.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
425
+ "model.layers.1.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
426
+ "model.layers.1.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
427
+ "model.layers.1.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
428
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
429
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
430
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
431
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
432
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
433
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
434
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
435
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
436
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
437
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
438
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
439
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
440
+ "model.layers.10.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
441
+ "model.layers.10.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
442
+ "model.layers.10.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
443
+ "model.layers.10.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
444
+ "model.layers.10.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
445
+ "model.layers.10.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
446
+ "model.layers.10.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
447
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
448
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
449
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
450
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
451
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
452
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
453
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
454
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
455
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
456
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
457
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
458
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
459
+ "model.layers.11.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
460
+ "model.layers.11.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
461
+ "model.layers.11.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
462
+ "model.layers.11.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
463
+ "model.layers.11.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
464
+ "model.layers.11.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
465
+ "model.layers.11.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
466
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
467
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
468
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
469
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
470
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
471
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
472
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
473
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
474
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
475
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
476
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
477
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
478
+ "model.layers.12.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
479
+ "model.layers.12.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
480
+ "model.layers.12.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
481
+ "model.layers.12.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
482
+ "model.layers.12.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
483
+ "model.layers.12.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
484
+ "model.layers.12.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
485
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
486
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
487
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
488
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
489
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
490
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
491
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
492
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
493
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
494
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
495
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
496
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
497
+ "model.layers.13.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
498
+ "model.layers.13.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
499
+ "model.layers.13.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
500
+ "model.layers.13.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
501
+ "model.layers.13.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
502
+ "model.layers.13.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
503
+ "model.layers.13.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
504
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
505
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
506
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
507
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
508
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
509
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
510
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
511
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
512
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
513
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
514
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
515
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
516
+ "model.layers.14.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
517
+ "model.layers.14.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
518
+ "model.layers.14.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
519
+ "model.layers.14.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
520
+ "model.layers.14.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
521
+ "model.layers.14.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
522
+ "model.layers.14.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
523
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
524
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
525
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
526
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
527
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
528
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
529
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
530
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
531
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
532
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
533
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
534
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
535
+ "model.layers.15.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
536
+ "model.layers.15.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
537
+ "model.layers.15.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
538
+ "model.layers.15.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
539
+ "model.layers.15.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
540
+ "model.layers.15.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
541
+ "model.layers.15.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
542
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
543
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
544
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
545
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
546
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
547
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
548
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
549
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
550
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
551
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
552
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
553
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
554
+ "model.layers.16.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
555
+ "model.layers.16.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
556
+ "model.layers.16.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
557
+ "model.layers.16.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
558
+ "model.layers.16.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
559
+ "model.layers.16.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
560
+ "model.layers.16.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
561
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
562
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
563
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
564
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
565
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
566
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
567
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
568
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
569
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
570
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
571
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
572
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
573
+ "model.layers.17.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
574
+ "model.layers.17.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
575
+ "model.layers.17.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
576
+ "model.layers.17.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
577
+ "model.layers.17.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
578
+ "model.layers.17.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
579
+ "model.layers.17.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
580
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
581
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
582
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
583
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
584
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
585
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
586
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
587
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
588
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
589
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
590
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
591
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
592
+ "model.layers.18.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
593
+ "model.layers.18.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
594
+ "model.layers.18.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
595
+ "model.layers.18.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
596
+ "model.layers.18.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
597
+ "model.layers.18.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
598
+ "model.layers.18.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
599
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
600
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
601
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
602
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
603
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
604
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
605
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
606
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
607
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
608
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
609
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
610
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
611
+ "model.layers.19.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
612
+ "model.layers.19.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
613
+ "model.layers.19.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
614
+ "model.layers.19.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
615
+ "model.layers.19.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
616
+ "model.layers.19.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
617
+ "model.layers.19.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
618
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
619
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
620
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
621
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
622
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
623
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
624
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
625
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
626
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
627
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
628
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
629
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
630
+ "model.layers.2.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
631
+ "model.layers.2.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
632
+ "model.layers.2.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
633
+ "model.layers.2.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
634
+ "model.layers.2.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
635
+ "model.layers.2.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
636
+ "model.layers.2.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
637
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
638
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
639
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
640
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
641
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
642
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
643
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
644
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
645
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
646
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
647
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
648
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
649
+ "model.layers.20.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
650
+ "model.layers.20.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
651
+ "model.layers.20.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
652
+ "model.layers.20.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
653
+ "model.layers.20.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
654
+ "model.layers.20.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
655
+ "model.layers.20.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
656
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
657
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
658
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
659
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
660
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
661
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
662
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
663
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
664
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
665
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
666
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
667
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
668
+ "model.layers.21.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
669
+ "model.layers.21.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
670
+ "model.layers.21.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
671
+ "model.layers.21.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
672
+ "model.layers.21.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
673
+ "model.layers.21.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
674
+ "model.layers.21.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
675
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
676
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
677
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
678
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
679
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
680
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
681
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
682
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
683
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
684
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
685
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
686
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
687
+ "model.layers.22.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
688
+ "model.layers.22.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
689
+ "model.layers.22.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
690
+ "model.layers.22.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
691
+ "model.layers.22.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
692
+ "model.layers.22.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
693
+ "model.layers.22.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
694
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
695
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
696
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
697
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
698
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
699
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
700
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
701
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
702
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
703
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
704
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
705
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
706
+ "model.layers.23.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
707
+ "model.layers.23.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
708
+ "model.layers.23.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
709
+ "model.layers.23.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
710
+ "model.layers.23.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
711
+ "model.layers.23.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
712
+ "model.layers.23.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
713
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
714
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
715
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
716
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
717
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
718
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
719
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
720
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
721
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
722
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
723
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
724
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
725
+ "model.layers.24.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
726
+ "model.layers.24.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
727
+ "model.layers.24.self_attn.casa_attn.o_proj_casa.weight": "model-00002-of-00004.safetensors",
728
+ "model.layers.24.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
729
+ "model.layers.24.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
730
+ "model.layers.24.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
731
+ "model.layers.24.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
732
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
733
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
734
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
735
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
736
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
737
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
738
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
739
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
740
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
741
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
742
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
743
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
744
+ "model.layers.25.self_attn.casa_attn.k_proj_casa.bias": "model-00002-of-00004.safetensors",
745
+ "model.layers.25.self_attn.casa_attn.k_proj_casa.weight": "model-00002-of-00004.safetensors",
746
+ "model.layers.25.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
747
+ "model.layers.25.self_attn.casa_attn.q_proj_casa.bias": "model-00002-of-00004.safetensors",
748
+ "model.layers.25.self_attn.casa_attn.q_proj_casa.weight": "model-00002-of-00004.safetensors",
749
+ "model.layers.25.self_attn.casa_attn.v_proj_casa.bias": "model-00002-of-00004.safetensors",
750
+ "model.layers.25.self_attn.casa_attn.v_proj_casa.weight": "model-00002-of-00004.safetensors",
751
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
752
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
753
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
754
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
755
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
756
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
757
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
758
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
759
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
760
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
761
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
762
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
763
+ "model.layers.26.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
764
+ "model.layers.26.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
765
+ "model.layers.26.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
766
+ "model.layers.26.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
767
+ "model.layers.26.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
768
+ "model.layers.26.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
769
+ "model.layers.26.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
770
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
771
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
772
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
773
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
774
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
775
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
776
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
777
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
778
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
779
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
780
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
781
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
782
+ "model.layers.27.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
783
+ "model.layers.27.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
784
+ "model.layers.27.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
785
+ "model.layers.27.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
786
+ "model.layers.27.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
787
+ "model.layers.27.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
788
+ "model.layers.27.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
789
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
790
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
791
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
792
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
793
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
794
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
795
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
796
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
797
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
798
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
799
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
800
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
801
+ "model.layers.28.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
802
+ "model.layers.28.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
803
+ "model.layers.28.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
804
+ "model.layers.28.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
805
+ "model.layers.28.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
806
+ "model.layers.28.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
807
+ "model.layers.28.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
808
+ "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
809
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
810
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
811
+ "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
812
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
813
+ "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
814
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
815
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
816
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
817
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
818
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
819
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
820
+ "model.layers.29.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
821
+ "model.layers.29.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
822
+ "model.layers.29.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
823
+ "model.layers.29.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
824
+ "model.layers.29.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
825
+ "model.layers.29.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
826
+ "model.layers.29.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
827
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
828
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
829
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
830
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
831
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
832
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
833
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
834
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
835
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
836
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
837
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
838
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
839
+ "model.layers.3.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
840
+ "model.layers.3.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
841
+ "model.layers.3.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
842
+ "model.layers.3.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
843
+ "model.layers.3.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
844
+ "model.layers.3.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
845
+ "model.layers.3.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
846
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
847
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
848
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
849
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
850
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
851
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
852
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
853
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
854
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
855
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
856
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
857
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
858
+ "model.layers.30.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
859
+ "model.layers.30.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
860
+ "model.layers.30.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
861
+ "model.layers.30.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
862
+ "model.layers.30.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
863
+ "model.layers.30.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
864
+ "model.layers.30.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
865
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
866
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
867
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
868
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
869
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
870
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
871
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
872
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
873
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
874
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
875
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
876
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
877
+ "model.layers.31.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
878
+ "model.layers.31.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
879
+ "model.layers.31.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
880
+ "model.layers.31.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
881
+ "model.layers.31.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
882
+ "model.layers.31.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
883
+ "model.layers.31.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
884
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
885
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
886
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
887
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
888
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
889
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
890
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
891
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
892
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
893
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
894
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
895
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
896
+ "model.layers.32.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
897
+ "model.layers.32.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
898
+ "model.layers.32.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
899
+ "model.layers.32.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
900
+ "model.layers.32.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
901
+ "model.layers.32.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
902
+ "model.layers.32.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
903
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
904
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
905
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
906
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
907
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
908
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
909
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
910
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
911
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
912
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
913
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
914
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
915
+ "model.layers.33.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
916
+ "model.layers.33.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
917
+ "model.layers.33.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
918
+ "model.layers.33.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
919
+ "model.layers.33.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
920
+ "model.layers.33.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
921
+ "model.layers.33.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
922
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
923
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
924
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
925
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
926
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
927
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
928
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
929
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
930
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
931
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
932
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
933
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
934
+ "model.layers.34.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
935
+ "model.layers.34.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
936
+ "model.layers.34.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
937
+ "model.layers.34.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
938
+ "model.layers.34.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
939
+ "model.layers.34.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
940
+ "model.layers.34.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
941
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
942
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
943
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
944
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
945
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
946
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
947
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
948
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00004.safetensors",
949
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
950
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
951
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
952
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
953
+ "model.layers.35.self_attn.casa_attn.k_proj_casa.bias": "model-00003-of-00004.safetensors",
954
+ "model.layers.35.self_attn.casa_attn.k_proj_casa.weight": "model-00003-of-00004.safetensors",
955
+ "model.layers.35.self_attn.casa_attn.o_proj_casa.weight": "model-00003-of-00004.safetensors",
956
+ "model.layers.35.self_attn.casa_attn.q_proj_casa.bias": "model-00003-of-00004.safetensors",
957
+ "model.layers.35.self_attn.casa_attn.q_proj_casa.weight": "model-00003-of-00004.safetensors",
958
+ "model.layers.35.self_attn.casa_attn.v_proj_casa.bias": "model-00003-of-00004.safetensors",
959
+ "model.layers.35.self_attn.casa_attn.v_proj_casa.weight": "model-00003-of-00004.safetensors",
960
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
961
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
962
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
963
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
964
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
965
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
966
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
967
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
968
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
969
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
970
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
971
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
972
+ "model.layers.4.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
973
+ "model.layers.4.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
974
+ "model.layers.4.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
975
+ "model.layers.4.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
976
+ "model.layers.4.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
977
+ "model.layers.4.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
978
+ "model.layers.4.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
979
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
980
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
981
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
982
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
983
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
984
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
985
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
986
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
987
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
988
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
989
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
990
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
991
+ "model.layers.5.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
992
+ "model.layers.5.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
993
+ "model.layers.5.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
994
+ "model.layers.5.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
995
+ "model.layers.5.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
996
+ "model.layers.5.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
997
+ "model.layers.5.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
998
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
999
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
1000
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
1001
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
1002
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
1003
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
1004
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
1005
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
1006
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
1007
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
1008
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
1009
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
1010
+ "model.layers.6.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
1011
+ "model.layers.6.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
1012
+ "model.layers.6.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
1013
+ "model.layers.6.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
1014
+ "model.layers.6.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
1015
+ "model.layers.6.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
1016
+ "model.layers.6.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
1017
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
1018
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
1019
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
1020
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
1021
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
1022
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
1023
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
1024
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
1025
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
1026
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
1027
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
1028
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
1029
+ "model.layers.7.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
1030
+ "model.layers.7.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
1031
+ "model.layers.7.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
1032
+ "model.layers.7.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
1033
+ "model.layers.7.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
1034
+ "model.layers.7.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
1035
+ "model.layers.7.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
1036
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
1037
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
1038
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
1039
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
1040
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
1041
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
1042
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
1043
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
1044
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
1045
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
1046
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
1047
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
1048
+ "model.layers.8.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
1049
+ "model.layers.8.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
1050
+ "model.layers.8.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
1051
+ "model.layers.8.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
1052
+ "model.layers.8.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
1053
+ "model.layers.8.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
1054
+ "model.layers.8.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
1055
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
1056
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
1057
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
1058
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
1059
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
1060
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
1061
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
1062
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
1063
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
1064
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
1065
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
1066
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
1067
+ "model.layers.9.self_attn.casa_attn.k_proj_casa.bias": "model-00001-of-00004.safetensors",
1068
+ "model.layers.9.self_attn.casa_attn.k_proj_casa.weight": "model-00001-of-00004.safetensors",
1069
+ "model.layers.9.self_attn.casa_attn.o_proj_casa.weight": "model-00001-of-00004.safetensors",
1070
+ "model.layers.9.self_attn.casa_attn.q_proj_casa.bias": "model-00001-of-00004.safetensors",
1071
+ "model.layers.9.self_attn.casa_attn.q_proj_casa.weight": "model-00001-of-00004.safetensors",
1072
+ "model.layers.9.self_attn.casa_attn.v_proj_casa.bias": "model-00001-of-00004.safetensors",
1073
+ "model.layers.9.self_attn.casa_attn.v_proj_casa.weight": "model-00001-of-00004.safetensors",
1074
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
1075
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
1076
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
1077
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
1078
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
1079
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
1080
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
1081
+ "model.norm.weight": "model-00003-of-00004.safetensors"
1082
+ }
1083
+ }
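Aside (not part of the committed files): the `weight_map` above is the standard sharded-safetensors index, mapping every parameter name — including the added `casa_attn` projections (`q_proj_casa`, `k_proj_casa`, `v_proj_casa`, `o_proj_casa`) — to the shard file that stores it. Below is a minimal sketch of how such an index is typically consumed, assuming the checkpoint has already been downloaded to a local directory (the path is illustrative, not taken from this repo):

```python
import json
from collections import defaultdict

from safetensors.torch import load_file

# Illustrative local path; adjust to wherever the repository was downloaded.
repo_dir = "CASA-Qwen2_5-VL-3B-LiveCC"

# The index maps each parameter name to the shard that contains it.
with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

# Group parameter names by shard file.
params_per_shard: dict[str, list[str]] = defaultdict(list)
for param_name, shard_file in index["weight_map"].items():
    params_per_shard[shard_file].append(param_name)

# Load one shard and pick out only the CASA-specific projections.
shard = load_file(f"{repo_dir}/model-00002-of-00004.safetensors")
casa_tensors = {name: t for name, t in shard.items() if ".casa_attn." in name}
print(f"{len(casa_tensors)} CASA tensors in this shard")
```

In practice `from_pretrained` resolves the index and shards automatically; this sketch only illustrates how the mapping listed above is laid out on disk.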
modeling_qwen2_5vl_casa.py ADDED
@@ -0,0 +1,308 @@
1
+ from functools import partial
2
+ from typing import Any
3
+ from typing import cast as type_cast
4
+
5
+ import torch
6
+ from transformers.cache_utils import DynamicCache
7
+ from transformers.generation.utils import GenerateOutput
8
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
9
+ Qwen2_5_VLCausalLMOutputWithPast,
10
+ Qwen2_5_VLForConditionalGeneration,
11
+ )
12
+
13
+ from .image_encoder import Qwen25VLEncoder
14
+ from .configuration_qwen2_5vl_casa import Qwen2_5_VLCASAConfig
15
+ from .language_qwen2_5vl_casa import (
16
+ Qwen2_5_VLAttention_CASA,
17
+ QwenCASAAttention,
18
+ QwenCASAAttentionHandler,
19
+ add_casa_layers,
20
+ )
21
+
22
+
23
+ class V2Qwen2_5VL(Qwen2_5_VLForConditionalGeneration): # pyright: ignore[reportIncompatibleMethodOverride]
24
+ config_class = Qwen2_5_VLCASAConfig
25
+
26
+ def __init__(self, config: Qwen2_5_VLCASAConfig, **kwargs: Any) -> None:
27
+ del kwargs
28
+ super().__init__(config)
29
+ self.image_prefix = Qwen25VLEncoder(self.visual) # type: ignore[assignment]
30
+ self.visual = None
31
+ self.model.apply(partial(add_casa_layers, xa_layers=self.config.xa_layers))
32
+
33
+ def get_device(self) -> str:
34
+ """Return the device type of the model"""
35
+ return next(self.parameters()).device.type
36
+
37
+ @property
38
+ def token_dim(self) -> int:
39
+ """Returns the number of dimensions for the token representation"""
40
+ return self.config.hidden_size
41
+
42
+ def _update_model_kwargs_for_generation(
43
+ self,
44
+ outputs: Any,
45
+ model_kwargs: dict[str, Any],
46
+ is_encoder_decoder: bool = False,
47
+ num_new_tokens: int = 1,
48
+ ):
49
+ """This is required to handle multiple gen calls for subtitles"""
50
+ # Call parent to get default updates
51
+ model_kwargs = super()._update_model_kwargs_for_generation(
52
+ outputs, model_kwargs, is_encoder_decoder, num_new_tokens
53
+ )
54
+ # Used by prepare_inputs_for_generation
55
+ model_kwargs["__is_first_gen_call__"] = False
56
+ return model_kwargs
57
+
58
+ def prepare_inputs_for_generation( # pyright: ignore[reportIncompatibleMethodOverride]
59
+ self,
60
+ input_ids: torch.Tensor,
61
+ past_key_values: DynamicCache | None = None,
62
+ **kwargs: Any,
63
+ ):
64
+ """Required to handle cache_position = None with QwenVL"""
65
+ __is_first_gen_call__ = kwargs.pop("__is_first_gen_call__", True)
66
+ if past_key_values is not None and (
67
+ kwargs.get("cache_position") is None
68
+ or type_cast(torch.Tensor, kwargs.get("cache_position")).shape[0] == 0
69
+ ):
70
+ # We're continuing from a cached state
71
+ past_length = past_key_values._seen_tokens
72
+ kwargs["cache_position"] = torch.arange(
73
+ past_length,
74
+ past_length + (input_ids.shape[1] if __is_first_gen_call__ else 1),
75
+ dtype=torch.long,
76
+ device=input_ids.device,
77
+ )
78
+
79
+ return super().prepare_inputs_for_generation(
80
+ input_ids,
81
+ past_key_values=past_key_values,
82
+ **kwargs,
83
+ )
84
+
85
+ def prepare_multimodal_inputs(
86
+ self,
87
+ # text only training
88
+ input_ids: torch.Tensor | None = None,
89
+ inputs_embeds: torch.Tensor | None = None,
90
+ attention_mask: torch.Tensor | None = None,
91
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
92
+ labels: torch.Tensor | None = None,
93
+ # image values
94
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
95
+ pre_image_tokens: list[int] | None = None,
96
+ post_image_tokens: list[int] | None = None,
97
+ **_kwargs: Any,
98
+ ) -> dict:
99
+ """Get a batch data mixing text and image data"""
100
+ del _kwargs
101
+
102
+ processed_inputs: dict = {
103
+ "input_ids": input_ids,
104
+ "inputs_embeds": inputs_embeds,
105
+ "labels": labels,
106
+ "attention_mask": attention_mask,
107
+ "image_embeds_insertion_points": image_embeds_insertion_points,
108
+ }
109
+ if pixel_values is not None:
110
+ processed_inputs.update(self.image_prefix(pixel_values))
111
+ assert "image_embeds" in processed_inputs
112
+ assert (
113
+ isinstance(processed_inputs["image_embeds"], torch.Tensor)
114
+ and processed_inputs["image_embeds"].ndim == 3
115
+ ) or (
116
+ isinstance(processed_inputs["image_embeds"], list)
117
+ and all(_x.ndim == 2 for _x in processed_inputs["image_embeds"])
118
+ )
119
+
120
+ # Add kwargs necessary to compute cu_seqlens windows for CASA
121
+ processed_inputs["casa_windows_info"] = {
122
+ "num_post_image_tokens": 0 if post_image_tokens is None else len(post_image_tokens),
123
+ "num_pre_image_tokens": 0 if pre_image_tokens is None else len(pre_image_tokens),
124
+ }
125
+
126
+ return processed_inputs
127
+
128
+ def forward( # type: ignore[override] # pylint: disable=W0221
129
+ self,
130
+ input_ids: torch.Tensor | None = None,
131
+ inputs_embeds: torch.Tensor | None = None,
132
+ attention_mask: torch.Tensor | None = None,
133
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
134
+ labels: torch.Tensor | None = None,
135
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
136
+ reinit_casa_handler: bool = True,
137
+ pre_image_tokens: list[int] | None = None,
138
+ post_image_tokens: list[int] | None = None,
139
+ **kwargs: Any,
140
+ ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
141
+ """Multi-modal forward pass"""
142
+
143
+ if reinit_casa_handler:
144
+ processed_inputs = self.prepare_multimodal_inputs(
145
+ input_ids=input_ids,
146
+ inputs_embeds=inputs_embeds,
147
+ attention_mask=attention_mask,
148
+ image_embeds_insertion_points=image_embeds_insertion_points,
149
+ pixel_values=pixel_values,
150
+ labels=labels,
151
+ post_image_tokens=post_image_tokens,
152
+ pre_image_tokens=pre_image_tokens,
153
+ )
154
+ inputs_embeds = type_cast(
155
+ torch.Tensor, self.model.embed_tokens(processed_inputs["input_ids"])
156
+ )
157
+ casa_attention_handler: QwenCASAAttentionHandler | None = None
158
+ image_embeds = processed_inputs.get("image_embeds", None)
159
+ attention_mask = processed_inputs["attention_mask"]
160
+ inst_points = processed_inputs.get("image_embeds_insertion_points", None)
161
+ if image_embeds is None:
162
+ inst_points = None
163
+ casa_attention_handler = QwenCASAAttentionHandler(
164
+ # for text tokens, we don't need the actual values
165
+ inputs_embeds=torch.zeros_like(inputs_embeds),
166
+ # for image embeddings, we put real inputs as this will be fixed
167
+ image_embeds=[] if image_embeds is None else image_embeds,
168
+ image_embeds_insertion_points=inst_points,
169
+ # attention mask is only needed at inference / left padding
170
+ attention_mask=None if self.training else processed_inputs["attention_mask"],
171
+ rope_fn=self.model.rotary_emb,
172
+ windows=self.config.casa_windows,
173
+ casa_windows_info=processed_inputs.pop("casa_windows_info", None),
174
+ use_asymetric_q_kv=self.config.casa_use_asymetric_qkv,
175
+ # extra for Qwen
176
+ get_rope_index=self.get_rope_index,
177
+ grid_thw=processed_inputs.get("grid_thw", None),
178
+ )
179
+ self.update_casa_states(casa_attention_handler)
180
+ else:
181
+ inputs_embeds = self.model.embed_tokens(input_ids)
182
+
183
+ # Run Qwen with the attention layers replaced to use CASA
184
+ assert inputs_embeds is not None, "Could not compute input embeddings!"
185
+ out = super().forward(
186
+ inputs_embeds=inputs_embeds, # type: ignore[arg-type]
187
+ attention_mask=attention_mask,
188
+ pixel_values=None,
189
+ **kwargs,
190
+ )
191
+
192
+ return out
193
+
194
+ @torch.no_grad()
195
+ def generate_from_image( # pyright: ignore[reportInconsistentOverload]
196
+ self,
197
+ input_ids: torch.Tensor | None = None,
198
+ inputs_embeds: torch.Tensor | None = None,
199
+ attention_mask: torch.Tensor | None = None,
200
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
201
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
202
+ pre_image_tokens: list[int] | None = None,
203
+ post_image_tokens: list[int] | None = None,
204
+ position_ids_offset: int | None = None,
205
+ reset_streaming: bool = True,
206
+ **kwargs: Any,
207
+ ) -> GenerateOutput | torch.LongTensor:
208
+ """Custom generate function"""
209
+ assert input_ids is not None and inputs_embeds is None, (
210
+ "Input IDs must be provided for generation"
211
+ )
212
+
213
+ # init self-attention KVCache
214
+ if kwargs.get("past_key_values", None) is None:
215
+ kwargs["past_key_values"] = DynamicCache()
216
+
217
+ # To avoid generate warning
218
+ if kwargs.get("pad_token_id", None) is None:
219
+ kwargs["pad_token_id"] = kwargs.get("eos_token_id", None)
220
+ if isinstance(kwargs["pad_token_id"], (list, tuple)):
221
+ kwargs["pad_token_id"] = kwargs["pad_token_id"][0]
222
+
223
+ # Init CASA states
224
+ processed_inputs = self.prepare_multimodal_inputs(
225
+ input_ids=input_ids,
226
+ inputs_embeds=inputs_embeds,
227
+ attention_mask=attention_mask,
228
+ image_embeds_insertion_points=image_embeds_insertion_points,
229
+ pixel_values=pixel_values,
230
+ labels=None,
231
+ pre_image_tokens=pre_image_tokens,
232
+ post_image_tokens=post_image_tokens,
233
+ )
234
+
235
+ if pixel_values is not None:
236
+ assert (image_embeds := processed_inputs.get("image_embeds", None)) is not None
237
+ assert (
238
+ insrt_pts := processed_inputs.get("image_embeds_insertion_points", None)
239
+ ) is not None
240
+ casa_attention_handler = QwenCASAAttentionHandler(
241
+ inputs_embeds=torch.empty(
242
+ (input_ids.shape[0], input_ids.shape[1], image_embeds[0].shape[-1]),
243
+ dtype=image_embeds[0].dtype,
244
+ device=image_embeds[0].device,
245
+ ),
246
+ image_embeds=image_embeds,
247
+ image_embeds_insertion_points=insrt_pts,
248
+ attention_mask=attention_mask,
249
+ rope_fn=self.model.rotary_emb,
250
+ windows=self.config.casa_windows,
251
+ casa_windows_info=processed_inputs.pop("casa_windows_info", None),
252
+ use_asymetric_q_kv=self.config.casa_use_asymetric_qkv,
253
+ get_rope_index=self.get_rope_index,
254
+ grid_thw=processed_inputs.get("grid_thw", None),
255
+ position_ids_offset=position_ids_offset or kwargs["past_key_values"]._seen_tokens,
256
+ )
257
+ self.update_casa_states(casa_attention_handler)
258
+ self.start_casa_streaming_states()
259
+ pixel_values = None
260
+
261
+ # Generate
262
+ outputs = self.generate(
263
+ input_ids,
264
+ attention_mask=attention_mask,
265
+ pixel_values=pixel_values,
266
+ use_cache=True,
267
+ reinit_casa_handler=False,
268
+ **kwargs,
269
+ )
270
+
271
+ if reset_streaming:
272
+ self.reset_casa_streaming_states()
273
+ return outputs
274
+
275
+ def update_casa_states(self, handler: QwenCASAAttentionHandler | None):
276
+ """Update handler in all layers"""
277
+
278
+ def __update__(m: torch.nn.Module):
279
+ nonlocal handler
280
+
281
+ if isinstance(m, Qwen2_5_VLAttention_CASA):
282
+ m.casa_attention_handler = handler
283
+
284
+ self.apply(__update__)
285
+
286
+ def reset_casa_streaming_states(self, clean_cache: bool = True) -> None:
287
+ def __reset__(m: torch.nn.Module):
288
+ if isinstance(m, QwenCASAAttention):
289
+ m._set_streaming(False, ())
290
+ m.reset_streaming()
291
+ if clean_cache:
292
+ del m.streaming_state.k
293
+ del m.streaming_state.v
294
+ m.streaming_state.k = None # pyright: ignore[reportAttributeAccessIssue]
295
+ m.streaming_state.v = None # pyright: ignore[reportAttributeAccessIssue]
296
+
297
+ elif isinstance(m, Qwen2_5_VLAttention_CASA):
298
+ del m.casa_attention_handler
299
+ m.casa_attention_handler = None
300
+
301
+ self.apply(__reset__)
302
+
303
+ def start_casa_streaming_states(self) -> None:
304
+ def __start__(m: torch.nn.Module):
305
+ if isinstance(m, QwenCASAAttention):
306
+ m._set_streaming(True, ())
307
+
308
+ self.apply(__start__)
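
The class above exposes `generate_from_image` as the CASA generation entry point: it builds the multimodal batch, instantiates a `QwenCASAAttentionHandler`, switches the CASA layers into streaming mode, and then delegates to the regular `generate`. Below is a minimal usage sketch under assumptions not fixed by this file: the repository id, the `AutoModelForCausalLM`/`AutoProcessor` remote-code loading path, and the prompt contents are illustrative only (see the main model card for the supported loading instructions).

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "kyutai/CASA-Qwen2_5-VL-3B-LiveCC"  # assumption: this repository id
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(  # assumption: the auto class mapped by this repo
    repo, trust_remote_code=True, torch_dtype=torch.bfloat16
).to("cuda")

# One user turn interleaving an image and a question (schema defined in processing.py below).
messages = [{"role": "user", "content": [
    {"type": "image", "image": Image.open("frame.png")},  # placeholder image path
    {"type": "text", "text": "What is happening in this frame?"},
]}]
batch = processor.tokenize_messages(messages).to("cuda")

out = model.generate_from_image(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    pixel_values=batch["pixel_values"],
    image_embeds_insertion_points=batch["image_embeds_insertion_points"],
    pre_image_tokens=processor.pre_image_tokens,
    post_image_tokens=processor.post_image_tokens,
    max_new_tokens=64,
)
print(processor.tokenizer.decode(out[0], skip_special_tokens=True))
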
processing.py ADDED
@@ -0,0 +1,505 @@
1
+ # pylint: disable=no-member # avoid weird pylint warnings from SentencePieceProcessor
2
+ """Text and Image processor for CASA models using Qwen2.5_VL image encoder"""
3
+
4
+ from math import ceil
5
+ from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast, overload
6
+ from typing import cast as type_cast
7
+
8
+ import torch
9
+ import torchvision.transforms.v2 as T
10
+ from einops import rearrange
11
+ from PIL import Image
12
+ from torchvision.transforms import InterpolationMode
13
+ from torchvision.transforms.functional import to_tensor as pil_to_tensor
14
+ from torchvision.transforms.v2 import functional as F
15
+ from transformers.image_processing_utils import BaseImageProcessor
16
+ from transformers.processing_utils import ProcessorMixin
17
+
18
+ if TYPE_CHECKING:
19
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
20
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
+
22
+
23
+ ImageMessage = TypedDict(
24
+ "ImageMessage",
25
+ {
26
+ "type": Literal["image"],
27
+ "image": str | Image.Image | None,
28
+ },
29
+ )
30
+
31
+ TextMessage = TypedDict(
32
+ "TextMessage",
33
+ {
34
+ "type": Literal["text"],
35
+ "text": str,
36
+ },
37
+ )
38
+
39
+ MessageContent = list[ImageMessage | TextMessage]
40
+
41
+ Message = TypedDict(
42
+ "Message",
43
+ {
44
+ "role": Literal["system", "user", "assistant"],
45
+ "content": MessageContent,
46
+ },
47
+ )
48
+
49
+ ProcessorInput = list[list[Message]] | list[Message]
50
+
51
+ __INTERP_NAME_TO_MODE__ = {
52
+ "nearest": InterpolationMode.NEAREST,
53
+ "bilinear": InterpolationMode.BILINEAR,
54
+ "bicubic": InterpolationMode.BICUBIC,
55
+ "lanczos": InterpolationMode.LANCZOS,
56
+ }
57
+
58
+ __INTERP_INT_TO_MODE__ = {
59
+ 0: InterpolationMode.NEAREST,
60
+ 2: InterpolationMode.BILINEAR,
61
+ 3: InterpolationMode.BICUBIC,
62
+ 4: InterpolationMode.BOX,
63
+ 5: InterpolationMode.HAMMING,
64
+ 1: InterpolationMode.LANCZOS,
65
+ }
66
+
67
+
68
+ @overload
69
+ def universal_resize(
70
+ img: Image.Image,
71
+ size: tuple[int, int],
72
+ interpolation: str | InterpolationMode | int = "bilinear",
73
+ antialias: bool = True,
74
+ ) -> Image.Image: ...
75
+ @overload
76
+ def universal_resize(
77
+ img: torch.Tensor,
78
+ size: tuple[int, int],
79
+ interpolation: str | InterpolationMode | int = "bilinear",
80
+ antialias: bool = True,
81
+ ) -> torch.Tensor: ...
82
+ def universal_resize(
83
+ img: Image.Image | torch.Tensor,
84
+ size: tuple[int, int],
85
+ interpolation: str | InterpolationMode | int = "bilinear",
86
+ antialias: bool = True,
87
+ ) -> Image.Image | torch.Tensor:
88
+ """Resize that works for PIL.Image, CHW tensor, or BCHW tensor"""
89
+ if isinstance(interpolation, str):
90
+ interpolation = __INTERP_NAME_TO_MODE__[interpolation]
91
+ elif isinstance(interpolation, int):
92
+ interpolation = __INTERP_INT_TO_MODE__[interpolation]
93
+
94
+ return F.resize(
95
+ img, size, interpolation=type_cast(InterpolationMode, interpolation), antialias=antialias
96
+ )
97
+
98
+
99
+ @overload
100
+ def convert_to_rgb(img: Image.Image) -> Image.Image: ...
101
+ @overload
102
+ def convert_to_rgb(img: torch.Tensor) -> torch.Tensor: ...
103
+ def convert_to_rgb(img: Image.Image | torch.Tensor) -> Image.Image | torch.Tensor:
104
+ """Convert any image to RGB in a way that does not throw PIL warning"""
105
+ if isinstance(img, torch.Tensor):
106
+ return img
107
+ if img.mode == "RGB": # no changes
108
+ return img
109
+ if img.mode == "P": # palette images need to be converted to RGBA first
110
+ return img.convert("RGBA").convert("RGB")
111
+ return img.convert("RGB")
112
+
113
+
114
+ class QwenImageProcessor(BaseImageProcessor):
115
+ """Resizing for the Qwen2.5VL encoder. Note that the normalization is
116
+ handled in the image_encoder in the model forward"""
117
+
118
+ def __init__(
119
+ self,
120
+ img_size: int = 448,
121
+ interpolation: Literal["bicubic", "bilinear", "nearest", "nearest_exact"] = "bicubic",
122
+ max_ratio: int = 10,
123
+ round_to_patch_size: int = 56,
124
+ use_fast: bool = True,
125
+ **kwargs: Any,
126
+ ) -> None:
127
+ # this will also be used in V2llms to determine whether to remove
128
+ # the temporal conv
129
+ self._num_target_channels = 588
130
+ self._merge_size = 2
131
+ self._patch_size = 14
132
+ super().__init__(
133
+ use_fast=use_fast,
134
+ do_normalize=False,
135
+ **kwargs,
136
+ )
137
+ self.img_size = img_size
138
+ self.interpolation = interpolation
139
+ self.max_ratio = max_ratio
140
+ self.round_to_patch_size = round_to_patch_size
141
+
142
+ def resize_transform(
143
+ self, img: Image.Image | torch.Tensor, img_size: int | None = None
144
+ ) -> Image.Image | torch.Tensor:
145
+ if img_size is None:
146
+ img_size = self.img_size
147
+ max_area = img_size**2
148
+ if isinstance(img, Image.Image):
149
+ img = convert_to_rgb(img)
150
+ w_og, h_og = img.size
151
+ else:
152
+ h_og, w_og = img.shape[-2:]
153
+ w, h = w_og, h_og
154
+
155
+ # Qwen requires an aspect ratio of at most max_ratio (default 10) between the longer and shorter side
156
+ if self.max_ratio > 0:
157
+ w, h = max(w, h // self.max_ratio), max(h, w // self.max_ratio)
158
+
159
+ # resize to max area
160
+ current_area = w * h
161
+ if current_area > max_area:
162
+ scale = (max_area / current_area) ** 0.5
163
+ w, h = int(w * scale), int(h * scale)
164
+
165
+ # resize to patch size
166
+ if self.round_to_patch_size > 0:
167
+ w = ceil(w / self.round_to_patch_size) * self.round_to_patch_size
168
+ h = ceil((h / self.round_to_patch_size)) * self.round_to_patch_size
169
+
170
+ # resize
171
+ if w != w_og or h != h_og:
172
+ img = universal_resize(img, (h, w), self.interpolation)
173
+ if isinstance(img, torch.Tensor):
174
+ img = T.ToDtype(torch.float32, scale=True)(T.ToImage()(img))
175
+ return img
176
+
177
+ def __process_one__(
178
+ self, video_or_img: Image.Image | torch.Tensor, img_size: int | None = None
179
+ ) -> torch.Tensor:
180
+ """Same operation as __process_one_with_processor__ but without going through numpy"""
181
+ video_or_img = self.resize_transform(video_or_img, img_size)
182
+ if isinstance(video_or_img, Image.Image):
183
+ video_or_img = pil_to_tensor(video_or_img)
184
+ assert isinstance(video_or_img, torch.Tensor)
185
+ if video_or_img.ndim == 3:
186
+ video_or_img = video_or_img[None]
187
+ assert video_or_img.ndim == 4 and video_or_img.shape[1] == 3, (
188
+ f"Invalid shape {video_or_img.shape}."
189
+ )
190
+ t, c, h, w = video_or_img.shape
191
+ p = self._patch_size
192
+ m = self._merge_size
193
+
194
+ # Convert to RGB
195
+ if c == 1:
196
+ video_or_img = video_or_img.expand((-1, 3, -1, -1))
197
+ if c == 4:
198
+ video_or_img = video_or_img[:, :3]
199
+ c = video_or_img.shape[1]
200
+ assert c == 3, "Expecting RGB image in QwenNormalize"
201
+
202
+ # Reshape to t h w c' format
203
+ h, w = video_or_img.shape[2] // p, video_or_img.shape[3] // p
204
+ rearrange_dict = dict(p1=p, p2=p, m1=m, m2=m)
205
+
206
+ video_or_img = rearrange(
207
+ video_or_img,
208
+ "t c (h m1 p1) (w m2 p2) -> (t h w m1 m2) (c p1 p2)",
209
+ **rearrange_dict,
210
+ )
211
+ assert video_or_img.shape[-1] == self._num_target_channels, (
212
+ f"{video_or_img.shape[-1]} != {self._num_target_channels}"
213
+ )
214
+ video_or_img = video_or_img.view((-1, h, w, self._num_target_channels))
215
+
216
+ return video_or_img
217
+
218
+ @overload
219
+ def process_images(
220
+ self, image: Image.Image | torch.Tensor, img_size: int | None = None
221
+ ) -> torch.Tensor: ...
222
+ @overload
223
+ def process_images(
224
+ self, image: list[Image.Image] | list[torch.Tensor], img_size: int | None = None
225
+ ) -> list[torch.Tensor]: ...
226
+ def process_images(
227
+ self,
228
+ image: Image.Image | torch.Tensor | list[Image.Image] | list[torch.Tensor],
229
+ img_size: int | None = None,
230
+ ) -> torch.Tensor | list[torch.Tensor]:
231
+ if isinstance(image, list):
232
+ return [self.__process_one__(_x, img_size) for _x in image]
233
+ return self.__process_one__(image, img_size)
234
+
235
+
236
+ class ProcessorOutput(dict):
237
+ input_ids: torch.Tensor
238
+ attention_mask: torch.Tensor
239
+ image_embeds_insertion_points: list[torch.Tensor] | None
240
+ pixel_values: torch.Tensor | list[torch.Tensor] | None
241
+
242
+ def to(
243
+ self, device: torch.device | str, dtype: torch.dtype = torch.bfloat16
244
+ ) -> "ProcessorOutput":
245
+ return ProcessorOutput(
246
+ {
247
+ "input_ids": self["input_ids"].to(device),
248
+ "attention_mask": self["attention_mask"].to(device),
249
+ "image_embeds_insertion_points": self["image_embeds_insertion_points"],
250
+ "pixel_values": (
251
+ self["pixel_values"].to(dtype).to(device)
252
+ if isinstance(self["pixel_values"], torch.Tensor)
253
+ else [x.to(dtype).to(device) for x in self["pixel_values"]]
254
+ if self["pixel_values"] is not None
255
+ else None
256
+ ),
257
+ }
258
+ )
259
+
260
+
261
+ class BaseProcessor(ProcessorMixin):
262
+ def __init__(
263
+ self,
264
+ tokenizer: "PreTrainedTokenizerFast | Qwen2Tokenizer",
265
+ pre_image_tokens: tuple[int, ...] = (),
266
+ post_image_tokens: tuple[int, ...] = (),
267
+ system_start_tokens: tuple[int, ...] = (),
268
+ system_end_tokens: tuple[int, ...] = (),
269
+ user_start_tokens: tuple[int, ...] = (),
270
+ user_end_tokens: tuple[int, ...] = (),
271
+ asst_start_tokens: tuple[int, ...] = (),
272
+ asst_end_tokens: tuple[int, ...] = (),
273
+ allow_system_prompt: bool = True,
274
+ pad_token: int = 0,
275
+ bos_token: int | None = None,
276
+ ) -> None:
277
+ self.pre_image_tokens = list(pre_image_tokens)
278
+ self.post_image_tokens = list(post_image_tokens)
279
+ self.system_start_tokens = list(system_start_tokens)
280
+ self.system_end_tokens = list(system_end_tokens)
281
+ self.user_start_tokens = list(user_start_tokens)
282
+ self.user_end_tokens = list(user_end_tokens)
283
+ self.asst_start_tokens = list(asst_start_tokens)
284
+ self.asst_end_tokens = list(asst_end_tokens)
285
+ self._allow_system_prompt = allow_system_prompt
286
+ self.tokenizer = tokenizer
287
+ self._image_processor = None
288
+ self._pad_token = pad_token
289
+ self.bos_token = bos_token
290
+
291
+ @property
292
+ def image_processor(self) -> QwenImageProcessor:
293
+ assert self._image_processor is not None
294
+ return self._image_processor
295
+
296
+ def _process_content(
297
+ self,
298
+ message_content: MessageContent,
299
+ role: Literal["system", "user", "assistant"],
300
+ tokenized_messages: list[torch.Tensor],
301
+ insertion_points: list[int],
302
+ image_list: list[torch.Tensor | None],
303
+ token_count: int,
304
+ img_size: int | None = None,
305
+ **kwargs: Any,
306
+ ) -> int:
307
+ mapping = {
308
+ "user": (self.user_start_tokens, self.user_end_tokens),
309
+ "assistant": (self.asst_start_tokens, self.asst_end_tokens),
310
+ "system": (self.system_start_tokens, self.system_end_tokens),
311
+ }
312
+ if role.lower() not in mapping:
313
+ raise ValueError(f"Unknown role '{role}' encountered in messages.")
314
+ start_tokens, end_tokens = mapping[role.lower()]
315
+ # 1) Add the start tokens
316
+ if start_tokens:
317
+ tokenized_messages.append(torch.Tensor(start_tokens).flatten().to(torch.long))
318
+ token_count += len(start_tokens)
319
+ # 2) Process the message content one by one (potentially interleaved image and text)
320
+ for part in message_content:
321
+ elt_type = part["type"]
322
+ if elt_type == "image":
323
+ part = cast(ImageMessage, part)
324
+ self._process_image_message(
325
+ part,
326
+ tokenized_messages,
327
+ image_list,
328
+ img_size=img_size,
329
+ )
330
+ token_count += len(self.pre_image_tokens)
331
+ insertion_points.append(token_count)
332
+ token_count += len(self.post_image_tokens)
333
+ else:
334
+ part = cast(TextMessage, part)
335
+ self._process_text_message(
336
+ part["text"],
337
+ role=role,
338
+ token_list=tokenized_messages,
339
+ **kwargs,
340
+ )
341
+ token_count += tokenized_messages[-1].size(0)
342
+ # 3) Add the end tokens
343
+ if end_tokens:
344
+ tokenized_messages.append(torch.Tensor(end_tokens).flatten().to(torch.long))
345
+ token_count += len(end_tokens)
346
+ return token_count
347
+
348
+ def _process_text_message(
349
+ self,
350
+ message: str,
351
+ role: Literal["system", "user", "assistant"],
352
+ token_list: list[torch.Tensor],
353
+ **kwargs: Any,
354
+ ) -> None:
355
+ if role.lower() == "system" and not self._allow_system_prompt:
356
+ raise ValueError("System prompts are not allowed in this tokenizer configuration.")
357
+ tokens = self.tokenizer.encode(
358
+ message, add_special_tokens=False, return_tensors="pt", **kwargs
359
+ )
360
+ tokens = cast(torch.Tensor, tokens)
361
+ token_list.append(tokens.flatten().to(torch.long))
362
+
363
+ def _process_image_message(
364
+ self,
365
+ message: ImageMessage,
366
+ token_list: list[torch.Tensor],
367
+ image_list: list[torch.Tensor | None],
368
+ img_size: int | None = None,
369
+ ) -> None:
370
+ img = message["image"]
371
+ if img is None:
372
+ image_list.append(None)
373
+ else:
374
+ image_list.append(
375
+ self.image_processor.process_images(
376
+ self._load_image(img), img_size=img_size
377
+ ).squeeze(0)
378
+ )
379
+ if self.pre_image_tokens:
380
+ token_list.append(torch.Tensor(self.pre_image_tokens).flatten().to(torch.long))
381
+
382
+ if self.post_image_tokens:
383
+ token_list.append(torch.Tensor(self.post_image_tokens).flatten().to(torch.long))
384
+
385
+ def _load_image(self, image_path_or_image: str | Image.Image) -> Image.Image:
386
+ if isinstance(image_path_or_image, str):
387
+ return Image.open(image_path_or_image).convert("RGB")
388
+ return image_path_or_image
389
+
390
+ def _maybe_pad(self, tokens: torch.Tensor, pad_len: int, pad_value: int) -> torch.Tensor:
391
+ return torch.nn.functional.pad(
392
+ tokens,
393
+ (0, pad_len) if self.tokenizer.padding_side == "right" else (pad_len, 0),
394
+ value=pad_value,
395
+ )
396
+
397
+ def pad_tokenized_messages(
398
+ self,
399
+ tokenized_messages_batch: list[torch.Tensor],
400
+ image_insertion_points_batch: list[torch.Tensor] | None = None,
401
+ ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor] | None]:
402
+ max_len = max(len(x) for x in tokenized_messages_batch)
403
+ if image_insertion_points_batch is not None and self.tokenizer.padding_side == "left":
404
+ image_insertion_points_batch = [
405
+ x + max_len - len(tokenized_messages_batch[idx])
406
+ for idx, x in enumerate(image_insertion_points_batch)
407
+ ]
408
+ input_ids = torch.stack(
409
+ [
410
+ self._maybe_pad(s, max_len - s.size(0), self._pad_token)
411
+ for s in tokenized_messages_batch
412
+ ],
413
+ dim=0,
414
+ )
415
+ attention_mask = torch.stack(
416
+ [
417
+ self._maybe_pad(torch.ones_like(s), max_len - s.size(0), 0)
418
+ for s in tokenized_messages_batch
419
+ ],
420
+ dim=0,
421
+ )
422
+ return input_ids, attention_mask, image_insertion_points_batch
423
+
424
+ def tokenize_messages(
425
+ self,
426
+ messages: ProcessorInput,
427
+ suppress_bos_token: bool = False,
428
+ **kwargs: Any,
429
+ ) -> ProcessorOutput | None:
430
+ """Tokenize a batch of messages into token IDs suitable for Helium1 CASA model.
431
+
432
+ Args:
433
+ messages (list[list[dict[str, str]]] | list[dict[str, str]]): Batch of message lists (or single list of messages),
434
+ where each message is a list of dictionaries with 'role' and 'content' keys.
435
+ suppress_bos_token (bool, optional): If True, the beginning-of-sequence token will not be added.
438
+ Defaults to False.
439
+ **kwargs: Additional keyword arguments passed to the underlying encode method.
440
+ """
441
+ if not messages:
442
+ return None
443
+ if isinstance(messages[0], dict):
444
+ messages = [messages] # type: ignore[assignment]
445
+
446
+ messages = cast(list[list[Message]], messages)
447
+ image_insertion_points_batch = []
448
+ tokenized_messages_batch = []
449
+ image_list: list[torch.Tensor | None] = []
450
+ for msgs in messages:
451
+ # msgs.append({
452
+ # "role": "assistant",
453
+ # "content": [{"type": "text", "text": ""}]
454
+ # })
455
+ tokenized_messages = []
456
+ if not suppress_bos_token and self.bos_token is not None:
457
+ tokenized_messages.append(torch.tensor([self.bos_token], dtype=torch.long))
458
+ insertion_points = []
459
+ token_count = 0
460
+ for msg in msgs:
461
+ token_count = self._process_content(
462
+ msg["content"],
463
+ role=msg["role"],
464
+ tokenized_messages=tokenized_messages,
465
+ insertion_points=insertion_points,
466
+ image_list=image_list,
467
+ token_count=token_count,
468
+ **kwargs,
469
+ )
470
+ tokenized_messages_batch.append(torch.cat(tokenized_messages, dim=0).to(torch.long))
471
+ image_insertion_points_batch.append(torch.tensor(insertion_points, dtype=torch.long))
472
+
473
+ if msgs and self.asst_end_tokens and msgs[-1]["role"].lower() == "assistant":
474
+ # Remove the assistant end tokens from the final message
475
+ end_token_len = len(self.asst_end_tokens)
476
+ tokenized_messages_batch[-1] = tokenized_messages_batch[-1][:-end_token_len]
477
+ if msgs and self.asst_start_tokens and msgs[-1]["role"].lower() == "user":
478
+ # Append the assistant start tokens so the model continues as the assistant
479
+ end_token_len = len(self.asst_end_tokens)
480
+ tokenized_messages_batch[-1] = torch.cat(
481
+ [
482
+ tokenized_messages_batch[-1],
483
+ torch.Tensor(self.asst_start_tokens).to(torch.long),
484
+ ]
485
+ )
486
+
487
+ input_ids, attention_mask, image_embeds_insertion_points = self.pad_tokenized_messages(
488
+ tokenized_messages_batch, image_insertion_points_batch
489
+ )
490
+
491
+ if image_list:
492
+ assert sum(img is None for img in image_list) % len(image_list) == 0, (
493
+ "Either all or no image must be None."
494
+ )
495
+ pixel_values: None | torch.Tensor | list[torch.Tensor]
496
+ if image_list[0] is None:
497
+ pixel_values = None
498
+ else:
499
+ pixel_values = cast(list[torch.Tensor], image_list)
500
+ return ProcessorOutput(
501
+ input_ids=input_ids,
502
+ image_embeds_insertion_points=image_embeds_insertion_points,
503
+ attention_mask=attention_mask,
504
+ pixel_values=pixel_values,
505
+ )
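
`BaseProcessor.tokenize_messages` is the entry point of this file: it interleaves role start/end tokens, text tokens and image placeholder tokens, records the offsets at which image embeddings must later be inserted, pads the batch, and returns everything as a `ProcessorOutput`. A short sketch of the expected message schema and of the returned fields; the image path and prompt are placeholders, and `processor` is assumed to be a `QwenCASAProcessor` (defined in the next file):

from PIL import Image

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "image", "image": Image.open("example.jpg")},  # placeholder path
        {"type": "text", "text": "Describe the image."},
    ]},
]

out = processor.tokenize_messages(messages)
print(out["input_ids"].shape)                  # (batch, seq_len), padded with the pad token
print(out["attention_mask"].shape)             # (batch, seq_len), zeros over the padding
print(out["image_embeds_insertion_points"])    # one tensor of token offsets per sample
print([x.shape for x in out["pixel_values"]])  # one patchified pixel tensor per image
out = out.to("cuda")                           # ProcessorOutput.to also casts pixel_values to bfloat16 by default
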
processing_qwen2_5vl_casa.py ADDED
@@ -0,0 +1,39 @@
1
+ from typing import Any
2
+
3
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
4
+
5
+ from .processing import BaseProcessor, QwenImageProcessor
6
+
7
+
8
+ class QwenCASAProcessor(BaseProcessor):
9
+ attributes = ["tokenizer"]
10
+ tokenizer_class = "Qwen2Tokenizer"
11
+
12
+ def __init__(
13
+ self,
14
+ tokenizer: Qwen2Tokenizer,
15
+ pre_image_tokens: tuple[int, ...] = (151652,),
16
+ post_image_tokens: tuple[int, ...] = (151653,),
17
+ system_start_tokens: tuple[int, ...] = (151644, 8948, 198),
18
+ system_end_tokens: tuple[int, ...] = (151645, 198),
19
+ user_start_tokens: tuple[int, ...] = (151644, 872, 198),
20
+ user_end_tokens: tuple[int, ...] = (151645, 198),
21
+ asst_start_tokens: tuple[int, ...] = (151644, 77091, 198),
22
+ asst_end_tokens: tuple[int, ...] = (151645, 198),
23
+ image_size: int = 448,
24
+ **kwargs: Any,
25
+ ):
26
+ del kwargs
27
+ super().__init__(
28
+ tokenizer=tokenizer,
29
+ pre_image_tokens=pre_image_tokens,
30
+ post_image_tokens=post_image_tokens,
31
+ system_start_tokens=system_start_tokens,
32
+ system_end_tokens=system_end_tokens,
33
+ user_start_tokens=user_start_tokens,
34
+ user_end_tokens=user_end_tokens,
35
+ asst_start_tokens=asst_start_tokens,
36
+ asst_end_tokens=asst_end_tokens,
37
+ )
38
+
39
+ self._image_processor = QwenImageProcessor(img_size=image_size)
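
The subclass above only pins the Qwen2.5-VL special-token ids (151652/151653 are `<|vision_start|>`/`<|vision_end|>`, and the role headers use `<|im_start|>`/`<|im_end|>`) and attaches a `QwenImageProcessor`. A direct-construction sketch, assuming the tokenizer files of this repository load as a standard Qwen2 tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("kyutai/CASA-Qwen2_5-VL-3B-LiveCC")  # assumption: repo id
processor = QwenCASAProcessor(tokenizer=tok, image_size=448)
# The defaults above match processor_config.json, so no extra arguments are needed.
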
processor_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_qwen2_5vl_casa.QwenCASAProcessor"
4
+ },
5
+ "image_size": 448,
6
+ "post_image_tokens": [
7
+ 151653
8
+ ],
9
+ "pre_image_tokens": [
10
+ 151652
11
+ ],
12
+ "processor_class": "QwenCASAProcessor"
13
+ }
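
The `auto_map` entry above lets `AutoProcessor` resolve to the custom `QwenCASAProcessor` without importing the module manually; this requires `trust_remote_code=True`. A minimal sketch (the repository id is an assumption; any local directory containing these files works the same way):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("kyutai/CASA-Qwen2_5-VL-3B-LiveCC", trust_remote_code=True)
print(processor.pre_image_tokens, processor.post_image_tokens)  # [151652] [151653]
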
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|im_start|>",
183
+ "<|im_end|>",
184
+ "<|object_ref_start|>",
185
+ "<|object_ref_end|>",
186
+ "<|box_start|>",
187
+ "<|box_end|>",
188
+ "<|quad_start|>",
189
+ "<|quad_end|>",
190
+ "<|vision_start|>",
191
+ "<|vision_end|>",
192
+ "<|vision_pad|>",
193
+ "<|image_pad|>",
194
+ "<|video_pad|>"
195
+ ],
196
+ "bos_token": null,
197
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "model_max_length": 131072,
202
+ "pad_token": "<|endoftext|>",
203
+ "split_special_tokens": false,
204
+ "tokenizer_class": "Qwen2Tokenizer",
205
+ "unk_token": null,
206
+ "add_bos_token": false
207
+ }
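
The `chat_template` above is the standard Qwen2.5-VL chat format: an injected default system prompt, `<|im_start|>role ... <|im_end|>` turn markers, and `<|vision_start|><|image_pad|><|vision_end|>` placeholders for images. Note that the CASA processor in `processing.py` assembles its prompts directly from token ids rather than through this template; the sketch below only illustrates what the template itself renders (the message content is a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("kyutai/CASA-Qwen2_5-VL-3B-LiveCC")  # assumption: repo id
messages = [{"role": "user", "content": [
    {"type": "image", "image": "frame.png"},
    {"type": "text", "text": "What is shown here?"},
]}]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>What is shown here?<|im_end|>
# <|im_start|>assistant
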
utils.py ADDED
@@ -0,0 +1,116 @@
1
+ # pylint: disable=protected-access
2
+ """Utils to handle CASA layers construction"""
3
+
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, fields
6
+ from typing import Any, Callable, Generic, TypeVar
7
+
8
+ import torch
9
+
10
+
11
+ def delta_w_factory(
12
+ org_lin: torch.nn.Linear, new_lin: torch.nn.Linear
13
+ ) -> Callable[[torch.Tensor], torch.Tensor]:
14
+ """Factory for building linear op where the weights are the sum of two layers' weights"""
15
+
16
+ def _delta_w_fwd(input: torch.Tensor) -> torch.Tensor:
17
+ nonlocal org_lin, new_lin
18
+ bias = None if org_lin.bias is None else org_lin.bias + new_lin.bias
19
+ return torch.nn.functional.linear(input, org_lin.weight + new_lin.weight, bias)
20
+
21
+ return _delta_w_fwd
22
+
23
+
24
+ @dataclass
25
+ class StreamingState:
26
+ """Streaming State used by CASA layers at inference to save
27
+ e.g. the offset, the KV Cache and other persistent states"""
28
+
29
+ offset: int = 0
30
+
31
+ def _is_valid_field(self, key: str) -> bool:
32
+ return key in {x.name for x in fields(self)}
33
+
34
+ def _init_field(self, key: str) -> None:
35
+ """Init function for non-arggment dependent defauls"""
36
+ assert self._is_valid_field(key)
37
+ if key == "offset":
38
+ self.offset = 0
39
+ else:
40
+ # for fields which should be set explicitly and cannot be auto-initialized
41
+ setattr(self, key, None)
42
+
43
+ def init(self) -> None:
44
+ for key in [x.name for x in fields(self)]:
45
+ self._init_field(key)
46
+
47
+ def _reset_field(self, name: str) -> None:
48
+ """Resets the given field"""
49
+ self._init_field(name)
50
+
51
+ def reset(self) -> None:
52
+ for f in fields(self):
53
+ self._reset_field(f.name)
54
+
55
+ def _get_field(self, f: str) -> Any:
56
+ """Get field and init if not"""
57
+ assert self._is_valid_field(f)
58
+ if getattr(self, f) is None:
59
+ self._init_field(f)
60
+ return getattr(self, f)
61
+
62
+ def _set_field(self, f: str, value: Any) -> None:
63
+ assert self._is_valid_field(f)
64
+ setattr(self, f, value)
65
+
66
+
67
+ StreamingStateT = TypeVar("StreamingStateT", bound=StreamingState)
68
+
69
+
70
+ class StreamingModule(torch.nn.Module, Generic[StreamingStateT]): # pylint: disable=abstract-method
71
+ """Overrides Audiocraft's Streaming modules with additional small utils"""
72
+
73
+ def __init__(self, state_class: type) -> None:
74
+ torch.nn.Module.__init__(self)
75
+ self.is_streaming: bool = False
76
+ self.enable_viz: tuple[str, ...] = ()
77
+ self._streaming_state: StreamingStateT = state_class()
78
+
79
+ @property
80
+ def streaming_state(self) -> StreamingStateT:
81
+ return self._streaming_state
82
+
83
+ def _apply_named_streaming(self, fn: Callable):
84
+ """Apply function to all streaming modules"""
85
+ for name, module in self.named_modules():
86
+ if isinstance(module, StreamingModule):
87
+ fn(name, module)
88
+
89
+ def reset_streaming(self):
90
+ """Reset the streaming state."""
91
+
92
+ def _reset(_: str, module: StreamingModule):
93
+ module._streaming_state.reset()
94
+
95
+ self._apply_named_streaming(_reset)
96
+
97
+ def _set_streaming(self, streaming: bool, viz: tuple[str, ...] = ()):
98
+ """Set all streaming modules in streaming mode"""
99
+
100
+ def _set_streaming(_, module: StreamingModule) -> None:
101
+ module.is_streaming = streaming
102
+ module.enable_viz = viz
103
+ if streaming:
104
+ module.streaming_state.init()
105
+
106
+ self._apply_named_streaming(_set_streaming)
107
+
108
+ @contextmanager
109
+ def streaming(self, stream: bool = True, viz: tuple[str, ...] = ()):
110
+ """Context manager to enter streaming mode. Reset streaming state on exit."""
111
+ self._set_streaming(stream, viz)
112
+ try:
113
+ yield
114
+ finally:
115
+ self._set_streaming(False, ())
116
+ self.reset_streaming()
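
`StreamingModule` mirrors the Audiocraft-style streaming API: `_set_streaming` toggles every nested streaming submodule and re-initialises its `StreamingState`, and the `streaming()` context manager guarantees the state is reset on exit. A small self-contained sketch with a toy subclass that only counts timesteps (the subclass is illustrative and not part of this file; it assumes this module is importable as `utils`):

import torch

from utils import StreamingModule, StreamingState

class Counter(StreamingModule[StreamingState]):
    """Toy streaming module counting the timesteps seen while streaming."""

    def __init__(self) -> None:
        super().__init__(state_class=StreamingState)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.is_streaming:
            # Persist the number of timesteps seen across calls in the streaming state.
            self.streaming_state.offset += x.shape[-1]
        return x

m = Counter()
with m.streaming():
    m(torch.zeros(1, 4))
    m(torch.zeros(1, 3))
    print(m.streaming_state.offset)  # 7
print(m.is_streaming, m.streaming_state.offset)  # False 0 -- state reset on exit
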
vocab.json ADDED
The diff for this file is too large to render. See raw diff