Image-Text-to-Text · Transformers · Safetensors · English · Helium1_VL_2B · custom_code
ameroyer committed · verified · Commit 1126ea7 · 0 Parent(s):

Super-squash branch 'main' using huggingface_hub

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Notice ADDED
@@ -0,0 +1,2 @@
+ Helium1-VL-2B's image encoder is finetuned from the image encoder of Qwen2.5-VL-3B.
+ Qwen is licensed under the Qwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved.
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ language:
+ - en
+ base_model:
+ - kyutai/helium-1-2b
+ pipeline_tag: image-text-to-text
+ license: cc-by-nc-sa-4.0
+ datasets:
+ - HuggingFaceM4/FineVision
+ - mvp-lab/LLaVA-OneVision-1.5-Instruct-Data
+ ---
+ Please refer to the [main model card](https://huggingface.co/kyutai/CASA-Helium1-VL-2B) for more information and instructions on how to run the model.
+
+ This model page contains the weights of `Helium1-VL-2B`, an instruction-tuned Helium1-2B model further trained to handle visual inputs using a pretrained image encoder from Qwen2.5-VL.
+ It is released as part of our CASA model release; weights for all CASA models are provided in the associated model collection.
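
For convenience, a minimal loading sketch using the remote-code path declared in `config.json` (`auto_map`). The repository id, dtype choice, and the absence of a dedicated processor call are assumptions; the main model card linked above has the authoritative run instructions.

```python
# Minimal sketch (assumptions: repository id and remote-code loading path;
# see the main model card for the official run instructions).
import torch
from transformers import AutoConfig, AutoModel

repo_id = "kyutai/CASA-Helium1-VL-2B"  # hypothetical id, taken from the link above

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # the checkpoint is serialised in bfloat16
    trust_remote_code=True,      # resolves auto_map -> modeling_helium1_casa.V2Helium1
)
model.eval()
```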
__init__.py ADDED
File without changes
casa_attention.py ADDED
@@ -0,0 +1,1010 @@
1
+ """CASA layers"""
2
+
3
+ import bisect
4
+ from dataclasses import dataclass
5
+ from itertools import accumulate
6
+ from typing import TYPE_CHECKING, Callable, Literal, Sequence, TypedDict, overload
7
+ from typing import cast as type_cast
8
+
9
+ import torch
10
+ from transformers.configuration_utils import PretrainedConfig
11
+
12
+ from .utils import StreamingModule, StreamingState, delta_w_factory
13
+
14
+ if TYPE_CHECKING:
15
+ from transformers.configuration_utils import PretrainedConfig
16
+
17
+ try:
18
+ from flash_attn import flash_attn_varlen_func
19
+ except ImportError:
20
+ flash_attn_varlen_func = None # type: ignore
21
+
22
+
23
+ WindowsComputeKwargs = TypedDict(
24
+ "WindowsComputeKwargs",
25
+ {
26
+ "num_post_image_tokens": int,
27
+ "num_pre_image_tokens": int,
28
+ },
29
+ total=False,
30
+ )
31
+
32
+
33
+ def __split_n_merge__(
34
+ x: torch.Tensor,
35
+ sample_lengths: list[int],
36
+ padding_side: Literal["left", "right"] = "right",
37
+ pad_value: int | float | bool = 0,
38
+ ) -> torch.Tensor:
39
+ max_sample_length = max(sample_lengths)
40
+ pad_tuple = tuple(0 for _ in range((x.ndim - 1) * 2))
41
+ return torch.stack(
42
+ [
43
+ torch.nn.functional.pad(
44
+ _x,
45
+ pad_tuple + (0, max_sample_length - _x.shape[0])
46
+ if padding_side == "right"
47
+ else pad_tuple + (max_sample_length - _x.shape[0], 0),
48
+ value=pad_value,
49
+ )
50
+ for _x in torch.split(x, sample_lengths, dim=0)
51
+ ],
52
+ dim=0,
53
+ )
54
+
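
As a toy illustration of the helper above (a standalone sketch, not part of the file): splitting a flattened `(sum(lengths), D)` tensor back into a right-padded `(B, max_len, D)` batch.

```python
import torch

# Toy illustration of the split-and-pad behaviour: split along dim 0,
# then right-pad each chunk to the longest sample before stacking.
x = torch.arange(10, dtype=torch.float32).reshape(5, 2)  # flattened (5, D=2)
sample_lengths = [2, 3]

chunks = torch.split(x, sample_lengths, dim=0)
max_len = max(sample_lengths)
padded = torch.stack(
    [torch.nn.functional.pad(c, (0, 0, 0, max_len - c.shape[0])) for c in chunks]
)
print(padded.shape)  # torch.Size([2, 3, 2]); the last row of sample 0 is zero padding
```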
55
+
56
+ @overload
57
+ def insert_image_tokens(
58
+ inputs_embeds: torch.Tensor,
59
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
60
+ image_embeds_insertion_points: list[torch.Tensor],
61
+ recover_batch_dim: Literal[True],
62
+ attention_mask: torch.Tensor | None = None,
63
+ padding_side: Literal["left", "right"] = "right",
64
+ keep_only_attended: bool = False,
65
+ pad_output: int | float | bool = 0.0,
66
+ ) -> tuple[
67
+ torch.Tensor,
68
+ None,
69
+ torch.Tensor | None,
70
+ torch.Tensor,
71
+ ]: ...
72
+ @overload
73
+ def insert_image_tokens(
74
+ inputs_embeds: torch.Tensor,
75
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
76
+ image_embeds_insertion_points: list[torch.Tensor],
77
+ recover_batch_dim: Literal[False],
78
+ attention_mask: torch.Tensor | None = None,
79
+ padding_side: Literal["left", "right"] = "right",
80
+ keep_only_attended: bool = False,
81
+ pad_output: int | float | bool = 0.0,
82
+ ) -> tuple[
83
+ torch.Tensor,
84
+ list[int],
85
+ torch.Tensor | None,
86
+ torch.Tensor,
87
+ ]: ...
88
+ def insert_image_tokens(
89
+ inputs_embeds: torch.Tensor,
90
+ image_embeds: torch.Tensor | Sequence[torch.Tensor],
91
+ image_embeds_insertion_points: list[torch.Tensor],
92
+ recover_batch_dim: bool = True,
93
+ attention_mask: torch.Tensor | None = None,
94
+ padding_side: Literal["left", "right"] = "right",
95
+ keep_only_attended: bool = False,
96
+ pad_output: int | float | bool = 0.0,
97
+ ) -> tuple[
98
+ torch.Tensor | torch.Tensor,
99
+ list[int] | None,
100
+ torch.Tensor | torch.Tensor | None,
101
+ torch.Tensor | torch.Tensor,
102
+ ]:
103
+ """
104
+ Insert image embeddings into text embeddings
105
+
106
+ Args:
+ inputs_embeds (torch.Tensor): (B, S, D) input token embeddings.
+ image_embeds (torch.Tensor | list[torch.Tensor]): (N_images, Nt, D) | List[(Nt, D)] image token embeddings.
+ image_embeds_insertion_points (list[torch.Tensor]): Insertion indices.
+ recover_batch_dim (bool): Whether to return a batched (padded) fused sequence or a flattened one together with per-sample lengths.
+ attention_mask (torch.Tensor, optional): (B, S) attention mask.
+ padding_side (Literal["left", "right"]): Padding scheme. Controls behavior for padded images.
+ keep_only_attended (bool): Only applicable when recover_batch_dim is False; whether to
+ remove any non-attended tokens from the flattened array. In this case, the attention
+ mask returned is **still the original one**, so we can remember which indices have been
+ removed.
+ Returns:
+ output (torch.Tensor): (B, S + Ni * Nt, D) fused sequence, flattened to (sum of sample lengths, D) when recover_batch_dim is False.
+ sample_lengths (list[int] | None): Per-sample lengths (text + image tokens) of the fused sequence.
+ attention_mask (torch.Tensor): Same shape as the fused sequence (without D); 1 for real tokens, 0 for image and text padding.
+ image_tokens_mask (torch.Tensor): (B, S + Ni * Nt, 1), marks image token positions.
+ """
123
+ if isinstance(image_embeds, list) and len(image_embeds) == 0:
124
+ batch_size, text_seq_length, token_dim = inputs_embeds.shape
125
+ if recover_batch_dim:
126
+ return (
127
+ inputs_embeds,
128
+ None,
129
+ attention_mask,
130
+ torch.zeros((batch_size, text_seq_length, 1), dtype=torch.bool),
131
+ )
132
+ else:
133
+ flattened_seq_length = inputs_embeds.shape[0] * inputs_embeds.shape[1]
134
+ return (
135
+ torch.reshape(inputs_embeds, (flattened_seq_length, inputs_embeds.shape[2])),
136
+ [text_seq_length] * inputs_embeds.shape[0],
137
+ attention_mask.flatten() if attention_mask is not None else None,
138
+ torch.zeros((flattened_seq_length, 1), dtype=torch.bool),
139
+ )
140
+
141
+ # Sanity checks
142
+ if isinstance(image_embeds, torch.Tensor):
143
+ assert inputs_embeds.shape[-1] == image_embeds.shape[-1]
144
+ else:
145
+ assert all(inputs_embeds.shape[-1] == _x.shape[-1] for _x in image_embeds)
146
+
147
+ batch_size, text_seq_length, token_dim = inputs_embeds.shape
148
+ image_seq_length = [x.shape[0] for x in image_embeds]
149
+
150
+ # Flatten insertion points
151
+ insertion_offset = []
152
+ counter, offset_from_text, offset_from_image = 0, 0, 0
153
+ for sample in image_embeds_insertion_points:
154
+ for pt in sample:
155
+ insertion_offset.append(pt + offset_from_image + offset_from_text)
156
+ offset_from_image += image_seq_length[counter]
157
+ counter += 1
158
+ offset_from_text += text_seq_length
159
+ image_insert_positions = [
160
+ x for idx, pt in enumerate(insertion_offset) for x in range(pt, pt + image_seq_length[idx])
161
+ ]
162
+
163
+ # Flatten image embeds
164
+ if isinstance(image_embeds, list):
165
+ image_embeds = torch.cat(image_embeds, dim=0)
166
+ else:
167
+ image_embeds = type_cast(torch.Tensor, image_embeds)
168
+ image_embeds = torch.reshape(image_embeds, (-1, token_dim))
169
+
170
+ # Flatten text embeds across batch dim (B x S, D)
171
+ inputs_embeds = torch.reshape(inputs_embeds, (-1, token_dim))
172
+ flattened_seq_length = inputs_embeds.shape[0] + sum(image_seq_length)
173
+ text_insert_positions = sorted(
174
+ set(range(flattened_seq_length)).difference(set(image_insert_positions))
175
+ )
176
+
177
+ # Scatter image embeds into the flattened output tensor
+ # scatter text-related values
179
+ output = torch.empty(
180
+ (flattened_seq_length, token_dim),
181
+ device=inputs_embeds.device,
182
+ dtype=inputs_embeds.dtype,
183
+ )
184
+ txt_positions_tensor = torch.Tensor(text_insert_positions).to(
185
+ dtype=torch.long, device=inputs_embeds.device
186
+ )
187
+ output.scatter_(0, txt_positions_tensor[:, None].expand(-1, token_dim), inputs_embeds)
188
+ attention_mask_new: torch.Tensor | None = None
189
+ if attention_mask is not None:
190
+ attention_mask_new = torch.ones(
191
+ (flattened_seq_length,), dtype=torch.bool, device=inputs_embeds.device
192
+ )
193
+ attention_mask_new.scatter_(
194
+ 0, txt_positions_tensor, attention_mask.flatten().to(torch.bool)
195
+ )
196
+
197
+ # scatter image related stuff
198
+ image_tokens_mask = torch.zeros(
199
+ (flattened_seq_length,), dtype=torch.bool, device=inputs_embeds.device
200
+ )
201
+ img_positions_tensor = torch.Tensor(image_insert_positions).to(
202
+ device=inputs_embeds.device, dtype=torch.long
203
+ )
204
+ output.scatter_(0, img_positions_tensor[:, None].expand(-1, token_dim), image_embeds)
205
+ image_tokens_mask.scatter_(0, img_positions_tensor, True)
206
+
207
+ # Compute expected sample length, taking into account the real batch
208
+ # i.e. recover the batch dimension of image embeddings
209
+ sample_lengths = []
210
+ counter = 0
211
+ for sample_idx, pts in enumerate(image_embeds_insertion_points):
212
+ num_image_tokens = 0
213
+ for _ in pts:
214
+ num_image_tokens += image_seq_length[counter]
215
+ counter += 1
216
+ if keep_only_attended and attention_mask is not None:
217
+ attended_seq_length = torch.sum(attention_mask[sample_idx]).cpu().item()
218
+ sample_lengths.append(attended_seq_length + num_image_tokens)
219
+ else:
220
+ sample_lengths.append(text_seq_length + num_image_tokens)
221
+
222
+ # For CASA attention, we can keep everything flattened and return
+ # the sample_lengths for the blockwise attention
224
+ if not recover_batch_dim:
225
+ if keep_only_attended and attention_mask_new is not None:
226
+ output = output[attention_mask_new]
227
+ image_tokens_mask = image_tokens_mask[attention_mask_new]
228
+ return output, sample_lengths, attention_mask_new, image_tokens_mask[..., None]
229
+
230
+ # Otherwise, time to (pad) and reshape
231
+ # Easy case: everything has the same length
232
+ if all(x == sample_lengths[0] for x in sample_lengths):
233
+ output = torch.reshape(output, (batch_size, sample_lengths[0], token_dim))
234
+ image_tokens_mask = torch.reshape(image_tokens_mask, (batch_size, sample_lengths[0], 1))
235
+ if attention_mask_new is not None:
236
+ attention_mask_new = torch.reshape(attention_mask_new, (batch_size, sample_lengths[0]))
237
+ # if there is any size mismatch we break into a
238
+ # list and pad again
239
+ else:
240
+ # split and merge
241
+ output = __split_n_merge__(output, sample_lengths, padding_side, pad_value=pad_output)
242
+ # note that the extra padding tokens are also marked as image tokens to be removed later
243
+ image_tokens_mask = __split_n_merge__(
244
+ image_tokens_mask, sample_lengths, padding_side, True
245
+ )[:, :, None]
246
+ if attention_mask_new is not None:
247
+ attention_mask_new = __split_n_merge__(
248
+ attention_mask_new, sample_lengths, padding_side, 0
249
+ )
250
+ # Return
251
+ return output, sample_lengths, attention_mask_new, image_tokens_mask
252
+
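
A standalone sketch of the interleaving performed by `insert_image_tokens` (toy sizes, not part of the file): one text sequence of length 4 with a single 2-token image inserted at text position 2.

```python
import torch

text = torch.arange(4).float().unsqueeze(-1)  # (4, 1) stand-in for text embeddings
image = torch.full((2, 1), -1.0)              # (2, 1) stand-in for image embeddings
insertion_point = 2                            # image tokens go before text token 2

total_len = text.shape[0] + image.shape[0]     # 6
image_pos = list(range(insertion_point, insertion_point + image.shape[0]))  # [2, 3]
text_pos = sorted(set(range(total_len)) - set(image_pos))                   # [0, 1, 4, 5]

fused = torch.empty(total_len, 1)
fused[torch.tensor(text_pos)] = text
fused[torch.tensor(image_pos)] = image
print(fused.squeeze(-1))  # tensor([ 0.,  1., -1., -1.,  2.,  3.])
```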
253
+
254
+ def get_sample_lengths_from_insertion_points(
255
+ image_embeds_insertion_points: list[torch.Tensor],
256
+ image_embeds: torch.Tensor | list[torch.Tensor] | None,
257
+ total_seq_len: int | None = None,
258
+ attention_mask: torch.Tensor | None = None,
259
+ **kwargs: WindowsComputeKwargs,
260
+ ) -> tuple[list[tuple[int, bool]], list[int]]:
261
+ """Compute sample lengths as if each image insertion point defines a
262
+ new document (ex document ID)
263
+ """
264
+ num_post_image_tokens = type_cast(int, kwargs.get("num_post_image_tokens", 0))
265
+ num_pre_image_tokens = type_cast(int, kwargs.get("num_pre_image_tokens", 0))
266
+ squashed_samples_lengths = type_cast(
267
+ list[list[int]] | None, kwargs.get("squashed_samples_lengths", None)
268
+ )
269
+ if squashed_samples_lengths is not None:
270
+ assert len(squashed_samples_lengths) == len(image_embeds_insertion_points)
271
+
272
+ def __insert_next_sample__(
273
+ batch_idx: int, insrt_pt: int, last_insrt_pt: int, end_of_batch_sample: bool = False
274
+ ) -> None:
275
+ nonlocal attention_mask
276
+ nonlocal text_sample_lengths, full_sample_lengths
277
+ nonlocal cum_samples_lengths, current_image_offset
278
+ nonlocal last_image_idx, current_image_idx, current_length
279
+ # Add the sample between [last_insrt_pt, insrt_pt] with breaks in
280
+ # between any squashed samples we find on the way
281
+ start_pt = bisect.bisect_left(cum_samples_lengths, last_insrt_pt)
282
+ added_sample = False
283
+ for end_of_sample in cum_samples_lengths[start_pt:]:
284
+ # we will break the loop at the end when end_of_sample = insrt_pt
285
+ end_of_sample = min(end_of_sample, insrt_pt)
286
+
287
+ # Add between [last_insrt_pt, end_of_sample]
288
+ current_length = end_of_sample - last_insrt_pt
289
+ if attention_mask is not None:
290
+ current_length -= int(
291
+ torch.sum(~attention_mask[batch_idx, last_insrt_pt:end_of_sample]).item()
292
+ )
293
+ if current_length > 0:
294
+ added_sample = True
295
+ text_sample_lengths.append(
296
+ (current_length, end_of_batch_sample and insrt_pt == end_of_sample)
297
+ )
298
+ # add image tokens to current_length
299
+ if current_image_idx > 0 and image_embeds is not None:
300
+ images_in_sample = [
301
+ img_idx
302
+ for img_idx in range(last_image_idx, current_image_idx)
303
+ if img_idx < len(image_embeds_insertion_points[batch_idx])
304
+ and last_insrt_pt
305
+ <= image_embeds_insertion_points[batch_idx][img_idx]
306
+ < end_of_sample
307
+ ]
308
+ if len(images_in_sample) > 0:
309
+ num_image_tokens = sum(
310
+ _x.shape[0]
311
+ for _x in image_embeds[
312
+ current_image_offset + images_in_sample[0] : current_image_offset
313
+ + images_in_sample[-1]
314
+ + 1
315
+ ]
316
+ )
317
+ current_length += num_image_tokens
318
+ full_sample_lengths.append(current_length)
319
+
320
+ # prepare for next loop
321
+ last_insrt_pt = end_of_sample
322
+ if end_of_sample == insrt_pt:
323
+ break
324
+ # End of loop: catch a weird edge case where we may end up on a span
+ # full of padding tokens which does not get added due to the current_length > 0 check
+ if end_of_batch_sample:
+ assert added_sample, "Weird edge case. Don't do that, thank you"
+ text_sample_lengths[-1] = (text_sample_lengths[-1][0], True)
335
+
336
+ current_image_offset = 0
337
+ text_sample_lengths, full_sample_lengths = [], []
338
+ cum_samples_lengths: list[int] = []
339
+ current_length, last_insrt_pt, last_image_idx, current_image_idx = 0, 0, 0, 0
340
+ for batch_idx, pts in enumerate(image_embeds_insertion_points):
341
+ if squashed_samples_lengths is not None:
342
+ cum_samples_lengths = list(accumulate(squashed_samples_lengths[batch_idx]))
343
+ else:
344
+ assert total_seq_len is not None
345
+ cum_samples_lengths = [total_seq_len]
346
+
347
+ for current_image_idx, insrt_pt in enumerate(pts.cpu().tolist()):
348
+ # check if the images are consecutive, in which case we want
+ # them to belong to the same window
350
+ if current_image_idx >= 1 and insrt_pt == (
351
+ image_embeds_insertion_points[batch_idx][current_image_idx - 1]
352
+ + num_pre_image_tokens
353
+ + num_post_image_tokens
354
+ ):
355
+ continue
356
+ # Otherwise, we found a new sample
357
+ # not very important but for completeness: the insertion points come *after*
358
+ # the pre-image tokens per design but for the document-id mask it is more consistent to
359
+ # have them correspond to the same image
360
+ insrt_pt -= num_pre_image_tokens
361
+
362
+ # Update text and full sample lengths
363
+ if insrt_pt > last_insrt_pt:
364
+ __insert_next_sample__(
365
+ batch_idx, insrt_pt, last_insrt_pt, end_of_batch_sample=False
366
+ )
367
+ last_image_idx = current_image_idx
368
+ last_insrt_pt = insrt_pt
369
+
370
+ # End of batch: add sample in progress and reset
371
+ current_image_idx += 1
372
+ if cum_samples_lengths[-1] > last_insrt_pt:
373
+ __insert_next_sample__(
374
+ batch_idx, cum_samples_lengths[-1], last_insrt_pt, end_of_batch_sample=True
375
+ )
376
+ current_length, last_insrt_pt, last_image_idx, current_image_idx = 0, 0, 0, 0
377
+ current_image_offset += len(pts)
378
+
379
+ # Sanity check that the is_eob flags are correctly placed
380
+ assert sum(_x[1] for _x in text_sample_lengths) == len(image_embeds_insertion_points), (
381
+ f"Number of eob markers ({sum(_x[1] for _x in text_sample_lengths)}) differs"
382
+ f" from original batch size ({len(image_embeds_insertion_points)})"
383
+ )
384
+ return text_sample_lengths, full_sample_lengths
385
+
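
A small worked example of the windowing above (assuming the file is importable locally, no pre/post image tokens, and no padding): a single 10-token text sequence with two 3-token images inserted at positions 2 and 6. Each insertion point opens a new window, and each image is counted in the window that starts at its insertion point.

```python
import torch
from casa_attention import get_sample_lengths_from_insertion_points  # this file

text_lens, full_lens = get_sample_lengths_from_insertion_points(
    image_embeds_insertion_points=[torch.tensor([2, 6])],
    image_embeds=[torch.zeros(3, 8), torch.zeros(3, 8)],  # two 3-token images
    total_seq_len=10,
)
# Expected windows:
#   [0, 2)            -> text 2, full 2
#   [2, 6)  + image 0 -> text 4, full 4 + 3 = 7
#   [6, 10) + image 1 -> text 4, full 4 + 3 = 7   (end-of-batch flag set)
# i.e. text_lens == [(2, False), (4, False), (4, True)] and full_lens == [2, 7, 7]
```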
386
+
387
+ class CASAAttentionHandler:
388
+ def __init__(
389
+ self,
390
+ inputs_embeds: torch.Tensor,
391
+ image_embeds: torch.Tensor | list[torch.Tensor],
392
+ image_embeds_insertion_points: list[torch.Tensor],
393
+ attention_mask: torch.Tensor | None = None,
394
+ rope_fn: Callable | None = None,
395
+ windows: Literal["batch", "squashed", "images", "turn_based"] = "images",
396
+ use_asymetric_q_kv: bool = True,
397
+ casa_windows_info: None | dict = None,
398
+ ):
399
+ """Initialize the structure holding the query buffer for CASA attention layers
+ (i.e. the **flattened** text + inserted image tokens).
+ Note that this structure is shared across all CASA layers and gets updated
+ with the current hidden states at every layer; it is merely a buffer to keep
+ scatter_ operations in-place as much as possible.
+
+ In this module, the embedding-related values (image_tokens_mask,
+ text_sample_lengths, etc.) are stored under the assumption of a tensor
+ which is *flattened* and *without padding tokens*.
+ Only the attention mask is kept as-is (text-only, batched, padded) to
+ be able to recover the original shapes when needed.
+ """
411
+ super().__init__()
412
+ assert windows == "images" # for inference code release
413
+ # Note 1: Unless overridden, text/full_sample_lengths are defined such that one
+ # document = one sample in the batch
415
+ if attention_mask is None:
416
+ text_sample_lengths = [(_x.shape[0], True) for _x in inputs_embeds]
417
+ else:
418
+ text_sample_lengths = [(int(torch.sum(_x).item()), True) for _x in attention_mask]
419
+ (
420
+ full_inputs_embeds,
421
+ full_sample_lengths,
422
+ # Full attention mask is only needed at inference to
423
+ # flatten the KV-Cache and remove padding tokens
424
+ _,
425
+ self.image_tokens_mask,
426
+ ) = insert_image_tokens(
427
+ inputs_embeds=inputs_embeds,
428
+ image_embeds=image_embeds,
429
+ image_embeds_insertion_points=image_embeds_insertion_points,
430
+ attention_mask=attention_mask,
431
+ recover_batch_dim=False,
432
+ keep_only_attended=attention_mask is not None,
433
+ )
434
+ assert self.image_tokens_mask.ndim == 2
435
+ self.image_embeds = image_embeds
436
+ self.image_embeds_insertion_points = image_embeds_insertion_points
437
+ self.attention_mask = None if attention_mask is None else attention_mask.bool()
438
+ self.use_asymetric_qkv = use_asymetric_q_kv
439
+ # At inference, we have to use asymmetric QKV for efficiency
440
+ if self.attention_mask is not None:
441
+ self.use_asymetric_qkv = True
442
+
443
+ # Build CASA windows
444
+ assert casa_windows_info is not None
445
+ text_sample_lengths, full_sample_lengths = get_sample_lengths_from_insertion_points(
446
+ image_embeds_insertion_points=image_embeds_insertion_points,
447
+ image_embeds=image_embeds,
448
+ total_seq_len=inputs_embeds.shape[1],
449
+ attention_mask=self.attention_mask,
450
+ **casa_windows_info, # pyright: ignore
451
+ )
452
+
453
+ # Sanity checks on the sample lengths
454
+ self.text_sample_lengths = [(int(s), eob) for s, eob in text_sample_lengths if s > 0]
455
+ self.full_sample_lengths = [int(s) for s in full_sample_lengths if s > 0]
456
+
457
+ assert len(self.text_sample_lengths) == len(self.full_sample_lengths), (
458
+ f"Sanity check failed; text sample lengths {len(self.text_sample_lengths)}"
459
+ f" != full sample lengths {len(self.full_sample_lengths)}"
460
+ )
461
+ if self.attention_mask is None:
462
+ num_unpadded_text_tokens = inputs_embeds.shape[0] * inputs_embeds.shape[1]
463
+ else:
464
+ num_unpadded_text_tokens = int(
465
+ torch.sum(type_cast(torch.Tensor, attention_mask)).item()
466
+ )
467
+ assert sum(_x[0] for _x in self.text_sample_lengths) == num_unpadded_text_tokens, (
468
+ f"Sanity check failed; text sample lengths {sum(x[0] for x in self.text_sample_lengths)} != {num_unpadded_text_tokens}"
469
+ )
470
+ assert sum(self.full_sample_lengths) == full_inputs_embeds.shape[0], (
471
+ f"Sanity check failed; sample lengths {sum(self.full_sample_lengths)} != {full_inputs_embeds.shape[0]}"
472
+ )
473
+
474
+ # Finally we can compute cu_seqlen based on sample lengths
475
+ self.max_seqlen_q = max(self.text_sample_lengths)[0]
476
+ self.cu_seqlens_q = self.get_cu_seqlens(
477
+ [x[0] for x in self.text_sample_lengths], device=inputs_embeds.device
478
+ )
479
+
480
+ self.max_seqlen_kv = max(self.full_sample_lengths)
481
+ self.cu_seqlens_kv = self.get_cu_seqlens(
482
+ self.full_sample_lengths, device=inputs_embeds.device
483
+ )
484
+
485
+ # For inference: We save the length of the current document
486
+ # to trim the KV cache appropriately
487
+ self.current_doc_lengths = self.full_sample_lengths
488
+
489
+ # Precompute position embeddings
490
+ self.position_embeds = None
491
+ self.rope_fn = rope_fn
492
+ if self.rope_fn is not None:
493
+ self.position_embeds = self.compute_position_embeddings(
494
+ self.rope_fn, full_sample_lengths, dummy_for_dtype_and_device=full_inputs_embeds
495
+ )
496
+
497
+ @property
498
+ def batch_lengths(self) -> list[int]:
499
+ """Return a (batch_size,) list of integers containing the
500
+ number of (non-padded) text tokens for each sample in the batch"""
501
+ bls = [0]
502
+ for ln, eob in self.text_sample_lengths:
503
+ bls[-1] += ln
504
+ if eob:
505
+ bls.append(0)
506
+ return bls[:-1]
507
+
508
+ @property
509
+ def full_batch_lengths(self) -> list[int]:
510
+ """Same as batch_lengths for text+image tokens"""
511
+ bls = [0]
512
+ for (_, eob), ln in zip(self.text_sample_lengths, self.full_sample_lengths):
513
+ bls[-1] += ln
514
+ if eob:
515
+ bls.append(0)
516
+ return bls[:-1]
517
+
518
+ def get_cu_seqlens(
519
+ self, sample_lengths: list[int], device: torch.device | None
520
+ ) -> torch.Tensor:
521
+ """Compute cu_seqlens from the given sample_lengths"""
522
+ return torch.Tensor(list(accumulate(sample_lengths, initial=0))).to(
523
+ dtype=torch.int32, device=device
524
+ )
525
+
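
The `cu_seqlens` layout built here follows the `flash_attn_varlen_func` convention: cumulative sample lengths with a leading zero, so document `i` spans `[cu[i], cu[i+1])` in the flattened batch. A quick illustration:

```python
import torch
from itertools import accumulate

sample_lengths = [2, 7, 7]
cu_seqlens = torch.tensor(list(accumulate(sample_lengths, initial=0)), dtype=torch.int32)
print(cu_seqlens)  # tensor([ 0,  2,  9, 16], dtype=torch.int32)
```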
526
+ def compute_position_embeddings(
527
+ self,
528
+ rope_fn: Callable,
529
+ sample_lengths: list[int],
530
+ dummy_for_dtype_and_device: torch.Tensor,
531
+ ) -> tuple[torch.Tensor, torch.Tensor]:
532
+ """Compute info required for position embeddings. Can be overridden, e.g. for Qwen."""
533
+ # option 1: Standard range
534
+ # position_ids = torch.arange(0, full_inputs_embeds.shape[0])
535
+ # option 2: Follows document boundary
536
+ position_ids = torch.cat([torch.arange(0, lg) for lg in sample_lengths], dim=0)
537
+ return rope_fn(
538
+ dummy_for_dtype_and_device,
539
+ position_ids.to(dummy_for_dtype_and_device.device)[None, ...],
540
+ )
541
+
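
"Option 2" above restarts position ids at every document boundary, so RoPE treats each (text + image) window as its own sequence. For the sample lengths `[2, 7, 7]`:

```python
import torch

sample_lengths = [2, 7, 7]
position_ids = torch.cat([torch.arange(0, lg) for lg in sample_lengths], dim=0)
print(position_ids)
# tensor([0, 1, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6])
```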
542
+ def get_position_embedding(
543
+ self,
544
+ key: Literal["q", "kv"],
545
+ num_queries: int = 0,
546
+ ) -> tuple[torch.Tensor, torch.Tensor] | None:
547
+ if self.position_embeds is None:
548
+ return None
549
+ cos, sin = self.position_embeds
550
+ bls = self.full_batch_lengths
551
+ # For Q, we only want the text-only posembeds
552
+ if key == "q" and self.use_asymetric_qkv:
553
+ bls = self.batch_lengths
554
+ cos, sin = cos[:, ~self.image_tokens_mask[:, 0]], sin[:, ~self.image_tokens_mask[:, 0]]
555
+ elif key not in {"q", "kv"}:
556
+ raise ValueError(f"Unknown position embedding key: {key}")
557
+
558
+ # Easy case: training or first step at inference: we use all the posembeds
559
+ if num_queries == 0:
560
+ return cos, sin
561
+ # If num queries is given, we need to trim for *every sample in the batch*
562
+ cos = [x[:, -num_queries:] for x in torch.split(cos, bls, dim=1)]
563
+ sin = [x[:, -num_queries:] for x in torch.split(sin, bls, dim=1)]
564
+ return torch.cat(cos, dim=1), torch.cat(sin, dim=1)
565
+
566
+ def get_full_embeds(
567
+ self, hidden_states: torch.Tensor, norm_fn: Callable | None
568
+ ) -> torch.Tensor:
569
+ """Update attended hidden states in the current query buffer
570
+
571
+ :param hidden_states: (b, s, d) Tensor input to the CASA attention layer
572
+ """
573
+ assert self.image_embeds is not None
574
+ return insert_image_tokens(
575
+ inputs_embeds=hidden_states,
576
+ image_embeds=self.image_embeds
577
+ if norm_fn is None
578
+ else norm_fn(self.image_embeds)
579
+ if isinstance(self.image_embeds, torch.Tensor)
580
+ else [norm_fn(_x) for _x in self.image_embeds],
581
+ image_embeds_insertion_points=self.image_embeds_insertion_points,
582
+ attention_mask=self.attention_mask,
583
+ recover_batch_dim=False,
584
+ keep_only_attended=self.attention_mask is not None,
585
+ )[0][None, :, :]
586
+
587
+ def recover_text_embeds(
588
+ self,
589
+ hidden_states_out: torch.Tensor,
590
+ hidden_states_in: torch.Tensor,
591
+ update_image_embeddings: bool = False,
592
+ ) -> torch.Tensor:
593
+ """Returns text embeddings from the query buffer, including non-attended tokens at inference"""
594
+ if update_image_embeddings and not self.use_asymetric_qkv:
595
+ raise NotImplementedError("Implement image embeddings updates for asymetric QKV")
596
+ # Remove image tokens in the symmetric case
597
+ if not self.use_asymetric_qkv:
598
+ hidden_states_out = hidden_states_out[~self.image_tokens_mask[:, 0]]
599
+
600
+ # if there is no attention mask, we are in the right-padded case
+ # (keep_only_attended = False) and we can directly return the query
+ # outputs (which don't contain the image tokens)
603
+ if self.attention_mask is None:
604
+ return hidden_states_out
605
+
606
+ # Otherwise, we need to "scatter" back only the text-attended tokens to the original
607
+ # hidden states, which contain the paddings
608
+ num_queries = hidden_states_in.shape[1]
609
+
610
+ # Case 1: the padded hidden_states_in is larger than hidden_states_out
611
+ # we rebatch+pad hidden_state_out before doing the scattering
612
+ if hidden_states_out.shape[0] != hidden_states_in.shape[0] * hidden_states_in.shape[1]:
613
+ s = torch.split(hidden_states_out, self.batch_lengths, dim=0)
614
+ assert max(_s.shape[0] for _s in s) <= num_queries # sanity check
615
+ s = [
616
+ torch.nn.functional.pad(_s, (0, 0, num_queries - _s.shape[0], 0), value=0)
617
+ for _s in s
618
+ ]
619
+ return torch.where(
620
+ self.attention_mask[:, -num_queries:, None],
621
+ torch.stack(s),
622
+ hidden_states_in,
623
+ )
624
+ # If both have the same shape, it means hidden_states_in contained no padding
+ # so we can directly return hidden_states_out
626
+ return hidden_states_out
627
+
628
+ def extend(self, num_tokens: int, offset: int = 0):
629
+ """Extend all necessary values of the Handler for inference.
+ Note: this implementation currently assumes a single conversation at a time
+ (otherwise the image tokens mask would have to change) and that the added tokens are
+ attended to"""
633
+ # image embeds is inserted in the first step and stored in the KV cache
634
+ self.image_embeds = None
635
+
636
+ # Update attention mask (non-flattened) (assumes all new tokens are attended to)
637
+ if self.attention_mask is not None:
638
+ self.attention_mask = torch.nn.functional.pad(
639
+ self.attention_mask, (0, num_tokens), value=1
640
+ )
641
+
642
+ # Update image token mask (assumes only one image/conversation
643
+ # is started at once so that we always extend by zero)
644
+ # Note that the mask is stored flattened to avoid padding so we have to
645
+ # do something a bit ugly and inefficient here
646
+ imtokmask = torch.split(self.image_tokens_mask, self.full_batch_lengths, dim=0)
647
+ imtokmask = [torch.nn.functional.pad(x, (0, 0, 0, num_tokens), value=0) for x in imtokmask]
648
+ self.image_tokens_mask = torch.cat(imtokmask, dim=0)
649
+
650
+ # Recompute cumulative document lengths after assigning the new
651
+ # number of tokens to each sample in the batch
652
+ for idx, (ln, is_eob) in enumerate(self.text_sample_lengths):
653
+ if is_eob:
654
+ self.text_sample_lengths[idx] = (num_tokens + ln, is_eob)
655
+ self.full_sample_lengths[idx] += num_tokens
656
+
657
+ # Recompute cu sequlen
658
+ # First step: Technically this never occurs, but we keep it for completeness
659
+ if offset == 0:
660
+ self.max_seqlen_q = max(self.text_sample_lengths)[0]
661
+ self.cu_seqlens_q = self.get_cu_seqlens(
662
+ [x[0] for x in self.text_sample_lengths], device=self.cu_seqlens_q.device
663
+ )
664
+
665
+ self.max_seqlen_kv = max(self.full_sample_lengths)
666
+ self.cu_seqlens_kv = self.get_cu_seqlens(
667
+ self.full_sample_lengths, device=self.cu_seqlens_kv.device
668
+ )
669
+ # Step > 0: the annoying part is that, since flash_attn_varlen does not accept
+ # 0-length documents, we need to remove documents from the KV cache once they're past
+ # their windows. In our current setting, this means we only want to keep the latest
+ # documents
673
+ else:
674
+ self.max_seqlen_q = num_tokens
675
+ self.cu_seqlens_q = self.get_cu_seqlens(
676
+ [num_tokens for (_, eob) in self.text_sample_lengths if eob],
677
+ device=self.cu_seqlens_q.device,
678
+ )
679
+
680
+ final_doc_lengths = [
681
+ ln
682
+ for (_, eob), ln in zip(self.text_sample_lengths, self.full_sample_lengths)
683
+ if eob
684
+ ]
685
+ self.current_doc_lengths = final_doc_lengths
686
+ self.max_seqlen_kv = max(self.current_doc_lengths)
687
+ self.cu_seqlens_kv = self.get_cu_seqlens(
688
+ final_doc_lengths,
689
+ device=self.cu_seqlens_kv.device,
690
+ )
691
+ # Update position embeddings
692
+ if self.rope_fn is not None and self.position_embeds is not None:
693
+ self.position_embeds = self.compute_position_embeddings(
694
+ self.rope_fn,
695
+ self.full_sample_lengths,
696
+ dummy_for_dtype_and_device=self.position_embeds[0],
697
+ )
698
+
699
+
700
+ @dataclass
701
+ class CASAAttentionStreamingState(StreamingState):
702
+ """Streaming state for the CASA attention module. Keeps the cached key/value states and a reference to the shared CASA handler."""
703
+
704
+ k: torch.Tensor = None # pyright: ignore[reportAssignmentType]
705
+ v: torch.Tensor = None # pyright: ignore[reportAssignmentType]
706
+ recover_batched_trims: list[int] = None # pyright: ignore[reportAssignmentType]
707
+ casa_handler: CASAAttentionHandler = None # pyright: ignore[reportAssignmentType]
708
+
709
+ def maybe_get_casa_handler(
710
+ self,
711
+ casa_handler: CASAAttentionHandler | None,
712
+ is_first_casa_layer: bool = False,
713
+ num_queries: int = -1,
714
+ ) -> CASAAttentionHandler | None:
715
+ # Set given Casa Handler the first time we reach this
716
+ if self.casa_handler is None:
717
+ self.casa_handler = casa_handler # pyright: ignore
718
+ # subsequent calls: we need to extend the shapes to accommodate new tokens;
+ # however, because the CASA handler is shared across layers, we only need to do it once
720
+ if self.casa_handler is not None and self.offset > 0 and is_first_casa_layer:
721
+ # since CasaHandler is shared, we only use its extend step once
722
+ self.casa_handler.extend(num_queries, offset=self.offset)
723
+ return self.casa_handler
724
+
725
+ def __recover_batched_kv__(self, states: torch.Tensor) -> torch.Tensor:
726
+ """Recover batched key/value states with left padding"""
727
+ s = torch.split(states, self.casa_handler.full_batch_lengths, dim=1)
728
+ mlen = max(_s.shape[1] for _s in s)
729
+ # Remember the added padding so that we can re-flatten KV later
730
+ if self.recover_batched_trims is None:
731
+ self.recover_batched_trims = [mlen - _s.shape[1] for _s in s]
732
+ s = [torch.nn.functional.pad(_s, (0, 0, 0, 0, mlen - _s.shape[1], 0), value=0) for _s in s]
733
+ return torch.cat(s, dim=0)
734
+
735
+ def __get_flattened_kv__(
736
+ self, k: torch.Tensor | None = None, v: torch.Tensor | None = None
737
+ ) -> tuple[torch.Tensor, torch.Tensor]:
738
+ """
739
+ Flatten and remove padding for use with flash_attn_varlen_func
740
+ """
741
+ k = self.k if k is None else k
742
+ v = self.v if v is None else v
743
+ assert k is not None and v is not None
744
+
745
+ # Since every batch at least contributes one document,
746
+ # we can use this to check whether we are in streaming mode with dropped docs.
747
+ # If so, we should trim the kv cache accordingly
748
+ if len(self.casa_handler.current_doc_lengths) == len(k):
749
+ k = torch.cat(
750
+ [
751
+ _k[self.recover_batched_trims[idx] :][-doc_len:]
752
+ for idx, _k, doc_len in zip(
753
+ range(len(k)), k, self.casa_handler.current_doc_lengths
754
+ )
755
+ ]
756
+ )
757
+ v = torch.cat(
758
+ [
759
+ _v[self.recover_batched_trims[idx] :][-doc_len:]
760
+ for idx, _v, doc_len in zip(
761
+ range(len(k)), v, self.casa_handler.current_doc_lengths
762
+ )
763
+ ]
764
+ )
765
+ return k[None, ...], v[None, ...]
766
+
767
+ k = torch.cat([_k[self.recover_batched_trims[idx] :] for idx, _k in enumerate(k)])
768
+ v = torch.cat([_v[self.recover_batched_trims[idx] :] for idx, _v in enumerate(v)])
769
+ return k[None, ...], v[None, ...]
770
+
771
+ def extend_kv(
772
+ self, key_states: torch.Tensor, value_states: torch.Tensor
773
+ ) -> tuple[torch.Tensor, torch.Tensor]:
774
+ """
775
+ Extend the KV cache with the new key/value states and return flattened, unpadded views.
776
+ """
777
+ assert self.casa_handler is not None
778
+ if self.k is None and self.v is None:
779
+ # Init with batch-padded key and value states
780
+ self.k = self.__recover_batched_kv__(key_states)
781
+ self.v = self.__recover_batched_kv__(value_states)
782
+ return self.__get_flattened_kv__()
783
+ if self.k is not None and self.v is not None:
784
+ # this is during generation; normally there is no padding at this stage
785
+ # so we can directly reshape the flattened key states
786
+ rshp = (self.k.shape[0], -1, self.k.shape[2], self.k.shape[3])
787
+ self.k = torch.cat([self.k, key_states.reshape(rshp)], dim=1)
788
+ self.v = torch.cat([self.v, value_states.reshape(rshp)], dim=1)
789
+ return self.__get_flattened_kv__()
790
+
791
+ raise ValueError("Impossible configuration (k and v updates are desynchronized )")
792
+
793
+
794
+ class CASAAttention(StreamingModule[CASAAttentionStreamingState]):
795
+ def __init__(
796
+ self,
797
+ config: "PretrainedConfig",
798
+ layer_idx: int | None,
799
+ self_attn: torch.nn.Module | None = None,
800
+ input_layernorm_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
801
+ ):
802
+ super().__init__(CASAAttentionStreamingState)
803
+ self.head_dim = config.head_dim
804
+ self.config = config
805
+
806
+ self.is_first_casa_layer = layer_idx == (min(config.xa_layers) if config.xa_layers else 0)
807
+ self.use_delta_w = config.casa_delta_w
808
+
809
+ self.q_proj_casa = self.init_from_config_proj("q", config)
810
+ self.k_proj_casa = self.init_from_config_proj("k", config)
811
+ self.v_proj_casa = self.init_from_config_proj("v", config)
812
+ self.o_proj_casa = self.init_from_config_proj("o", config)
813
+
814
+ # Delta_w
815
+ self.override_q_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
816
+ self.override_k_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
817
+ self.override_v_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
818
+ self.override_o_proj: Callable[[torch.Tensor], torch.Tensor] | None = None
819
+
820
+ if config.casa_delta_w:
821
+ assert self_attn is not None
822
+ self.set_delta_w(self_attn)
823
+
824
+ # Layer norm
825
+ self.norm_fn: Callable | None = None
826
+ if config.xa_norm_on_images:
827
+ assert input_layernorm_fn is not None
828
+ self.norm_fn = input_layernorm_fn
829
+
830
+ def init_from_mha(self, self_attn: torch.nn.Module):
831
+ assert self_attn is not None
832
+ with torch.no_grad():
833
+ assert hasattr(self_attn, "q_proj")
834
+ for key in ["q", "k", "v", "o"]:
835
+ src = type_cast(torch.nn.Linear, getattr(self_attn, f"{key}_proj"))
836
+ tgt = type_cast(torch.nn.Linear, getattr(self, f"{key}_proj_casa"))
837
+ tgt.weight.copy_(src.weight)
838
+ if tgt.bias is not None and src.bias is not None:
839
+ tgt.bias.copy_(src.bias)
840
+
841
+ def set_delta_w(self, self_attn: torch.nn.Module):
842
+ """Delta w setup"""
843
+ self.override_q_proj = delta_w_factory(
844
+ self.q_proj_casa, type_cast(torch.nn.Linear, self_attn.q_proj)
845
+ )
846
+ self.override_k_proj = delta_w_factory(
847
+ self.k_proj_casa, type_cast(torch.nn.Linear, self_attn.k_proj)
848
+ )
849
+ self.override_v_proj = delta_w_factory(
850
+ self.v_proj_casa, type_cast(torch.nn.Linear, self_attn.v_proj)
851
+ )
852
+ self.override_o_proj = delta_w_factory(
853
+ self.o_proj_casa, type_cast(torch.nn.Linear, self_attn.o_proj)
854
+ )
855
+
856
+ with torch.no_grad():
857
+ torch.nn.init.zeros_(self.q_proj_casa.weight)
858
+ torch.nn.init.zeros_(self.k_proj_casa.weight)
859
+ torch.nn.init.zeros_(self.v_proj_casa.weight)
860
+ torch.nn.init.zeros_(self.o_proj_casa.weight)
861
+ if self.q_proj_casa.bias is not None:
862
+ torch.nn.init.zeros_(self.q_proj_casa.bias)
863
+ if self.k_proj_casa.bias is not None:
864
+ torch.nn.init.zeros_(self.k_proj_casa.bias)
865
+ if self.v_proj_casa.bias is not None:
866
+ torch.nn.init.zeros_(self.v_proj_casa.bias)
867
+ if self.o_proj_casa.bias is not None:
868
+ torch.nn.init.zeros_(self.o_proj_casa.bias)
869
+
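
`delta_w_factory` lives in `.utils` and is not part of this diff; the sketch below shows the parameterisation this zero-init implies (a learnable delta applied on top of the frozen self-attention projection), as an assumption rather than the actual implementation. With zero-initialised deltas, the CASA projections start out identical to the pretrained projections and only drift during finetuning.

```python
import torch
from typing import Callable

def delta_w_like(delta: torch.nn.Linear, base: torch.nn.Linear) -> Callable[[torch.Tensor], torch.Tensor]:
    """Hypothetical stand-in for delta_w_factory: frozen base projection plus a learnable delta."""
    def proj(x: torch.Tensor) -> torch.Tensor:
        return base(x) + delta(x)
    return proj

base = torch.nn.Linear(8, 8, bias=False)
delta = torch.nn.Linear(8, 8, bias=False)
torch.nn.init.zeros_(delta.weight)  # mirrors the zero-init in set_delta_w above

x = torch.randn(2, 8)
assert torch.allclose(delta_w_like(delta, base)(x), base(x))  # identity at initialisation
```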
870
+ def init_from_config_proj(
871
+ self, key: Literal["q", "o", "k", "v"], config: PretrainedConfig
872
+ ) -> torch.nn.Linear:
873
+ """Initialize the Linear proj in this module"""
874
+ raise NotImplementedError("Abstract class.")
875
+
876
+ def apply_position_embeddings(
877
+ self,
878
+ key: Literal["q", "kv"],
879
+ x: torch.Tensor, # (batch, seq_len, num_heads, head_dim)
880
+ casa_handler: CASAAttentionHandler | None,
881
+ num_queries: int = 0,
882
+ unsqueeze_dim: int = 1,
883
+ ) -> torch.Tensor: # (batch, seq_len, num_heads, head_dim)
884
+ """Apply position embeddings to query and key states"""
885
+ raise NotImplementedError("Abstract class.")
886
+
887
+ def forward(
888
+ self,
889
+ hidden_states: torch.Tensor,
890
+ casa_handler: CASAAttentionHandler | None,
891
+ ) -> torch.Tensor | None:
892
+ """Generic forward for CASA, used for instance in `helium1_attention`."""
893
+ og_dtype = hidden_states.dtype
894
+ if self.is_streaming:
895
+ casa_handler = self.streaming_state.maybe_get_casa_handler(
896
+ casa_handler,
897
+ is_first_casa_layer=self.is_first_casa_layer,
898
+ num_queries=hidden_states.shape[1],
899
+ )
900
+
901
+ # Case of text-only samples at training (or inference when no handler was cached)
902
+ # in this case we just skip CASA so we return None (no casa_update)
903
+ if casa_handler is None:
904
+ return None
905
+
906
+ if self.is_streaming:
907
+ assert casa_handler.use_asymetric_qkv, (
908
+ "You should set `use_asymetric_qkv` to True during inference"
909
+ )
910
+
911
+ og_shape = hidden_states.shape
912
+
913
+ # Build Q inputs
914
+ if casa_handler.use_asymetric_qkv:
915
+ q_inputs = hidden_states.flatten(0, 1)[None, ...]
916
+ if casa_handler.attention_mask is not None:
917
+ q_inputs = q_inputs[:, casa_handler.attention_mask[:, -og_shape[1] :].flatten()]
918
+ else:
919
+ q_inputs = casa_handler.get_full_embeds(hidden_states, norm_fn=self.norm_fn)
920
+
921
+ # Case 1: Training or first inference step
922
+ if not self.is_streaming or self.streaming_state.offset == 0:
923
+ kv_inputs = casa_handler.get_full_embeds(hidden_states, norm_fn=self.norm_fn)
924
+ else:
925
+ # during streaming, the KV cache including image embeddings
926
+ # will be inserted later so for now we only update the incoming queries
927
+ kv_inputs = q_inputs
928
+
929
+ # Compute QKV for the blockwise attention
930
+ bs, total_seq_len = kv_inputs.shape[:2]
931
+ hidden_shape_q = (bs, q_inputs.shape[1], -1, self.head_dim)
932
+ hidden_shape_kv = (bs, total_seq_len, -1, self.head_dim)
933
+
934
+ if self.override_q_proj is None:
935
+ query_states = self.q_proj_casa(q_inputs).view(*hidden_shape_q)
936
+ else:
937
+ query_states = self.override_q_proj(q_inputs).view(*hidden_shape_q)
938
+
939
+ if self.override_k_proj is None:
940
+ key_states = self.k_proj_casa(kv_inputs).view(*hidden_shape_kv)
941
+ else:
942
+ key_states = self.override_k_proj(kv_inputs).view(*hidden_shape_kv)
943
+
944
+ if self.override_v_proj is None:
945
+ value_states = self.v_proj_casa(kv_inputs).view(*hidden_shape_kv)
946
+ else:
947
+ value_states = self.override_v_proj(kv_inputs).view(*hidden_shape_kv)
948
+
949
+ # Apply position embedding at the right offset
950
+ num_queries = 0
951
+ if self.streaming and self.streaming_state.offset > 0:
952
+ num_queries = og_shape[1]
953
+
954
+ query_states = self.apply_position_embeddings(
955
+ "q", query_states, num_queries=num_queries, casa_handler=casa_handler
956
+ )
957
+ key_states = self.apply_position_embeddings(
958
+ "kv", key_states, num_queries=num_queries, casa_handler=casa_handler
959
+ )
960
+ assert flash_attn_varlen_func is not None, (
961
+ "flash_attention is not installed but required for block-wise attention"
962
+ )
963
+
964
+ # Flash attention has a different, more efficient implementation for streaming.
+ # In that case, the KV cache has to be batched and has been extended
+ # to accommodate the shape of the new updates
967
+ if self.is_streaming:
968
+ key_states, value_states = self.streaming_state.extend_kv(
969
+ key_states=key_states, value_states=value_states
970
+ )
971
+ if casa_handler.use_asymetric_qkv:
972
+ cu_seqlens_q = casa_handler.cu_seqlens_q
973
+ max_seqlen_q = casa_handler.max_seqlen_q
974
+ else:
975
+ cu_seqlens_q = casa_handler.cu_seqlens_kv
976
+ max_seqlen_q = casa_handler.max_seqlen_kv
977
+ assert cu_seqlens_q[-1] == query_states.shape[1], (
978
+ f"{cu_seqlens_q[-1]} != {query_states.shape[1]}"
979
+ )
980
+ assert casa_handler.cu_seqlens_kv[-1] == key_states.shape[1], (
981
+ f"{casa_handler.cu_seqlens_kv[-1]} != {key_states.shape[1]}"
982
+ )
984
+ attn_output: torch.Tensor = flash_attn_varlen_func(
985
+ query_states[0].to(torch.bfloat16),
986
+ key_states[0].to(torch.bfloat16),
987
+ value_states[0].to(torch.bfloat16),
988
+ cu_seqlens_q=cu_seqlens_q,
989
+ cu_seqlens_k=casa_handler.cu_seqlens_kv,
990
+ max_seqlen_q=max_seqlen_q,
991
+ max_seqlen_k=casa_handler.max_seqlen_kv,
992
+ dropout_p=0.0,
993
+ # softmax_scale=None, # defaults to 1/sqrt(d)
994
+ causal=True,
995
+ ).to(og_dtype)
996
+
997
+ attn_output = attn_output.reshape(hidden_shape_q[1], -1).contiguous()
998
+ if self.override_o_proj is None:
999
+ attn_output = self.o_proj_casa(attn_output)
1000
+ else:
1001
+ attn_output = self.override_o_proj(attn_output)
1002
+
1003
+ attn_output = casa_handler.recover_text_embeds(
1004
+ attn_output, hidden_states, update_image_embeddings=self.config.xa_update_image_embeds
1005
+ )
1006
+ attn_output = attn_output.reshape(og_shape)
1007
+
1008
+ if self.is_streaming:
1009
+ self.streaming_state.offset += attn_output.shape[1]
1010
+ return attn_output
config.json ADDED
@@ -0,0 +1,77 @@
1
+ {
2
+ "attention_bias": false,
3
+ "attention_dropout": 0.0,
4
+ "auto_map": {
5
+ "AutoConfig": "configuration_helium1_casa.Helium1CASAConfig",
6
+ "AutoModel": "modeling_helium1_casa.V2Helium1"
7
+ },
8
+ "bos_token_id": 1,
9
+ "casa_attention": false,
10
+ "casa_delta_w": true,
11
+ "casa_use_asymetric_qkv": true,
12
+ "casa_windows": "images",
13
+ "eos_token_id": null,
14
+ "head_dim": 128,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 8192,
19
+ "mask_squash_blockwise": false,
20
+ "max_position_embeddings": 4096,
21
+ "mlp_bias": false,
22
+ "model_type": "Helium1_VL_2B",
23
+ "num_attention_heads": 16,
24
+ "num_hidden_layers": 28,
25
+ "num_key_value_heads": 8,
26
+ "pad_token_id": 3,
27
+ "post_image_tokens": [],
28
+ "pre_image_tokens": [],
29
+ "pretraining_tp": 1,
30
+ "rms_norm_eps": 1e-08,
31
+ "rope_scaling": null,
32
+ "rope_theta": 20000.0,
33
+ "tie_word_embeddings": false,
34
+ "torch_dtype": "bfloat16",
35
+ "transformers_version": "4.51.3",
36
+ "use_cache": true,
37
+ "vision_config": {
38
+ "depth": 32,
39
+ "fullatt_block_indexes": [
40
+ 7,
41
+ 15,
42
+ 23,
43
+ 31
44
+ ],
45
+ "hidden_act": "silu",
46
+ "hidden_size": 1280,
47
+ "image_mean": [
48
+ 0.48145466,
49
+ 0.4578275,
50
+ 0.40821073
51
+ ],
52
+ "image_std": [
53
+ 0.26862954,
54
+ 0.26130258,
55
+ 0.27577711
56
+ ],
57
+ "in_channels": 3,
58
+ "in_chans": 3,
59
+ "intermediate_size": 3420,
60
+ "model_type": "qwen2_5_vl",
61
+ "num_heads": 16,
62
+ "out_dim": 2048,
63
+ "out_hidden_size": 2048,
64
+ "patch_size": 14,
65
+ "spatial_merge_size": 2,
66
+ "spatial_patch_size": 14,
67
+ "temporal_patch_size": 1,
68
+ "tokens_per_second": 2,
69
+ "window_size": 112
70
+ },
71
+ "vocab_size": 64000,
72
+ "xa_custom_norm": false,
73
+ "xa_layers": [],
74
+ "xa_norm_on_images": false,
75
+ "xa_order": "ca_first",
76
+ "xa_update_image_embeds": false
77
+ }
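
A rough back-of-the-envelope for the visual token budget implied by `vision_config` above (`patch_size=14`, `spatial_merge_size=2`, `temporal_patch_size=1`), assuming the standard Qwen2.5-VL scheme of 14x14 patches merged 2x2 before entering the language model:

```python
def num_visual_tokens(height: int, width: int, patch: int = 14, merge: int = 2) -> int:
    # Patches along each side, then a merge x merge reduction by the spatial merger.
    return (height // patch) * (width // patch) // (merge * merge)

print(num_visual_tokens(448, 448))  # 1024 patches -> 256 visual tokens after the 2x2 merge
```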
configuration_helium1_casa.py ADDED
@@ -0,0 +1,270 @@
1
+ from typing import Any, Literal
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
5
+
6
+
7
+ class Helium1CASAConfig(PretrainedConfig):
8
+ r"""
9
+ Helium1 Config augmented with CASA options
10
+
11
+
12
+ Args:
13
+ vocab_size (`int`, *optional*, defaults to 32000):
14
+ Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
15
+ `inputs_ids` passed when calling [`Helium1Model`]
16
+ hidden_size (`int`, *optional*, defaults to 4096):
17
+ Dimension of the hidden representations.
18
+ intermediate_size (`int`, *optional*, defaults to 11008):
19
+ Dimension of the MLP representations.
20
+ num_hidden_layers (`int`, *optional*, defaults to 32):
21
+ Number of hidden layers in the Transformer decoder.
22
+ num_attention_heads (`int`, *optional*, defaults to 32):
23
+ Number of attention heads for each attention layer in the Transformer decoder.
24
+ num_key_value_heads (`int`, *optional*):
25
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
26
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
27
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
28
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
29
+ by meanpooling all the original heads within that group. For more details checkout [this
30
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
31
+ `num_attention_heads`.
32
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
33
+ The non-linear activation function (function or string) in the decoder.
34
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
35
+ The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
36
+ Llama 2 up to 4096, CodeLlama up to 16384.
37
+ initializer_range (`float`, *optional*, defaults to 0.02):
38
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
39
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
40
+ The epsilon used by the rms normalization layers.
41
+ use_cache (`bool`, *optional*, defaults to `True`):
42
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
43
+ relevant if `config.is_decoder=True`.
44
+ pad_token_id (`int`, *optional*):
45
+ Padding token id.
46
+ bos_token_id (`int`, *optional*, defaults to 1):
47
+ Beginning of stream token id.
48
+ eos_token_id (`int`, *optional*, defaults to 2):
49
+ End of stream token id.
50
+ pretraining_tp (`int`, *optional*, defaults to 1):
51
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
52
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
53
+ understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
54
+ results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
55
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
56
+ Whether to tie weight embeddings
57
+ rope_theta (`float`, *optional*, defaults to 10000.0):
58
+ The base period of the RoPE embeddings.
59
+ rope_scaling (`Dict`, *optional*):
60
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
61
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
62
+ accordingly.
63
+ Expected contents:
64
+ `rope_type` (`str`):
65
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
66
+ 'llama3'], with 'default' being the original RoPE implementation.
67
+ `factor` (`float`, *optional*):
68
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
69
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
70
+ original maximum pre-trained length.
71
+ `original_max_position_embeddings` (`int`, *optional*):
72
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
73
+ pretraining.
74
+ `attention_factor` (`float`, *optional*):
75
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
76
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
77
+ `factor` field to infer the suggested value.
78
+ `beta_fast` (`float`, *optional*):
79
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
80
+ ramp function. If unspecified, it defaults to 32.
81
+ `beta_slow` (`float`, *optional*):
82
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
83
+ ramp function. If unspecified, it defaults to 1.
84
+ `short_factor` (`List[float]`, *optional*):
85
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
86
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
87
+ size divided by the number of attention heads divided by 2
88
+ `long_factor` (`List[float]`, *optional*):
89
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
90
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
91
+ size divided by the number of attention heads divided by 2
92
+ `low_freq_factor` (`float`, *optional*):
93
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
94
+ `high_freq_factor` (`float`, *optional*):
95
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
96
+ attention_bias (`bool`, *optional*, defaults to `False`):
97
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
98
+ attention_dropout (`float`, *optional*, defaults to 0.0):
99
+ The dropout ratio for the attention probabilities.
100
+ mlp_bias (`bool`, *optional*, defaults to `False`):
101
+ Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
102
+ head_dim (`int`, *optional*):
103
+ The attention head dimension. If None, it will default to hidden_size // num_attention_heads
104
+
105
+ """
106
+
107
+ model_type = "helium1_casa"
108
+ keys_to_ignore_at_inference = ["past_key_values"]
109
+ # Default tensor parallel plan for base model `Helium1Model`
110
+ base_model_tp_plan = {
111
+ "layers.*.self_attn.q_proj": "colwise",
112
+ "layers.*.self_attn.k_proj": "colwise",
113
+ "layers.*.self_attn.v_proj": "colwise",
114
+ "layers.*.self_attn.o_proj": "rowwise",
115
+ "layers.*.mlp.gate_proj": "colwise",
116
+ "layers.*.mlp.up_proj": "colwise",
117
+ "layers.*.mlp.down_proj": "rowwise",
118
+ }
119
+ base_model_pp_plan = { # pyright: ignore[reportAssignmentType]
120
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
121
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
122
+ "norm": (["hidden_states"], ["hidden_states"]),
123
+ }
124
+
125
+ def __init__(
126
+ self,
127
+ vocab_size: int = 32000,
128
+ hidden_size: int = 4096,
129
+ intermediate_size: int = 11008,
130
+ num_hidden_layers: int = 32,
131
+ num_attention_heads: int = 32,
132
+ num_key_value_heads: None | int = None,
133
+ head_dim: None | int = None,
134
+ hidden_act: str = "silu",
135
+ attention_dropout: float = 0.0,
136
+ max_position_embeddings: int = 2048,
137
+ initializer_range: float = 0.02,
138
+ rms_norm_eps: float = 1e-6,
139
+ use_cache: bool = True,
140
+ tie_word_embeddings: bool = False,
141
+ rope_theta: float = 10000.0,
142
+ pad_token_id: int = 3,
143
+ eos_token_id: int = 2,
144
+ bos_token_id: int = 1,
145
+ pretraining_tp: int = 1,
146
+ rope_scaling: None | dict = None,
147
+ attention_bias: bool = False,
148
+ mlp_bias: bool = False,
149
+ # Our fusion mechanisms
150
+ # Common to all fusion mechanisms
151
+ xa_layers: None | tuple = None,
152
+ xa_order: Literal["ca_first", "parallel", "instead"] = "ca_first",
153
+ xa_norm_on_images: bool = False,
154
+ xa_update_image_embeds: bool = False,
155
+ mask_squash_blockwise: bool = False,
156
+ # CASA
157
+ casa_attention: bool = False,
158
+ casa_delta_w: bool = False,
159
+ casa_windows: Literal["batch", "squashed", "images", "turn_based"] = "batch",
160
+ casa_use_asymetric_qkv: bool = True,
161
+ xa_custom_norm: bool = False,
162
+ # Qwen2.5-VL vision config
163
+ vision_config: dict[str, Any] | None = None,
164
+ **kwargs: Any,
165
+ ):
166
+ from transformers.modeling_rope_utils import rope_config_validation
167
+
168
+ self.vocab_size = vocab_size
169
+ self.max_position_embeddings = max_position_embeddings
170
+ self.hidden_size = hidden_size
171
+ self.intermediate_size = intermediate_size
172
+ self.num_hidden_layers = num_hidden_layers
173
+ self.num_attention_heads = num_attention_heads
174
+
175
+ # for backward compatibility
176
+ if num_key_value_heads is None:
177
+ num_key_value_heads = num_attention_heads
178
+
179
+ self.num_key_value_heads = num_key_value_heads
180
+ self.head_dim = (
181
+ head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
182
+ )
183
+ self.hidden_act = hidden_act
184
+ self.initializer_range = initializer_range
185
+ self.rms_norm_eps = rms_norm_eps
186
+ self.pretraining_tp = pretraining_tp
187
+ self.use_cache = use_cache
188
+ self.rope_theta = rope_theta
189
+ self.rope_scaling = rope_scaling
190
+ self.attention_bias = attention_bias
191
+ self.attention_dropout = attention_dropout
192
+ self.mlp_bias = mlp_bias
193
+ # Validate the correctness of rotary position embeddings parameters
194
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
195
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
196
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
197
+ rope_config_validation(self)
198
+
199
+ self.head_dim = self.hidden_size // self.num_attention_heads
200
+ self.xa_layers = xa_layers
201
+ self.xa_order: Literal["ca_first", "parallel", "instead"] = xa_order
202
+ self.xa_norm_on_images = xa_norm_on_images
203
+ self.xa_update_image_embeds = xa_update_image_embeds
204
+ self.mask_squash_blockwise = mask_squash_blockwise
205
+ # CASA config
206
+ self.casa_attention = casa_attention
207
+ self.casa_delta_w = casa_delta_w
208
+ self.casa_windows: Literal["batch", "squashed", "images", "turn_based"] = casa_windows
209
+ self.casa_use_asymetric_qkv = casa_use_asymetric_qkv
210
+ self.xa_custom_norm = xa_custom_norm
211
+
212
+ if vision_config is None:
213
+ vision_config = dict()
214
+ self.vision_config = Qwen2_5_VLVisionConfig(**vision_config)
215
+ self.vision_config.temporal_patch_size = 1
216
+ self.vision_config.image_mean = [0.48145466, 0.4578275, 0.40821073]
217
+ self.vision_config.image_std = [0.26862954, 0.26130258, 0.27577711]
218
+ self.vision_config.out_dim = 2048
219
+
220
+ self.pre_image_tokens = []
221
+ self.post_image_tokens = []
222
+
223
+ super().__init__(
224
+ pad_token_id=pad_token_id,
225
+ bos_token_id=bos_token_id,
226
+ eos_token_id=eos_token_id,
227
+ tie_word_embeddings=tie_word_embeddings,
228
+ **kwargs,
229
+ )
230
+
231
+
232
+ if __name__ == "__main__":
233
+ import argparse
234
+ from pathlib import Path
235
+
236
+ import rich
237
+ import yaml
238
+ from transformers.models.auto.configuration_auto import AutoConfig
239
+
240
+ parser = argparse.ArgumentParser()
241
+ parser.add_argument("--out_dir", type=str, default="./saved_config/")
242
+ parser.add_argument(
243
+ "--ckpt_path",
244
+ type=str,
245
+ default="/lustre/scwpod02/client/kyutai/juliette/experiments/finext_casa_896_xtxt_up_b20_64gpu/fdf76e6774",
246
+ )
247
+ args = parser.parse_args()
248
+ path = Path(args.ckpt_path) / "kyuteye_config.yml"
249
+
250
+ helium_config = AutoConfig.from_pretrained("kyutai/helium-1-2b")
251
+ vision_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct").vision_config
252
+
253
+ # Create our config by merging the Helium parameters with the Qwen vision config
254
+ config = Helium1CASAConfig(
255
+ **helium_config.to_dict(), # all helium parameters
256
+ vision_config=vision_config.to_dict(), # override or add vision_config
257
+ )
258
+
259
+ with open(path) as stream:
260
+ kconfig = yaml.safe_load(stream)
261
+
262
+ # Overwrite config entries with values from kconfig for keys present in both
263
+ for key in set(kconfig.keys()).intersection(set(config.to_dict().keys())):
264
+ rich.print(f"Overwriting [bold green]{key:>50s}[/]: [bold red]{kconfig[key]}")
265
+ setattr(config, key, kconfig[key])
266
+ # TODO: handle casa_own_norm -> xa_custom_norm
267
+ print("Configuration successfully loaded.")
268
+ # Save config to json
269
+ config.save_pretrained(args.out_dir)
270
+ print(f"Configuration saved to {args.out_dir}/config.json")
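For reference, the hedged sketch below shows how the options documented above fit together: an example `rope_scaling` dictionary plus a few CASA fusion flags passed to `Helium1CASAConfig`. It assumes the class is importable from `configuration_helium1_casa` (the module name used by the import in `language_helium1_casa.py`); every value is illustrative and does not describe the released checkpoint.

```python
# Hedged, illustrative sketch: example values only, not the released checkpoint's
# configuration. Assumes this repository's configuration_helium1_casa module is importable.
from configuration_helium1_casa import Helium1CASAConfig

config = Helium1CASAConfig(
    hidden_size=2048,
    num_hidden_layers=24,
    num_attention_heads=16,
    # rope_scaling follows the schema documented in the class docstring.
    rope_scaling={"rope_type": "linear", "factor": 2.0},
    # CASA fusion flags (illustrative choices).
    casa_attention=True,
    xa_order="ca_first",
    casa_windows="batch",
)
print(config.head_dim)                           # hidden_size // num_attention_heads -> 128
print(config.vision_config.temporal_patch_size)  # forced to 1 by the config above
```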
generation_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "pad_token_id": 3,
5
+ "eos_token_id": [
6
+ 3,
7
+ 103
8
+ ],
9
+ "transformers_version": "4.51.3"
10
+ }
image_encoder.py ADDED
@@ -0,0 +1,57 @@
1
+ """Qwen2.5VL encoder with delayed normalization"""
2
+
3
+ import torch
4
+ from einops import rearrange
5
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
6
+ Qwen2_5_VisionTransformerPretrainedModel,
7
+ )
8
+
9
+
10
+ def prepare_for_qwen_encoder(
11
+ x: torch.Tensor | list[torch.Tensor], mean: torch.Tensor, std: torch.Tensor
12
+ ) -> tuple[torch.Tensor, torch.Tensor]:
13
+ """
14
+ Preprocessing for Qwen encoder
15
+ Image mean and std come from processor.image_processor.image_mean and image_std
16
+ """
17
+ grid_thw = torch.Tensor([[1, img.shape[0], img.shape[1]] for img in x]).to(x[0].device)
18
+ hws_flatten_shape = torch.prod(grid_thw, dim=-1)
19
+ x = torch.cat(
20
+ [img.reshape((int(hws_flatten_shape[idx].item()), -1)) for idx, img in enumerate(x)],
21
+ dim=0,
22
+ )
23
+ assert x.min() >= 0.0 and x.max() <= 1.0
24
+ og_shape = x.shape
25
+ x = rearrange(x, "L (c d) -> L c d", c=3)
26
+ x = (x - mean) / std
27
+ x = x.view(og_shape).to(torch.bfloat16)
28
+ return x, grid_thw
29
+
30
+
31
+ class Qwen25VLEncoder(torch.nn.Module):
32
+ """Qwen2.5 VL encoder with pre/post processing to be compatible with
33
+ our CASA attention implementation"""
34
+
35
+ def __init__(
36
+ self,
37
+ visual: "Qwen2_5_VisionTransformerPretrainedModel",
38
+ ):
39
+ super().__init__()
40
+ self.visual = visual
41
+ self.image_mean = torch.tensor(self.visual.config.image_mean).view(1, 3, 1)
42
+ self.image_std = torch.tensor(self.visual.config.image_std).view(1, 3, 1)
43
+
44
+ def forward(
45
+ self, x: torch.Tensor | list[torch.Tensor]
46
+ ) -> dict[str, torch.Tensor | list[torch.Tensor]]:
47
+ x, grid_thw = prepare_for_qwen_encoder(
48
+ x, mean=self.image_mean.to(x[0].device), std=self.image_std.to(x[0].device)
49
+ )
50
+
51
+ grid_thw = grid_thw.type(torch.int)
52
+ assert len(x) == grid_thw.prod(dim=1).sum()
53
+ out = self.visual(x, grid_thw=grid_thw)
54
+
55
+ split_sizes = (grid_thw.prod(dim=-1) // self.visual.spatial_merge_size**2).tolist()
56
+ embeds = list(torch.split(out, split_sizes, dim=0)) # Ni * (seq, C)
57
+ return {"image_embeds": embeds, "grid_thw": grid_thw}
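To make the encoder's expected input layout concrete, here is a hedged sketch of `prepare_for_qwen_encoder` on random, pre-patchified images. The per-image shape `(grid_h, grid_w, 3 * patch_size**2)` and the 14x14 patch size are assumptions inferred from the function body and Qwen2.5-VL defaults, not a documented contract; it also assumes the repository files are on the Python path.

```python
# Sketch of the assumed input layout for prepare_for_qwen_encoder (not a spec):
# each image is pre-patchified to (grid_h, grid_w, 3 * patch_size**2), values in [0, 1].
import torch
from image_encoder import prepare_for_qwen_encoder

patch_dim = 3 * 14 * 14  # assumes 14x14 patches and temporal_patch_size = 1
images = [torch.rand(32, 46, patch_dim), torch.rand(24, 24, patch_dim)]
# Normalization statistics set on the vision config in Helium1CASAConfig.
mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1)
std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1)

pixels, grid_thw = prepare_for_qwen_encoder(images, mean=mean, std=std)
print(pixels.shape)  # (32*46 + 24*24, patch_dim), cast to bfloat16
print(grid_thw)      # [[1., 32., 46.], [1., 24., 24.]]
```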
language_helium1_casa.py ADDED
@@ -0,0 +1,1077 @@
1
+ # ADAPTED FROM https://github.com/huggingface/transformers/blob/main/src/transformers/models/helium/modeling_helium.py
2
+ # GIT HASH 1b222903c3e1cfd9492d75e4b2548aa8bd458674
3
+ import logging
4
+ import math
5
+ from dataclasses import dataclass
6
+ from functools import partial
7
+ from typing import Any, Callable, Literal, Optional
8
+ from typing import cast as type_cast
9
+
10
+ import torch
11
+ from torch import nn
12
+ from transformers import (
13
+ ROPE_INIT_FUNCTIONS, # pyright: ignore[reportPrivateImportUsage]
14
+ dynamic_rope_update, # pyright: ignore[reportPrivateImportUsage]
15
+ )
16
+ from transformers.activations import ACT2FN
17
+ from transformers.cache_utils import Cache, DynamicCache
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.generation.utils import GenerationMixin
20
+ from transformers.loss.loss_utils import ForCausalLMLoss
21
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
22
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
23
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
24
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
25
+ from transformers.processing_utils import Unpack
26
+ from transformers.utils.generic import LossKwargs, can_return_tuple
27
+ from transformers.utils.import_utils import is_torch_flex_attn_available
28
+
29
+ from .casa_attention import CASAAttention, CASAAttentionHandler, insert_image_tokens
30
+ from .configuration_helium1_casa import Helium1CASAConfig
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ if is_torch_flex_attn_available():
35
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
36
+
37
+
38
+ def remove_image_tokens(
39
+ inputs_embeds: torch.Tensor,
40
+ image_tokens_mask: torch.Tensor,
41
+ ) -> torch.Tensor:
42
+ """Remove the image tokens from inputs_embeds as indicated by image_tokens_mask
43
+
44
+ :param inputs_embeds: Tokens of shape (Batch, Seqlen, Dims) containing image tokens
45
+ :param image_tokens_mask: 1-0 mask indicating where image tokens are; (Batch, Seqlen, 1)
46
+
47
+ :return: Tokens tensor of shape (Batch, S' < Seqlen, Dims)
48
+ """
49
+ image_seq_lengths = torch.sum(image_tokens_mask, dim=1)[:, 0]
50
+ image_seq_length = int(image_seq_lengths[0].item())
51
+ assert torch.all(image_seq_lengths == image_seq_length)
52
+ new_shape = (
53
+ inputs_embeds.shape[0],
54
+ inputs_embeds.shape[1] - image_seq_length,
55
+ inputs_embeds.shape[-1],
56
+ )
57
+ tokens = torch.masked_select(
58
+ inputs_embeds,
59
+ torch.logical_not(image_tokens_mask).expand((-1, -1, inputs_embeds.shape[-1])),
60
+ )
61
+ return tokens.reshape(new_shape)
62
+
63
+
64
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
65
+ """
66
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
67
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
68
+ """
69
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
70
+ if n_rep == 1:
71
+ return hidden_states
72
+ hidden_states = hidden_states[:, :, None, :, :].expand(
73
+ batch, num_key_value_heads, n_rep, slen, head_dim
74
+ )
75
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
76
+
77
+
78
+ def eager_attention_forward(
79
+ module: "HeliumAttention",
80
+ query: torch.Tensor,
81
+ key: torch.Tensor,
82
+ value: torch.Tensor,
83
+ attention_mask: None | torch.Tensor,
84
+ scaling: float,
85
+ dropout: float = 0.0,
86
+ **kwargs: Any,
87
+ ):
88
+ del kwargs # unused
89
+ key_states = repeat_kv(key, module.num_key_value_groups)
90
+ value_states = repeat_kv(value, module.num_key_value_groups)
91
+
92
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
93
+ if attention_mask is not None:
94
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
95
+ attn_weights = attn_weights + causal_mask
96
+
97
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
98
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
99
+ attn_output = torch.matmul(attn_weights, value_states)
100
+ attn_output = attn_output.transpose(1, 2).contiguous()
101
+
102
+ return attn_output, attn_weights
103
+
104
+
105
+ # Different Attention Classes
106
+ class HeliumAttention(torch.nn.Module):
107
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
108
+
109
+ def __init__(self, config: Helium1CASAConfig, layer_idx: None | int = None):
110
+ super().__init__()
111
+ self.config = config
112
+ assert layer_idx is not None
113
+ self.layer_idx: int = layer_idx
114
+
115
+ self.apply_rotary_fn = ApplyRotaryPosEmbHelium1()
116
+ self.head_dim = getattr(
117
+ config, "head_dim", config.hidden_size // config.num_attention_heads
118
+ )
119
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
120
+ self.scaling = 1 / math.sqrt(self.head_dim)
121
+ self.attention_dropout = config.attention_dropout
122
+ self.is_causal = True
123
+
124
+ self.q_proj = nn.Linear(
125
+ config.hidden_size,
126
+ config.num_attention_heads * self.head_dim,
127
+ bias=config.attention_bias,
128
+ )
129
+ self.k_proj = nn.Linear(
130
+ config.hidden_size,
131
+ config.num_key_value_heads * self.head_dim,
132
+ bias=config.attention_bias,
133
+ )
134
+ self.v_proj = nn.Linear(
135
+ config.hidden_size,
136
+ config.num_key_value_heads * self.head_dim,
137
+ bias=config.attention_bias,
138
+ )
139
+ self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
140
+
141
+ def forward(
142
+ self,
143
+ hidden_states: torch.Tensor,
144
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
145
+ attention_mask: None | torch.Tensor,
146
+ past_key_values: None | Cache = None,
147
+ cache_position: None | torch.LongTensor = None,
148
+ **kwargs: Unpack[FlashAttentionKwargs],
149
+ ) -> tuple[torch.Tensor, torch.Tensor | None]:
150
+ # del (cache_position, past_key_value) # we use our own generate/caching
151
+ bs, seq_len, _ = hidden_states.shape
152
+ # Get QKV
153
+ hidden_shape = (bs, seq_len, -1, self.head_dim)
154
+
155
+ # Embed Queries
156
+ # Shape: (batch_size, num_heads, seq_len, head_dim)
157
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
158
+ num_queries = query_states.shape[2]
159
+
160
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
161
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
162
+
163
+ # Applies rotation
164
+ cos, sin = position_embeddings
165
+ query_states, key_states = self.apply_rotary_fn(
166
+ query_states, key_states, cos, sin, num_queries=num_queries
167
+ )
168
+ assert key_states is not None and query_states is not None
169
+
170
+ attention_interface: Callable = eager_attention_forward
171
+
172
+ if self.config._attn_implementation != "eager":
173
+ if self.config._attn_implementation == "sdpa" and kwargs.get(
174
+ "output_attentions", False
175
+ ):
176
+ print(
177
+ "`torch.nn.functional.scaled_dot_product_attention` does not support"
178
+ " `output_attentions=True`. Falling back to "
179
+ "eager attention. This warning can be removed using the argument"
180
+ ' `attn_implementation="eager"` when loading the model.'
181
+ )
182
+ else:
183
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
184
+
185
+ if past_key_values is not None:
186
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
187
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
188
+ key_states, value_states = past_key_values.update(
189
+ key_states, value_states, self.layer_idx, cache_kwargs
190
+ )
191
+ attn_output, attn_weights = attention_interface(
192
+ self,
193
+ query_states,
194
+ key_states,
195
+ value_states,
196
+ attention_mask,
197
+ dropout=0.0 if not self.training else self.attention_dropout,
198
+ scaling=self.scaling,
199
+ **kwargs,
200
+ )
201
+ attn_output = attn_output.reshape(bs, num_queries, -1).contiguous()
202
+ attn_output = self.o_proj(attn_output)
203
+
204
+ assert isinstance(attn_output, torch.Tensor)
205
+ return attn_output, attn_weights
206
+
207
+
208
+ class ApplyRotaryPosEmbHelium1:
209
+ @staticmethod
210
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
211
+ """Rotates half the hidden dims of the input."""
212
+ x1 = x[..., : x.shape[-1] // 2]
213
+ x2 = x[..., x.shape[-1] // 2 :]
214
+ return torch.cat((-x2, x1), dim=-1)
215
+
216
+ @staticmethod
217
+ def __call__(
218
+ q: torch.Tensor,
219
+ k: torch.Tensor,
220
+ cos: torch.Tensor,
221
+ sin: torch.Tensor,
222
+ position_ids: torch.Tensor | None = None,
223
+ unsqueeze_dim: int = 1,
224
+ num_queries: int | None = None,
225
+ ) -> tuple[torch.Tensor, torch.Tensor]:
226
+ """Applies Rotary Position Embedding to the query and key tensors.
227
+
228
+ Args:
229
+ q (`torch.Tensor`): The query tensor.
230
+ k (`torch.Tensor`): The key tensor.
231
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
232
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
233
+ position_ids (`torch.Tensor`, *optional*):
234
+ Deprecated and unused.
235
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
236
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
237
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
238
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
239
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
240
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
241
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
242
+ Returns:
243
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
244
+ """
245
+ del position_ids
246
+ cos = cos.unsqueeze(unsqueeze_dim)
247
+ sin = sin.unsqueeze(unsqueeze_dim)
248
+ if num_queries is None:
249
+ offset = 0
250
+ else:
251
+ offset = -num_queries
252
+
253
+ q_embed = (q * cos[:, :, offset:]) + (
254
+ ApplyRotaryPosEmbHelium1.rotate_half(q) * sin[:, :, offset:]
255
+ )
256
+ k_embed = (k * cos) + (ApplyRotaryPosEmbHelium1.rotate_half(k) * sin)
257
+
258
+ return q_embed, k_embed
259
+
260
+
261
+ class HeliumRotaryEmbedding(nn.Module):
262
+ def __init__(self, config: Helium1CASAConfig, device: None | torch.device | str = None):
263
+ super().__init__()
264
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
265
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
266
+ else:
267
+ self.rope_type = "default"
268
+ self.max_seq_len_cached = config.max_position_embeddings
269
+ self.original_max_seq_len = config.max_position_embeddings
270
+
271
+ self.config = config
272
+ assert self.rope_type in ROPE_INIT_FUNCTIONS, (
273
+ f"Invalid rope type {self.rope_type}. Supported types are: {list(ROPE_INIT_FUNCTIONS.keys())}"
274
+ )
275
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
276
+ inv_freq, self.attention_scaling = self.rope_init_fn(config, device=device)
277
+ self.inv_freq: torch.Tensor # only defined for typing
278
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
279
+ self.original_inv_freq = self.inv_freq
280
+
281
+ @torch.no_grad()
282
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
283
+ def forward(
284
+ self, x: torch.Tensor, position_ids: torch.Tensor
285
+ ) -> tuple[torch.Tensor, torch.Tensor]:
286
+ inv_freq_expanded = (
287
+ self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
288
+ )
289
+ position_ids_expanded = position_ids[:, None, :].float()
290
+
291
+ device_type = (
292
+ x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
293
+ )
294
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
295
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
296
+ emb = torch.cat((freqs, freqs), dim=-1)
297
+ cos = emb.cos() * self.attention_scaling
298
+ sin = emb.sin() * self.attention_scaling
299
+
300
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
301
+
302
+
303
+ class Helium1CASAAttention(CASAAttention):
304
+ """A CASA Attention layer compatible with Qwen"""
305
+
306
+ def __init__(
307
+ self,
308
+ config: Helium1CASAConfig,
309
+ layer_idx: int | None,
310
+ self_attn: torch.nn.Module | None = None,
311
+ input_layernorm_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
312
+ ):
313
+ # This __init__ is overridden only to type the config argument as Helium1CASAConfig
314
+ super().__init__(config, layer_idx, self_attn, input_layernorm_fn) # pyright: ignore[reportArgumentType]
315
+
316
+ @staticmethod
317
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
318
+ """Rotates half the hidden dims of the input."""
319
+ x1 = x[..., : x.shape[-1] // 2]
320
+ x2 = x[..., x.shape[-1] // 2 :]
321
+ return torch.cat((-x2, x1), dim=-1)
322
+
323
+ def apply_position_embeddings(
324
+ self,
325
+ key: Literal["q", "kv"],
326
+ x: torch.Tensor, # (batch, seq_len, num_heads, head_dim)
327
+ casa_handler: CASAAttentionHandler | None,
328
+ num_queries: int = 0,
329
+ unsqueeze_dim: int = 1,
330
+ ) -> torch.Tensor: # (batch, seq_len, num_heads, head_dim)
331
+ """Apply position embeddings to query and key states"""
332
+ if casa_handler is not None:
333
+ posemb = casa_handler.get_position_embedding(key, num_queries=num_queries)
334
+
335
+ if posemb is not None:
336
+ x = x.transpose(1, 2).to(torch.float32)
337
+ x = (x * posemb[0].unsqueeze(dim=unsqueeze_dim)) + (
338
+ self.rotate_half(x) * posemb[1].unsqueeze(dim=unsqueeze_dim)
339
+ )
340
+ return x.transpose(1, 2)
341
+ return x
342
+
343
+ def init_from_config_proj(
344
+ self, key: Literal["q", "o", "k", "v"], config: PretrainedConfig
345
+ ) -> torch.nn.Linear:
346
+ """Initialize the Linear proj in this module"""
347
+ num_heads = config.num_key_value_heads if key in {"k", "v"} else config.num_attention_heads
348
+ return torch.nn.Linear(
349
+ config.hidden_size,
350
+ num_heads * config.head_dim,
351
+ bias=config.attention_bias if key != "o" else False,
352
+ )
353
+
354
+
355
+ # NORMALISATION LAYER
356
+ def __rms_norm_forward__(
357
+ hidden_states: torch.Tensor, weight: torch.Tensor, variance_epsilon: float = 1e-6
358
+ ) -> torch.Tensor:
359
+ input_dtype = hidden_states.dtype
360
+ hidden_states = hidden_states.to(torch.float32)
361
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
362
+ hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
363
+ return weight * hidden_states.to(input_dtype)
364
+
365
+
366
+ class Helium1RMSNorm(nn.Module):
367
+ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
368
+ """
369
+ Helium1RMSNorm is equivalent to T5LayerNorm
370
+ """
371
+ super().__init__()
372
+ self.weight = nn.Parameter(torch.ones(hidden_size))
373
+ self.variance_epsilon = eps
374
+
375
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
376
+ return __rms_norm_forward__(hidden_states, self.weight, self.variance_epsilon)
377
+
378
+ def extra_repr(self):
379
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
380
+
381
+
382
+ def delta_w_factory_rms_norm(
383
+ org_lin: Helium1RMSNorm, new_lin: Helium1RMSNorm
384
+ ) -> Callable[[torch.Tensor], torch.Tensor]:
385
+ """Factory building an RMS norm whose weight is the sum of the two given layers' weights"""
386
+
387
+ def _delta_w_fwd(input: torch.Tensor) -> torch.Tensor:
388
+ nonlocal org_lin, new_lin
389
+ return __rms_norm_forward__(
390
+ input, org_lin.weight + new_lin.weight, new_lin.variance_epsilon
391
+ )
392
+
393
+ return _delta_w_fwd
394
+
395
+
396
+ # FULLY CONNECTED LAYER
397
+
398
+
399
+ class HeliumMLP(nn.Module):
400
+ def __init__(self, config: Helium1CASAConfig) -> None:
401
+ super().__init__()
402
+ self.config = config
403
+ self.hidden_size = config.hidden_size
404
+ self.intermediate_size = config.intermediate_size
405
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
406
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
407
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
408
+ self.act_fn = ACT2FN[config.hidden_act]
409
+
410
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
411
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
412
+ return down_proj
413
+
414
+
415
+ class HeliumDecoderLayer(nn.Module):
416
+ def __init__(self, config: Helium1CASAConfig, layer_idx: None | int = None):
417
+ super().__init__()
418
+ self.hidden_size = config.hidden_size
419
+ self.config = config
420
+ self.mlp = HeliumMLP(config)
421
+ self.input_layernorm = Helium1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
422
+ self.post_attention_layernorm = Helium1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
423
+
424
+ # Self-attention
425
+ self.self_attn = HeliumAttention(config=config, layer_idx=layer_idx)
426
+
427
+ # Set up the norm for the fusion mechanisms; note that this norm is applied to the text tokens
428
+ is_xa_layer = layer_idx is None or not config.xa_layers or layer_idx in config.xa_layers
429
+ self.norm_cross: None | Helium1RMSNorm = None
430
+ self.override_norm_cross: Callable[[torch.Tensor], torch.Tensor] | None = None
431
+ if is_xa_layer and config.casa_attention:
432
+ # Custom normalization layer for the extra fusion module
433
+ if self.config.xa_custom_norm:
434
+ self.norm_cross = Helium1RMSNorm(config.hidden_size)
435
+ if config.casa_delta_w:
436
+ self.override_norm_cross = delta_w_factory_rms_norm(
437
+ self.input_layernorm, self.norm_cross
438
+ )
439
+ with torch.no_grad():
440
+ torch.nn.init.ones_(self.norm_cross.weight)
441
+
442
+ # Set up an additional norm for image tokens, which is set in each individual mechanism
443
+ norm_on_images_fn = (
444
+ None
445
+ if not self.config.xa_norm_on_images
446
+ else self.override_norm_cross
447
+ if self.override_norm_cross is not None
448
+ else self.norm_cross.forward
449
+ if self.norm_cross is not None
450
+ else self.input_layernorm.forward
451
+ )
452
+
453
+ # CASA
454
+ self.casa_attn: Helium1CASAAttention | None = None
455
+ if config.casa_attention and is_xa_layer:
456
+ self.casa_attn = Helium1CASAAttention(
457
+ config, layer_idx, self_attn=self.self_attn, input_layernorm_fn=norm_on_images_fn
458
+ )
459
+
460
+ def forward(
461
+ self,
462
+ hidden_states: torch.Tensor,
463
+ attention_mask: None | torch.Tensor = None,
464
+ position_ids: None | torch.LongTensor = None,
465
+ past_key_values: None | Cache = None,
466
+ output_attentions: None | bool = False,
467
+ use_cache: None | bool = False,
468
+ cache_position: None | torch.LongTensor = None,
469
+ position_embeddings: None
470
+ | tuple[torch.Tensor, torch.Tensor] = None, # necessary, but kept here for BC
471
+ # CASA
472
+ casa_handler: CASAAttentionHandler | None = None,
473
+ cu_seqlens: torch.Tensor | None = None,
474
+ **kwargs: Unpack[FlashAttentionKwargs],
475
+ ) -> tuple[torch.Tensor, torch.Tensor] | tuple[torch.Tensor]:
476
+ # Image fusion mechanisms
477
+ apply_ca = self.casa_attn is not None
478
+ ca_update: torch.Tensor | None = None
479
+ if (
480
+ self.config.xa_order
481
+ in {
482
+ "parallel",
483
+ "ca_first",
484
+ "instead",
485
+ }
486
+ and apply_ca
487
+ ):
488
+ # Apply layer norm
489
+ assert self.norm_cross is not None
490
+ ca_input = (
491
+ self.override_norm_cross
492
+ if self.override_norm_cross is not None
493
+ else self.norm_cross
494
+ )(hidden_states)
495
+ # CASA
496
+ if self.casa_attn is not None:
497
+ ca_update = self.casa_attn(ca_input, casa_handler=casa_handler)
498
+
499
+ # If we're here, it's because we had proper inputs (no text-only samples)
500
+ # so the output should not be None
501
+ if ca_update is not None:
502
+ # `instead`: directly return the output of the CA module as residual
503
+ if self.config.xa_order == "instead":
504
+ outputs = (hidden_states + ca_update,)
505
+ if output_attentions:
506
+ outputs += (
507
+ torch.zeros((), device=ca_update.device, dtype=ca_update.dtype),
508
+ )
509
+ return outputs
510
+
511
+ # `ca_first`: update then continue with normal self-attention
512
+ if self.config.xa_order == "ca_first":
513
+ hidden_states = hidden_states + ca_update
514
+ ca_update = None
515
+
516
+ # Self Attention with initial input layer norm
517
+ residual = hidden_states
518
+ hidden_states, self_attn_weights = self.self_attn(
519
+ hidden_states=self.input_layernorm(hidden_states),
520
+ attention_mask=attention_mask,
521
+ position_ids=position_ids,
522
+ past_key_values=past_key_values,
523
+ output_attentions=output_attentions,
524
+ use_cache=use_cache,
525
+ cache_position=cache_position,
526
+ position_embeddings=position_embeddings,
527
+ cu_seqlens=cu_seqlens,
528
+ **kwargs,
529
+ )
530
+ hidden_states = residual + hidden_states
531
+
532
+ # parallel - residual update
533
+ if self.config.xa_order == "parallel" and apply_ca and ca_update is not None:
534
+ hidden_states = hidden_states + ca_update
535
+
536
+ # Fully Connected layer
537
+ residual = hidden_states
538
+ # MLP updates for image embeddings
539
+ if (
540
+ self.config.xa_update_image_embeds
541
+ and self.casa_attn is not None
542
+ and casa_handler is not None
543
+ and casa_handler.image_embeds is not None
544
+ ):
545
+ # Text flattening
546
+ hs = self.post_attention_layernorm(hidden_states).reshape(-1, hidden_states.shape[-1])
547
+ # Image flattening
548
+ img_seq_lengths = [_x.shape[0] for _x in casa_handler.image_embeds]
549
+ img_residual = torch.cat(list(casa_handler.image_embeds), dim=0)
550
+ update = self.mlp(torch.cat([hs, self.post_attention_layernorm(img_residual)], dim=0))
551
+ # update text
552
+ hidden_states = hidden_states + update[: hs.shape[0]].reshape(hidden_states.shape)
553
+ casa_handler.image_embeds = list(
554
+ torch.split(img_residual + update[hs.shape[0] :], img_seq_lengths)
555
+ )
556
+ else:
557
+ hidden_states = self.mlp(self.post_attention_layernorm(hidden_states))
558
+ hidden_states = residual + hidden_states
559
+
560
+ # Outputs
561
+ outputs = (hidden_states,)
562
+ if output_attentions:
563
+ outputs += (self_attn_weights,)
564
+
565
+ return outputs
566
+
567
+
568
+ # FULL HELIUM MODEL
569
+
570
+
571
+ @dataclass
572
+ class CausalHeliumOutput(CausalLMOutputWithPast):
573
+ attention_mask: Optional[torch.Tensor] = None
574
+ num_image_tokens_log: Optional[torch.Tensor] = None
575
+ num_text_tokens_log: Optional[torch.Tensor] = None
576
+
577
+
578
+ class Helium1PreTrainedModel(PreTrainedModel):
579
+ config_class = Helium1CASAConfig
580
+ base_model_prefix = "model"
581
+ supports_gradient_checkpointing = True
582
+ _no_split_modules = ["HeliumDecoderLayer"]
583
+ _skip_keys_device_placement = ["past_key_values"]
584
+ _supports_flash_attn_2 = True
585
+ _supports_sdpa = True
586
+ _supports_flex_attn = True
587
+ _supports_cache_class = True
588
+ _supports_quantized_cache = True
589
+ _supports_static_cache = True
590
+ _supports_attention_backend = True
591
+
592
+ def _init_weights(self, module: torch.nn.Module) -> None:
593
+ std = self.config.initializer_range
594
+ if isinstance(module, nn.Linear):
595
+ module.weight.data.normal_(mean=0.0, std=std)
596
+ if module.bias is not None:
597
+ module.bias.data.zero_()
598
+ elif isinstance(module, nn.Embedding):
599
+ module.weight.data.normal_(mean=0.0, std=std)
600
+ if module.padding_idx is not None:
601
+ module.weight.data[module.padding_idx].zero_()
602
+ elif isinstance(module, Helium1RMSNorm):
603
+ module.weight.data.fill_(1.0)
604
+
605
+
606
+ class Helium1Model(Helium1PreTrainedModel):
607
+ """
608
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HeliumDecoderLayer`]
609
+
610
+ Args:
611
+ config: Helium1CASAConfig
612
+ """
613
+
614
+ def __init__(self, config: Helium1CASAConfig):
615
+ Helium1PreTrainedModel.__init__(self, config)
616
+ self.training: bool
617
+ self._gradient_checkpointing_func: Callable
618
+ self.config = config
619
+ self.padding_idx = config.pad_token_id
620
+ self.vocab_size = config.vocab_size
621
+
622
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
623
+ self.layers = nn.ModuleList(
624
+ [HeliumDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
625
+ )
626
+ self.norm = Helium1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
627
+ self.rotary_emb = HeliumRotaryEmbedding(config=config)
628
+ self.gradient_checkpointing = False
629
+
630
+ # Initialize weights and apply final processing
631
+ self.post_init()
632
+
633
+ def get_input_embeddings(self):
634
+ return self.embed_tokens
635
+
636
+ def set_input_embeddings(self, value: nn.Module) -> None:
637
+ self.embed_tokens = value
638
+
639
+ @can_return_tuple
640
+ def forward(
641
+ self,
642
+ input_ids: None | torch.LongTensor = None,
643
+ attention_mask: None | torch.Tensor = None,
644
+ position_ids: None | torch.Tensor = None,
645
+ past_key_values: None | DynamicCache = None,
646
+ inputs_embeds: None | torch.Tensor = None,
647
+ use_cache: None | bool = None,
648
+ output_attentions: None | bool = None,
649
+ output_hidden_states: None | bool = None,
650
+ cache_position: None | torch.Tensor = None,
651
+ # Insertion
652
+ image_tokens_mask: torch.Tensor | None = None,
653
+ # CASA
654
+ casa_handler: CASAAttentionHandler | None = None,
655
+ cu_seqlens: torch.Tensor | None = None,
656
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
657
+ ) -> BaseModelOutputWithPast:
658
+ output_attentions = (
659
+ output_attentions if output_attentions is not None else self.config.output_attentions
660
+ )
661
+ output_hidden_states = (
662
+ output_hidden_states
663
+ if output_hidden_states is not None
664
+ else self.config.output_hidden_states
665
+ )
666
+ use_cache = not self.training and (
667
+ use_cache if use_cache is not None else self.config.use_cache
668
+ )
669
+
670
+ if (input_ids is None) ^ (inputs_embeds is not None):
671
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
672
+
673
+ if self.gradient_checkpointing and self.training and use_cache:
674
+ print(
675
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
676
+ )
677
+ use_cache = False
678
+
679
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
680
+ if not isinstance(past_key_values, (type(None), Cache)):
681
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
682
+
683
+ if inputs_embeds is None:
684
+ inputs_embeds = self.embed_tokens(input_ids)
685
+ assert inputs_embeds is not None
686
+
687
+ if use_cache and past_key_values is None:
688
+ past_key_values = DynamicCache()
689
+
690
+ if cache_position is None:
691
+ past_seen_tokens = 0 if past_key_values is None else past_key_values._seen_tokens
692
+ assert inputs_embeds is not None
693
+ cache_position = torch.arange(
694
+ past_seen_tokens,
695
+ past_seen_tokens + inputs_embeds.shape[1],
696
+ device=inputs_embeds.device,
697
+ )
698
+ assert cache_position is not None
699
+
700
+ if position_ids is None:
701
+ position_ids = cache_position.unsqueeze(0)
702
+
703
+ # Get attention mask
704
+ causal_mask: None | torch.Tensor = self._update_causal_mask(
705
+ attention_mask,
706
+ inputs_embeds,
707
+ cache_position,
708
+ past_key_values,
709
+ output_attentions,
710
+ force_mask=False,
711
+ )
712
+
713
+ # create position embeddings to be shared across the decoder layers
714
+ hidden_states = inputs_embeds
715
+ position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
716
+
717
+ # decoder layers
718
+ all_hidden_states = () if output_hidden_states else None
719
+ all_self_attns = () if output_attentions else None
720
+
721
+ for decoder_layer_idx, decoder_layer in enumerate(
722
+ self.layers[: self.config.num_hidden_layers]
723
+ ):
724
+ is_xa_layer = not self.config.xa_layers or decoder_layer_idx in self.config.xa_layers
725
+ if output_hidden_states:
726
+ if all_hidden_states is None:
727
+ all_hidden_states = ()
728
+ all_hidden_states += (hidden_states,)
729
+
730
+ if self.gradient_checkpointing and self.training:
731
+ layer_outputs = self._gradient_checkpointing_func(
732
+ partial(decoder_layer.__call__, **flash_attn_kwargs),
733
+ hidden_states,
734
+ causal_mask,
735
+ position_ids,
736
+ past_key_values,
737
+ output_attentions,
738
+ use_cache,
739
+ cache_position,
740
+ position_embeddings,
741
+ casa_handler if is_xa_layer else None,
742
+ cu_seqlens,
743
+ )
744
+ else:
745
+ layer_outputs = decoder_layer(
746
+ hidden_states,
747
+ attention_mask=causal_mask,
748
+ position_ids=position_ids,
749
+ past_key_values=past_key_values,
750
+ output_attentions=output_attentions,
751
+ use_cache=use_cache,
752
+ cache_position=cache_position,
753
+ position_embeddings=position_embeddings,
754
+ casa_handler=casa_handler if is_xa_layer else None,
755
+ cu_seqlens=cu_seqlens,
756
+ **flash_attn_kwargs,
757
+ )
758
+
759
+ hidden_states = layer_outputs[0]
760
+
761
+ if output_attentions:
762
+ if all_self_attns is None:
763
+ all_self_attns = ()
764
+ all_self_attns += (layer_outputs[1],)
765
+
766
+ hidden_states = self.norm(hidden_states)
767
+
768
+ # add hidden states from the last decoder layer
769
+ if output_hidden_states:
770
+ if all_hidden_states is None:
771
+ all_hidden_states = ()
772
+ all_hidden_states += (hidden_states,)
773
+
774
+ return BaseModelOutputWithPast(
775
+ last_hidden_state=hidden_states,
776
+ past_key_values=past_key_values if use_cache else None, # pyright: ignore[reportArgumentType]
777
+ hidden_states=all_hidden_states, # pyright: ignore[reportArgumentType]
778
+ attentions=all_self_attns,
779
+ )
780
+
781
+ def _update_causal_mask(
782
+ self,
783
+ attention_mask: torch.Tensor | None,
784
+ input_tensor: torch.Tensor,
785
+ cache_position: torch.Tensor,
786
+ past_key_values: None | DynamicCache | Cache,
787
+ output_attentions: bool = False,
788
+ force_mask: bool = False,
789
+ ) -> torch.Tensor | None:
790
+ if self.config._attn_implementation == "flex_attention":
791
+ if isinstance(attention_mask, torch.Tensor):
792
+ attention_mask = make_flex_block_causal_mask(attention_mask) # type: ignore
793
+ return attention_mask
794
+
795
+ assert attention_mask is None or isinstance(attention_mask, torch.Tensor)
796
+ if self.config._attn_implementation == "flash_attention_2":
797
+ if attention_mask is not None and (force_mask or (attention_mask == 0.0).any()):
798
+ return attention_mask
799
+ return None
800
+
801
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
802
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
803
+ # to infer the attention mask.
804
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
805
+ using_compilable_cache = (
806
+ past_key_values.is_compileable if past_key_values is not None else False
807
+ )
808
+
809
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
810
+ if (
811
+ self.config._attn_implementation == "sdpa"
812
+ and not using_compilable_cache
813
+ and not output_attentions
814
+ ):
815
+ if not force_mask and AttentionMaskConverter._ignore_causal_mask_sdpa(
816
+ attention_mask,
817
+ inputs_embeds=input_tensor,
818
+ past_key_values_length=past_seen_tokens,
819
+ is_training=self.training,
820
+ ):
821
+ return None
822
+
823
+ dtype = input_tensor.dtype
824
+ sequence_length = input_tensor.shape[1]
825
+ if using_compilable_cache and past_key_values is not None:
826
+ target_length = past_key_values.get_max_cache_shape()
827
+ else:
828
+ target_length = (
829
+ attention_mask.shape[-1]
830
+ if isinstance(attention_mask, torch.Tensor)
831
+ else past_seen_tokens + sequence_length
832
+ )
833
+
834
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
835
+ assert target_length is not None
836
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
837
+ attention_mask,
838
+ sequence_length=sequence_length,
839
+ target_length=target_length,
840
+ dtype=dtype,
841
+ cache_position=cache_position,
842
+ batch_size=input_tensor.shape[0],
843
+ )
844
+
845
+ if (
846
+ self.config._attn_implementation == "sdpa"
847
+ and attention_mask is not None
848
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
849
+ and not output_attentions
850
+ ):
851
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
852
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
853
+ # Details: https://github.com/pytorch/pytorch/issues/110213
854
+ min_dtype = torch.finfo(dtype).min
855
+ causal_mask = AttentionMaskConverter._unmask_unattended(
856
+ type_cast(torch.FloatTensor, causal_mask), min_dtype
857
+ )
858
+
859
+ return causal_mask
860
+
861
+ @staticmethod
862
+ def _prepare_4d_causal_attention_mask_with_cache_position(
863
+ attention_mask: torch.Tensor | None,
864
+ sequence_length: int,
865
+ target_length: int,
866
+ dtype: torch.dtype,
867
+ cache_position: torch.Tensor,
868
+ batch_size: int,
869
+ **kwargs: Any,
870
+ ):
871
+ """
872
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
873
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
874
+
875
+ Args:
876
+ attention_mask (`torch.Tensor`):
877
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
878
+ `(batch_size, 1, query_length, key_value_length)`.
879
+ sequence_length (`int`):
880
+ The sequence length being processed.
881
+ target_length (`int`):
882
+ The target length: when generating with static cache, the mask should be as long as the static cache,
883
+ to account for the 0 padding, the part of the cache that is not filled yet.
884
+ dtype (`torch.dtype`):
885
+ The dtype to use for the 4D attention mask.
886
+ cache_position (`torch.Tensor`):
887
+ Indices depicting the position of the input sequence tokens in the sequence.
888
+ batch_size (`int`):
889
+ Batch size.
890
+ """
891
+ del kwargs
892
+ if attention_mask is not None and attention_mask.dim() == 4:
893
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
894
+ causal_mask = attention_mask
895
+ else:
896
+ min_dtype = torch.finfo(dtype).min
897
+ causal_mask = torch.full(
898
+ (sequence_length, target_length),
899
+ fill_value=min_dtype,
900
+ dtype=dtype,
901
+ device=cache_position.device,
902
+ )
903
+ if sequence_length != 1:
904
+ causal_mask = torch.triu(causal_mask, diagonal=1)
905
+ causal_mask *= torch.arange(
906
+ target_length, device=cache_position.device
907
+ ) > cache_position.reshape(-1, 1)
908
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
909
+ if attention_mask is not None:
910
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
911
+ mask_length = attention_mask.shape[-1]
912
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
913
+ :, None, None, :
914
+ ].to(causal_mask.device)
915
+ padding_mask = padding_mask == 0
916
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
917
+ padding_mask, min_dtype
918
+ )
919
+
920
+ return causal_mask
921
+
922
+
923
+ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
924
+
925
+
926
+ class Helium1ForCausalLM(Helium1PreTrainedModel, GenerationMixin):
927
+ _tied_weights_keys = ["lm_head.weight"]
928
+ _tp_plan = {"lm_head": "colwise_rep"}
929
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
930
+
931
+ def __init__(self, config: Helium1CASAConfig, **kwargs: Any) -> None:
932
+ del kwargs
933
+ super().__init__(config)
934
+ self.model: Helium1Model
935
+ self.model = Helium1Model(config)
936
+ self.vocab_size = config.vocab_size
937
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
938
+ self._loss_function = ForCausalLMLoss
939
+
940
+ def get_input_embeddings(self) -> nn.Module:
941
+ return self.model.embed_tokens
942
+
943
+ def set_input_embeddings(self, value: nn.Module) -> None:
944
+ self.model.embed_tokens = value
945
+
946
+ def get_output_embeddings(self) -> nn.Module:
947
+ return self.lm_head
948
+
949
+ def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
950
+ self.lm_head = new_embeddings
951
+
952
+ def set_decoder(self, decoder: Helium1Model) -> None:
953
+ self.model = decoder
954
+
955
+ def get_decoder(self) -> Helium1Model:
956
+ return self.model
957
+
958
+ @can_return_tuple
959
+ def forward(
960
+ self,
961
+ input_ids: None | torch.LongTensor = None,
962
+ attention_mask: None | torch.Tensor = None,
963
+ position_ids: None | torch.LongTensor = None,
964
+ past_key_values: None | Cache = None,
965
+ inputs_embeds: None | torch.Tensor = None,
966
+ image_embeds: None | torch.Tensor | list[torch.Tensor] = None,
967
+ image_embeds_insertion_points: None | list[torch.Tensor] = None,
968
+ labels: None | torch.LongTensor = None,
969
+ use_cache: None | bool = None,
970
+ output_attentions: None | bool = None,
971
+ output_hidden_states: None | bool = None,
972
+ cache_position: None | torch.LongTensor = None,
973
+ logits_to_keep: int | torch.Tensor = 0,
974
+ # CASA
975
+ casa_windows_info: None | dict = None,
976
+ **kwargs: Unpack[KwargsForCausalLM],
977
+ ) -> CausalHeliumOutput:
978
+ r"""
979
+ Helium1 augmented with CASA layers
980
+ """
981
+ output_attentions = (
982
+ output_attentions if output_attentions is not None else self.config.output_attentions
983
+ )
984
+ output_hidden_states = (
985
+ output_hidden_states
986
+ if output_hidden_states is not None
987
+ else self.config.output_hidden_states
988
+ )
989
+ if input_ids is not None:
990
+ assert inputs_embeds is None, (
991
+ "Need to provide only one of `input_ids` or `inputs_embeds`."
992
+ )
993
+ inputs_embeds = self.model.embed_tokens(input_ids)
994
+ assert inputs_embeds is not None
995
+
996
+ # Setup image + text token fusion
997
+ bs, og_seq_len, _ = inputs_embeds.shape
998
+ image_tokens_mask: torch.Tensor | None = None
999
+ casa_handler: CASAAttentionHandler | None = None
1000
+
1001
+ num_image_tokens = -1
1002
+ if image_embeds is not None:
1003
+ num_image_tokens = sum(_x.shape[0] for _x in image_embeds)
1004
+ assert image_embeds_insertion_points is not None, (
1005
+ "Missing image embeddings insertion points"
1006
+ )
1007
+ # B1. CASA layers: We need to init the shared Handler
1008
+ if self.model.config.casa_attention:
1009
+ casa_handler = CASAAttentionHandler(
1010
+ # for text tokens, we don't need the actual values
1011
+ inputs_embeds=torch.zeros_like(inputs_embeds),
1012
+ # for image embeddings, we put real inputs as this will be fixed
1013
+ image_embeds=image_embeds,
1014
+ image_embeds_insertion_points=image_embeds_insertion_points,
1015
+ # attention mask is only needed at inference / left padding
1016
+ attention_mask=None if self.training else attention_mask,
1017
+ rope_fn=self.model.rotary_emb,
1018
+ windows=self.model.config.casa_windows,
1019
+ use_asymetric_q_kv=self.model.config.casa_use_asymetric_qkv,
1020
+ # further params are fed to the function computing attention
1021
+ casa_windows_info=casa_windows_info,
1022
+ )
1023
+ # B2. Direct image insertion
1024
+ else:
1025
+ inputs_embeds, _, attention_mask, image_tokens_mask = insert_image_tokens(
1026
+ inputs_embeds=inputs_embeds,
1027
+ image_embeds=image_embeds,
1028
+ image_embeds_insertion_points=image_embeds_insertion_points,
1029
+ attention_mask=attention_mask,
1030
+ padding_side="right" if self.training else "left",
1031
+ recover_batch_dim=True,
1032
+ )
1033
+
1034
+ del image_embeds
1035
+ del input_ids
1036
+ outputs: BaseModelOutputWithPast = self.model(
1037
+ inputs_embeds=inputs_embeds,
1038
+ attention_mask=attention_mask,
1039
+ position_ids=position_ids,
1040
+ past_key_values=past_key_values,
1041
+ use_cache=use_cache,
1042
+ output_attentions=output_attentions,
1043
+ output_hidden_states=output_hidden_states,
1044
+ cache_position=cache_position,
1045
+ image_tokens_mask=image_tokens_mask,
1046
+ casa_handler=casa_handler,
1047
+ **kwargs,
1048
+ )
1049
+
1050
+ hidden_states = outputs.last_hidden_state
1051
+ assert hidden_states is not None
1052
+ if image_tokens_mask is not None:
1053
+ hidden_states = remove_image_tokens(hidden_states, image_tokens_mask)
1054
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1055
+ slice_indices = (
1056
+ slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1057
+ )
1058
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1059
+
1060
+ loss = None
1061
+ if labels is not None:
1062
+ loss = self.loss_function(
1063
+ logits=logits,
1064
+ labels=labels,
1065
+ vocab_size=self.config.vocab_size,
1066
+ **kwargs,
1067
+ )
1068
+ out = CausalHeliumOutput(
1069
+ loss=loss,
1070
+ logits=logits,
1071
+ past_key_values=outputs.past_key_values,
1072
+ hidden_states=outputs.hidden_states,
1073
+ attentions=outputs.attentions,
1074
+ num_image_tokens_log=torch.tensor(num_image_tokens).to(logits.device).to(torch.float),
1075
+ num_text_tokens_log=torch.tensor(og_seq_len).to(logits.device).to(torch.float),
1076
+ )
1077
+ return out
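The 4D mask construction above is easy to misread, so the sketch below re-implements its core logic standalone (copied for illustration rather than imported, to avoid the module's relative imports). It shows how a causal pattern and a 2D padding mask combine into the additive mask consumed by the attention layers; the helper name `build_causal_mask` and all values are illustrative, not part of the released code.

```python
# Standalone sketch of the 4D causal-mask construction used above
# (logic re-implemented for illustration; not imported from the model file).
import torch

def build_causal_mask(pad_mask_2d: torch.Tensor, seq_len: int, target_len: int,
                      dtype: torch.dtype = torch.float32) -> torch.Tensor:
    min_dtype = torch.finfo(dtype).min
    cache_position = torch.arange(seq_len)
    # Start fully blocked, then keep the large negative value only strictly above the diagonal.
    mask = torch.full((seq_len, target_len), min_dtype, dtype=dtype)
    mask = torch.triu(mask, diagonal=1)
    mask *= torch.arange(target_len) > cache_position.reshape(-1, 1)
    mask = mask[None, None].expand(pad_mask_2d.shape[0], 1, -1, -1).clone()
    # Additionally block padded key positions (pad_mask_2d: 1 = real token, 0 = padding).
    key_len = pad_mask_2d.shape[-1]
    padding = (mask[..., :key_len] + pad_mask_2d[:, None, None, :]) == 0
    mask[..., :key_len] = mask[..., :key_len].masked_fill(padding, min_dtype)
    return mask

pad = torch.tensor([[0, 1, 1, 1]])  # one left-padded position in a batch of one
print(build_causal_mask(pad, seq_len=4, target_len=4)[0, 0])
# 0.0 where attention is allowed, a large negative value where it is blocked
```

A fully blocked first row, as in this example, is exactly the case that `AttentionMaskConverter._unmask_unattended` later re-opens so the SDPA memory-efficient path does not produce NaNs.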
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa6d71d108b8e3d968936d7d61e5928a63a8967cb26dd2c88ee942d9c84164a7
3
+ size 4936992240
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1acdddfff9557107c017f04e8e6b0f47244b07a139cf1849b49c0ee983097968
3
+ size 4993844784
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6656ee483ebfc9903325da8a4418a9713429ca6d60c4ce554c3fbdfe953c4b8
3
+ size 836448912
model.safetensors.index.json ADDED
@@ -0,0 +1,653 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 10767208448
4
+ },
5
+ "weight_map": {
6
+ "image_prefix.enc.visual.blocks.0.attn.proj.bias": "model-00002-of-00003.safetensors",
7
+ "image_prefix.enc.visual.blocks.0.attn.proj.weight": "model-00002-of-00003.safetensors",
8
+ "image_prefix.enc.visual.blocks.0.attn.qkv.bias": "model-00002-of-00003.safetensors",
9
+ "image_prefix.enc.visual.blocks.0.attn.qkv.weight": "model-00002-of-00003.safetensors",
10
+ "image_prefix.enc.visual.blocks.0.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
11
+ "image_prefix.enc.visual.blocks.0.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
12
+ "image_prefix.enc.visual.blocks.0.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
13
+ "image_prefix.enc.visual.blocks.0.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
14
+ "image_prefix.enc.visual.blocks.0.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
15
+ "image_prefix.enc.visual.blocks.0.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
16
+ "image_prefix.enc.visual.blocks.0.norm1.weight": "model-00002-of-00003.safetensors",
17
+ "image_prefix.enc.visual.blocks.0.norm2.weight": "model-00002-of-00003.safetensors",
18
+ "image_prefix.enc.visual.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors",
19
+ "image_prefix.enc.visual.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors",
20
+ "image_prefix.enc.visual.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors",
21
+ "image_prefix.enc.visual.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors",
22
+ "image_prefix.enc.visual.blocks.1.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
23
+ "image_prefix.enc.visual.blocks.1.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
24
+ "image_prefix.enc.visual.blocks.1.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
25
+ "image_prefix.enc.visual.blocks.1.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
26
+ "image_prefix.enc.visual.blocks.1.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
27
+ "image_prefix.enc.visual.blocks.1.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
28
+ "image_prefix.enc.visual.blocks.1.norm1.weight": "model-00002-of-00003.safetensors",
29
+ "image_prefix.enc.visual.blocks.1.norm2.weight": "model-00002-of-00003.safetensors",
30
+ "image_prefix.enc.visual.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors",
31
+ "image_prefix.enc.visual.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors",
32
+ "image_prefix.enc.visual.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors",
33
+ "image_prefix.enc.visual.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors",
34
+ "image_prefix.enc.visual.blocks.10.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
35
+ "image_prefix.enc.visual.blocks.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
36
+ "image_prefix.enc.visual.blocks.10.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
37
+ "image_prefix.enc.visual.blocks.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "image_prefix.enc.visual.blocks.10.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
39
+ "image_prefix.enc.visual.blocks.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
40
+ "image_prefix.enc.visual.blocks.10.norm1.weight": "model-00002-of-00003.safetensors",
41
+ "image_prefix.enc.visual.blocks.10.norm2.weight": "model-00002-of-00003.safetensors",
42
+ "image_prefix.enc.visual.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors",
43
+ "image_prefix.enc.visual.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors",
44
+ "image_prefix.enc.visual.blocks.11.attn.qkv.bias": "model-00002-of-00003.safetensors",
45
+ "image_prefix.enc.visual.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors",
46
+ "image_prefix.enc.visual.blocks.11.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
47
+ "image_prefix.enc.visual.blocks.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
48
+ "image_prefix.enc.visual.blocks.11.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
49
+ "image_prefix.enc.visual.blocks.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
50
+ "image_prefix.enc.visual.blocks.11.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
51
+ "image_prefix.enc.visual.blocks.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
52
+ "image_prefix.enc.visual.blocks.11.norm1.weight": "model-00002-of-00003.safetensors",
53
+ "image_prefix.enc.visual.blocks.11.norm2.weight": "model-00002-of-00003.safetensors",
54
+ "image_prefix.enc.visual.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors",
55
+ "image_prefix.enc.visual.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors",
56
+ "image_prefix.enc.visual.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors",
57
+ "image_prefix.enc.visual.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors",
58
+ "image_prefix.enc.visual.blocks.12.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
59
+ "image_prefix.enc.visual.blocks.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
60
+ "image_prefix.enc.visual.blocks.12.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
61
+ "image_prefix.enc.visual.blocks.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
62
+ "image_prefix.enc.visual.blocks.12.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
63
+ "image_prefix.enc.visual.blocks.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
64
+ "image_prefix.enc.visual.blocks.12.norm1.weight": "model-00002-of-00003.safetensors",
65
+ "image_prefix.enc.visual.blocks.12.norm2.weight": "model-00002-of-00003.safetensors",
66
+ "image_prefix.enc.visual.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors",
67
+ "image_prefix.enc.visual.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors",
68
+ "image_prefix.enc.visual.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors",
69
+ "image_prefix.enc.visual.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors",
70
+ "image_prefix.enc.visual.blocks.13.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
71
+ "image_prefix.enc.visual.blocks.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
72
+ "image_prefix.enc.visual.blocks.13.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
73
+ "image_prefix.enc.visual.blocks.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "image_prefix.enc.visual.blocks.13.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
75
+ "image_prefix.enc.visual.blocks.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
76
+ "image_prefix.enc.visual.blocks.13.norm1.weight": "model-00002-of-00003.safetensors",
77
+ "image_prefix.enc.visual.blocks.13.norm2.weight": "model-00002-of-00003.safetensors",
78
+ "image_prefix.enc.visual.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors",
79
+ "image_prefix.enc.visual.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors",
80
+ "image_prefix.enc.visual.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors",
81
+ "image_prefix.enc.visual.blocks.14.attn.qkv.weight": "model-00002-of-00003.safetensors",
82
+ "image_prefix.enc.visual.blocks.14.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
83
+ "image_prefix.enc.visual.blocks.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
84
+ "image_prefix.enc.visual.blocks.14.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
85
+ "image_prefix.enc.visual.blocks.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
86
+ "image_prefix.enc.visual.blocks.14.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
87
+ "image_prefix.enc.visual.blocks.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
88
+ "image_prefix.enc.visual.blocks.14.norm1.weight": "model-00002-of-00003.safetensors",
89
+ "image_prefix.enc.visual.blocks.14.norm2.weight": "model-00002-of-00003.safetensors",
90
+ "image_prefix.enc.visual.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors",
91
+ "image_prefix.enc.visual.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors",
92
+ "image_prefix.enc.visual.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors",
93
+ "image_prefix.enc.visual.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors",
94
+ "image_prefix.enc.visual.blocks.15.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
95
+ "image_prefix.enc.visual.blocks.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
96
+ "image_prefix.enc.visual.blocks.15.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
97
+ "image_prefix.enc.visual.blocks.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
98
+ "image_prefix.enc.visual.blocks.15.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
99
+ "image_prefix.enc.visual.blocks.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
100
+ "image_prefix.enc.visual.blocks.15.norm1.weight": "model-00002-of-00003.safetensors",
101
+ "image_prefix.enc.visual.blocks.15.norm2.weight": "model-00002-of-00003.safetensors",
102
+ "image_prefix.enc.visual.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors",
103
+ "image_prefix.enc.visual.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors",
104
+ "image_prefix.enc.visual.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors",
105
+ "image_prefix.enc.visual.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors",
106
+ "image_prefix.enc.visual.blocks.16.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
107
+ "image_prefix.enc.visual.blocks.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
108
+ "image_prefix.enc.visual.blocks.16.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
109
+ "image_prefix.enc.visual.blocks.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "image_prefix.enc.visual.blocks.16.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
111
+ "image_prefix.enc.visual.blocks.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
112
+ "image_prefix.enc.visual.blocks.16.norm1.weight": "model-00002-of-00003.safetensors",
113
+ "image_prefix.enc.visual.blocks.16.norm2.weight": "model-00002-of-00003.safetensors",
114
+ "image_prefix.enc.visual.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors",
115
+ "image_prefix.enc.visual.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors",
116
+ "image_prefix.enc.visual.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors",
117
+ "image_prefix.enc.visual.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors",
118
+ "image_prefix.enc.visual.blocks.17.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
119
+ "image_prefix.enc.visual.blocks.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
120
+ "image_prefix.enc.visual.blocks.17.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
121
+ "image_prefix.enc.visual.blocks.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
122
+ "image_prefix.enc.visual.blocks.17.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
123
+ "image_prefix.enc.visual.blocks.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
124
+ "image_prefix.enc.visual.blocks.17.norm1.weight": "model-00002-of-00003.safetensors",
125
+ "image_prefix.enc.visual.blocks.17.norm2.weight": "model-00002-of-00003.safetensors",
126
+ "image_prefix.enc.visual.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors",
127
+ "image_prefix.enc.visual.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors",
128
+ "image_prefix.enc.visual.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors",
129
+ "image_prefix.enc.visual.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors",
130
+ "image_prefix.enc.visual.blocks.18.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
131
+ "image_prefix.enc.visual.blocks.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
132
+ "image_prefix.enc.visual.blocks.18.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
133
+ "image_prefix.enc.visual.blocks.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
134
+ "image_prefix.enc.visual.blocks.18.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
135
+ "image_prefix.enc.visual.blocks.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
136
+ "image_prefix.enc.visual.blocks.18.norm1.weight": "model-00002-of-00003.safetensors",
137
+ "image_prefix.enc.visual.blocks.18.norm2.weight": "model-00002-of-00003.safetensors",
138
+ "image_prefix.enc.visual.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors",
139
+ "image_prefix.enc.visual.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors",
140
+ "image_prefix.enc.visual.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors",
141
+ "image_prefix.enc.visual.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors",
142
+ "image_prefix.enc.visual.blocks.19.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
143
+ "image_prefix.enc.visual.blocks.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
144
+ "image_prefix.enc.visual.blocks.19.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
145
+ "image_prefix.enc.visual.blocks.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
+ "image_prefix.enc.visual.blocks.19.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
147
+ "image_prefix.enc.visual.blocks.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
148
+ "image_prefix.enc.visual.blocks.19.norm1.weight": "model-00002-of-00003.safetensors",
149
+ "image_prefix.enc.visual.blocks.19.norm2.weight": "model-00002-of-00003.safetensors",
150
+ "image_prefix.enc.visual.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors",
151
+ "image_prefix.enc.visual.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors",
152
+ "image_prefix.enc.visual.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors",
153
+ "image_prefix.enc.visual.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors",
154
+ "image_prefix.enc.visual.blocks.2.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
155
+ "image_prefix.enc.visual.blocks.2.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
156
+ "image_prefix.enc.visual.blocks.2.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
157
+ "image_prefix.enc.visual.blocks.2.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
158
+ "image_prefix.enc.visual.blocks.2.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
159
+ "image_prefix.enc.visual.blocks.2.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
160
+ "image_prefix.enc.visual.blocks.2.norm1.weight": "model-00002-of-00003.safetensors",
161
+ "image_prefix.enc.visual.blocks.2.norm2.weight": "model-00002-of-00003.safetensors",
162
+ "image_prefix.enc.visual.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors",
163
+ "image_prefix.enc.visual.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors",
164
+ "image_prefix.enc.visual.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors",
165
+ "image_prefix.enc.visual.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors",
166
+ "image_prefix.enc.visual.blocks.20.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
167
+ "image_prefix.enc.visual.blocks.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
168
+ "image_prefix.enc.visual.blocks.20.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
169
+ "image_prefix.enc.visual.blocks.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
170
+ "image_prefix.enc.visual.blocks.20.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
171
+ "image_prefix.enc.visual.blocks.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
172
+ "image_prefix.enc.visual.blocks.20.norm1.weight": "model-00002-of-00003.safetensors",
173
+ "image_prefix.enc.visual.blocks.20.norm2.weight": "model-00002-of-00003.safetensors",
174
+ "image_prefix.enc.visual.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors",
175
+ "image_prefix.enc.visual.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors",
176
+ "image_prefix.enc.visual.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors",
177
+ "image_prefix.enc.visual.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors",
178
+ "image_prefix.enc.visual.blocks.21.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
179
+ "image_prefix.enc.visual.blocks.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
180
+ "image_prefix.enc.visual.blocks.21.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
181
+ "image_prefix.enc.visual.blocks.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
182
+ "image_prefix.enc.visual.blocks.21.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
183
+ "image_prefix.enc.visual.blocks.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
184
+ "image_prefix.enc.visual.blocks.21.norm1.weight": "model-00002-of-00003.safetensors",
185
+ "image_prefix.enc.visual.blocks.21.norm2.weight": "model-00002-of-00003.safetensors",
186
+ "image_prefix.enc.visual.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors",
187
+ "image_prefix.enc.visual.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors",
188
+ "image_prefix.enc.visual.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors",
189
+ "image_prefix.enc.visual.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors",
190
+ "image_prefix.enc.visual.blocks.22.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
191
+ "image_prefix.enc.visual.blocks.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
192
+ "image_prefix.enc.visual.blocks.22.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
193
+ "image_prefix.enc.visual.blocks.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
194
+ "image_prefix.enc.visual.blocks.22.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
195
+ "image_prefix.enc.visual.blocks.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
196
+ "image_prefix.enc.visual.blocks.22.norm1.weight": "model-00002-of-00003.safetensors",
197
+ "image_prefix.enc.visual.blocks.22.norm2.weight": "model-00002-of-00003.safetensors",
198
+ "image_prefix.enc.visual.blocks.23.attn.proj.bias": "model-00003-of-00003.safetensors",
199
+ "image_prefix.enc.visual.blocks.23.attn.proj.weight": "model-00003-of-00003.safetensors",
200
+ "image_prefix.enc.visual.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors",
201
+ "image_prefix.enc.visual.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors",
202
+ "image_prefix.enc.visual.blocks.23.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
203
+ "image_prefix.enc.visual.blocks.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
204
+ "image_prefix.enc.visual.blocks.23.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
205
+ "image_prefix.enc.visual.blocks.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
206
+ "image_prefix.enc.visual.blocks.23.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
207
+ "image_prefix.enc.visual.blocks.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
208
+ "image_prefix.enc.visual.blocks.23.norm1.weight": "model-00002-of-00003.safetensors",
209
+ "image_prefix.enc.visual.blocks.23.norm2.weight": "model-00002-of-00003.safetensors",
210
+ "image_prefix.enc.visual.blocks.24.attn.proj.bias": "model-00003-of-00003.safetensors",
211
+ "image_prefix.enc.visual.blocks.24.attn.proj.weight": "model-00003-of-00003.safetensors",
212
+ "image_prefix.enc.visual.blocks.24.attn.qkv.bias": "model-00003-of-00003.safetensors",
213
+ "image_prefix.enc.visual.blocks.24.attn.qkv.weight": "model-00003-of-00003.safetensors",
214
+ "image_prefix.enc.visual.blocks.24.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
215
+ "image_prefix.enc.visual.blocks.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
216
+ "image_prefix.enc.visual.blocks.24.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
217
+ "image_prefix.enc.visual.blocks.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
218
+ "image_prefix.enc.visual.blocks.24.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
219
+ "image_prefix.enc.visual.blocks.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
220
+ "image_prefix.enc.visual.blocks.24.norm1.weight": "model-00003-of-00003.safetensors",
221
+ "image_prefix.enc.visual.blocks.24.norm2.weight": "model-00003-of-00003.safetensors",
222
+ "image_prefix.enc.visual.blocks.25.attn.proj.bias": "model-00003-of-00003.safetensors",
223
+ "image_prefix.enc.visual.blocks.25.attn.proj.weight": "model-00003-of-00003.safetensors",
224
+ "image_prefix.enc.visual.blocks.25.attn.qkv.bias": "model-00003-of-00003.safetensors",
225
+ "image_prefix.enc.visual.blocks.25.attn.qkv.weight": "model-00003-of-00003.safetensors",
226
+ "image_prefix.enc.visual.blocks.25.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
227
+ "image_prefix.enc.visual.blocks.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
228
+ "image_prefix.enc.visual.blocks.25.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
229
+ "image_prefix.enc.visual.blocks.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
230
+ "image_prefix.enc.visual.blocks.25.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
231
+ "image_prefix.enc.visual.blocks.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
232
+ "image_prefix.enc.visual.blocks.25.norm1.weight": "model-00003-of-00003.safetensors",
233
+ "image_prefix.enc.visual.blocks.25.norm2.weight": "model-00003-of-00003.safetensors",
234
+ "image_prefix.enc.visual.blocks.26.attn.proj.bias": "model-00003-of-00003.safetensors",
235
+ "image_prefix.enc.visual.blocks.26.attn.proj.weight": "model-00003-of-00003.safetensors",
236
+ "image_prefix.enc.visual.blocks.26.attn.qkv.bias": "model-00003-of-00003.safetensors",
237
+ "image_prefix.enc.visual.blocks.26.attn.qkv.weight": "model-00003-of-00003.safetensors",
238
+ "image_prefix.enc.visual.blocks.26.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
239
+ "image_prefix.enc.visual.blocks.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
240
+ "image_prefix.enc.visual.blocks.26.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
241
+ "image_prefix.enc.visual.blocks.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
242
+ "image_prefix.enc.visual.blocks.26.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
243
+ "image_prefix.enc.visual.blocks.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
244
+ "image_prefix.enc.visual.blocks.26.norm1.weight": "model-00003-of-00003.safetensors",
245
+ "image_prefix.enc.visual.blocks.26.norm2.weight": "model-00003-of-00003.safetensors",
246
+ "image_prefix.enc.visual.blocks.27.attn.proj.bias": "model-00003-of-00003.safetensors",
247
+ "image_prefix.enc.visual.blocks.27.attn.proj.weight": "model-00003-of-00003.safetensors",
248
+ "image_prefix.enc.visual.blocks.27.attn.qkv.bias": "model-00003-of-00003.safetensors",
249
+ "image_prefix.enc.visual.blocks.27.attn.qkv.weight": "model-00003-of-00003.safetensors",
250
+ "image_prefix.enc.visual.blocks.27.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
251
+ "image_prefix.enc.visual.blocks.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
252
+ "image_prefix.enc.visual.blocks.27.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
253
+ "image_prefix.enc.visual.blocks.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
254
+ "image_prefix.enc.visual.blocks.27.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
255
+ "image_prefix.enc.visual.blocks.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
256
+ "image_prefix.enc.visual.blocks.27.norm1.weight": "model-00003-of-00003.safetensors",
257
+ "image_prefix.enc.visual.blocks.27.norm2.weight": "model-00003-of-00003.safetensors",
258
+ "image_prefix.enc.visual.blocks.28.attn.proj.bias": "model-00003-of-00003.safetensors",
259
+ "image_prefix.enc.visual.blocks.28.attn.proj.weight": "model-00003-of-00003.safetensors",
260
+ "image_prefix.enc.visual.blocks.28.attn.qkv.bias": "model-00003-of-00003.safetensors",
261
+ "image_prefix.enc.visual.blocks.28.attn.qkv.weight": "model-00003-of-00003.safetensors",
262
+ "image_prefix.enc.visual.blocks.28.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
263
+ "image_prefix.enc.visual.blocks.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
264
+ "image_prefix.enc.visual.blocks.28.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
265
+ "image_prefix.enc.visual.blocks.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
266
+ "image_prefix.enc.visual.blocks.28.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
267
+ "image_prefix.enc.visual.blocks.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
268
+ "image_prefix.enc.visual.blocks.28.norm1.weight": "model-00003-of-00003.safetensors",
269
+ "image_prefix.enc.visual.blocks.28.norm2.weight": "model-00003-of-00003.safetensors",
270
+ "image_prefix.enc.visual.blocks.29.attn.proj.bias": "model-00003-of-00003.safetensors",
271
+ "image_prefix.enc.visual.blocks.29.attn.proj.weight": "model-00003-of-00003.safetensors",
272
+ "image_prefix.enc.visual.blocks.29.attn.qkv.bias": "model-00003-of-00003.safetensors",
273
+ "image_prefix.enc.visual.blocks.29.attn.qkv.weight": "model-00003-of-00003.safetensors",
274
+ "image_prefix.enc.visual.blocks.29.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
275
+ "image_prefix.enc.visual.blocks.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
276
+ "image_prefix.enc.visual.blocks.29.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
277
+ "image_prefix.enc.visual.blocks.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
278
+ "image_prefix.enc.visual.blocks.29.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
279
+ "image_prefix.enc.visual.blocks.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
280
+ "image_prefix.enc.visual.blocks.29.norm1.weight": "model-00003-of-00003.safetensors",
281
+ "image_prefix.enc.visual.blocks.29.norm2.weight": "model-00003-of-00003.safetensors",
282
+ "image_prefix.enc.visual.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors",
283
+ "image_prefix.enc.visual.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors",
284
+ "image_prefix.enc.visual.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors",
285
+ "image_prefix.enc.visual.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors",
286
+ "image_prefix.enc.visual.blocks.3.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
287
+ "image_prefix.enc.visual.blocks.3.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
288
+ "image_prefix.enc.visual.blocks.3.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
289
+ "image_prefix.enc.visual.blocks.3.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
290
+ "image_prefix.enc.visual.blocks.3.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
291
+ "image_prefix.enc.visual.blocks.3.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
292
+ "image_prefix.enc.visual.blocks.3.norm1.weight": "model-00002-of-00003.safetensors",
293
+ "image_prefix.enc.visual.blocks.3.norm2.weight": "model-00002-of-00003.safetensors",
294
+ "image_prefix.enc.visual.blocks.30.attn.proj.bias": "model-00003-of-00003.safetensors",
295
+ "image_prefix.enc.visual.blocks.30.attn.proj.weight": "model-00003-of-00003.safetensors",
296
+ "image_prefix.enc.visual.blocks.30.attn.qkv.bias": "model-00003-of-00003.safetensors",
297
+ "image_prefix.enc.visual.blocks.30.attn.qkv.weight": "model-00003-of-00003.safetensors",
298
+ "image_prefix.enc.visual.blocks.30.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
299
+ "image_prefix.enc.visual.blocks.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
300
+ "image_prefix.enc.visual.blocks.30.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
301
+ "image_prefix.enc.visual.blocks.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
302
+ "image_prefix.enc.visual.blocks.30.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
303
+ "image_prefix.enc.visual.blocks.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
304
+ "image_prefix.enc.visual.blocks.30.norm1.weight": "model-00003-of-00003.safetensors",
305
+ "image_prefix.enc.visual.blocks.30.norm2.weight": "model-00003-of-00003.safetensors",
306
+ "image_prefix.enc.visual.blocks.31.attn.proj.bias": "model-00003-of-00003.safetensors",
307
+ "image_prefix.enc.visual.blocks.31.attn.proj.weight": "model-00003-of-00003.safetensors",
308
+ "image_prefix.enc.visual.blocks.31.attn.qkv.bias": "model-00003-of-00003.safetensors",
309
+ "image_prefix.enc.visual.blocks.31.attn.qkv.weight": "model-00003-of-00003.safetensors",
310
+ "image_prefix.enc.visual.blocks.31.mlp.down_proj.bias": "model-00003-of-00003.safetensors",
311
+ "image_prefix.enc.visual.blocks.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
312
+ "image_prefix.enc.visual.blocks.31.mlp.gate_proj.bias": "model-00003-of-00003.safetensors",
313
+ "image_prefix.enc.visual.blocks.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
314
+ "image_prefix.enc.visual.blocks.31.mlp.up_proj.bias": "model-00003-of-00003.safetensors",
315
+ "image_prefix.enc.visual.blocks.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
316
+ "image_prefix.enc.visual.blocks.31.norm1.weight": "model-00003-of-00003.safetensors",
317
+ "image_prefix.enc.visual.blocks.31.norm2.weight": "model-00003-of-00003.safetensors",
318
+ "image_prefix.enc.visual.blocks.4.attn.proj.bias": "model-00002-of-00003.safetensors",
319
+ "image_prefix.enc.visual.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors",
320
+ "image_prefix.enc.visual.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors",
321
+ "image_prefix.enc.visual.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors",
322
+ "image_prefix.enc.visual.blocks.4.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
323
+ "image_prefix.enc.visual.blocks.4.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
324
+ "image_prefix.enc.visual.blocks.4.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
325
+ "image_prefix.enc.visual.blocks.4.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
326
+ "image_prefix.enc.visual.blocks.4.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
327
+ "image_prefix.enc.visual.blocks.4.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
328
+ "image_prefix.enc.visual.blocks.4.norm1.weight": "model-00002-of-00003.safetensors",
329
+ "image_prefix.enc.visual.blocks.4.norm2.weight": "model-00002-of-00003.safetensors",
330
+ "image_prefix.enc.visual.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors",
331
+ "image_prefix.enc.visual.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors",
332
+ "image_prefix.enc.visual.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors",
333
+ "image_prefix.enc.visual.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors",
334
+ "image_prefix.enc.visual.blocks.5.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
335
+ "image_prefix.enc.visual.blocks.5.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
336
+ "image_prefix.enc.visual.blocks.5.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
337
+ "image_prefix.enc.visual.blocks.5.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
338
+ "image_prefix.enc.visual.blocks.5.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
339
+ "image_prefix.enc.visual.blocks.5.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
340
+ "image_prefix.enc.visual.blocks.5.norm1.weight": "model-00002-of-00003.safetensors",
341
+ "image_prefix.enc.visual.blocks.5.norm2.weight": "model-00002-of-00003.safetensors",
342
+ "image_prefix.enc.visual.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors",
343
+ "image_prefix.enc.visual.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors",
344
+ "image_prefix.enc.visual.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors",
345
+ "image_prefix.enc.visual.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors",
346
+ "image_prefix.enc.visual.blocks.6.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
347
+ "image_prefix.enc.visual.blocks.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
348
+ "image_prefix.enc.visual.blocks.6.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
349
+ "image_prefix.enc.visual.blocks.6.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
350
+ "image_prefix.enc.visual.blocks.6.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
351
+ "image_prefix.enc.visual.blocks.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
352
+ "image_prefix.enc.visual.blocks.6.norm1.weight": "model-00002-of-00003.safetensors",
353
+ "image_prefix.enc.visual.blocks.6.norm2.weight": "model-00002-of-00003.safetensors",
354
+ "image_prefix.enc.visual.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors",
355
+ "image_prefix.enc.visual.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors",
356
+ "image_prefix.enc.visual.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors",
357
+ "image_prefix.enc.visual.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors",
358
+ "image_prefix.enc.visual.blocks.7.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
359
+ "image_prefix.enc.visual.blocks.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
360
+ "image_prefix.enc.visual.blocks.7.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
361
+ "image_prefix.enc.visual.blocks.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
362
+ "image_prefix.enc.visual.blocks.7.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
363
+ "image_prefix.enc.visual.blocks.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
364
+ "image_prefix.enc.visual.blocks.7.norm1.weight": "model-00002-of-00003.safetensors",
365
+ "image_prefix.enc.visual.blocks.7.norm2.weight": "model-00002-of-00003.safetensors",
366
+ "image_prefix.enc.visual.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors",
367
+ "image_prefix.enc.visual.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors",
368
+ "image_prefix.enc.visual.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors",
369
+ "image_prefix.enc.visual.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors",
370
+ "image_prefix.enc.visual.blocks.8.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
371
+ "image_prefix.enc.visual.blocks.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
372
+ "image_prefix.enc.visual.blocks.8.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
373
+ "image_prefix.enc.visual.blocks.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
374
+ "image_prefix.enc.visual.blocks.8.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
375
+ "image_prefix.enc.visual.blocks.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
376
+ "image_prefix.enc.visual.blocks.8.norm1.weight": "model-00002-of-00003.safetensors",
377
+ "image_prefix.enc.visual.blocks.8.norm2.weight": "model-00002-of-00003.safetensors",
378
+ "image_prefix.enc.visual.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors",
379
+ "image_prefix.enc.visual.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors",
380
+ "image_prefix.enc.visual.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors",
381
+ "image_prefix.enc.visual.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors",
382
+ "image_prefix.enc.visual.blocks.9.mlp.down_proj.bias": "model-00002-of-00003.safetensors",
383
+ "image_prefix.enc.visual.blocks.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
384
+ "image_prefix.enc.visual.blocks.9.mlp.gate_proj.bias": "model-00002-of-00003.safetensors",
385
+ "image_prefix.enc.visual.blocks.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
386
+ "image_prefix.enc.visual.blocks.9.mlp.up_proj.bias": "model-00002-of-00003.safetensors",
387
+ "image_prefix.enc.visual.blocks.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
388
+ "image_prefix.enc.visual.blocks.9.norm1.weight": "model-00002-of-00003.safetensors",
389
+ "image_prefix.enc.visual.blocks.9.norm2.weight": "model-00002-of-00003.safetensors",
390
+ "image_prefix.enc.visual.merger.ln_q.weight": "model-00003-of-00003.safetensors",
391
+ "image_prefix.enc.visual.merger.mlp.0.bias": "model-00003-of-00003.safetensors",
392
+ "image_prefix.enc.visual.merger.mlp.0.weight": "model-00003-of-00003.safetensors",
393
+ "image_prefix.enc.visual.merger.mlp.2.bias": "model-00003-of-00003.safetensors",
394
+ "image_prefix.enc.visual.merger.mlp.2.weight": "model-00003-of-00003.safetensors",
395
+ "image_prefix.enc.visual.patch_embed.proj.weight": "model-00002-of-00003.safetensors",
396
+ "image_prefix.norm_extra.weight": "model-00003-of-00003.safetensors",
397
+ "lm_head.weight": "model-00002-of-00003.safetensors",
398
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
400
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
401
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
402
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
403
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
404
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
405
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
406
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
407
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
408
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
409
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
410
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
411
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
413
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
414
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
415
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
416
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
418
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
420
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
421
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
422
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
424
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
425
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
426
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
427
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
430
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
432
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
433
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
434
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
435
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
436
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
437
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
438
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
439
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
440
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
441
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
442
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
443
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
444
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
445
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
446
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
447
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
448
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
449
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
450
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
451
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
452
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
453
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors",
454
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
455
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
456
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
457
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
458
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
459
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
460
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
461
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
462
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors",
463
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
464
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
465
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
466
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
467
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
468
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
469
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
470
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
471
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors",
472
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
473
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
474
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
475
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
476
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
477
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
478
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
479
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
480
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
481
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
482
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
483
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
484
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
485
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
486
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
487
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
488
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
489
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
490
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
491
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
492
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
493
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
494
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
495
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
496
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
497
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
498
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
499
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
500
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
501
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
502
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
503
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
504
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
505
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
506
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
507
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
508
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
509
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
510
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
511
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
512
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
513
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
514
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
515
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
516
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
517
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
518
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
519
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
520
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
521
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
522
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
523
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
524
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
525
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
526
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
527
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
528
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
529
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
530
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
531
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
532
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
533
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
534
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
535
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
536
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
537
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
538
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
539
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
540
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
541
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
542
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
543
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
544
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
545
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
546
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
547
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
548
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
549
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
550
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
551
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
552
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
553
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
554
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
555
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
556
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
557
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
558
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
559
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
560
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
561
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
562
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
563
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
564
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
565
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
566
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
567
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
568
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
569
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
570
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
571
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
572
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
573
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
574
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
575
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
576
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
577
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
578
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
579
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
580
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
581
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
582
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
583
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
584
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
585
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
586
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
587
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
588
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
589
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
590
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
591
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
592
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
593
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
594
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
595
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
596
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
597
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
598
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
599
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
600
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
601
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
602
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
603
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
604
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
605
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
606
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
607
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
608
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
609
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
610
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
611
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
612
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
613
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
614
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
615
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
616
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
617
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
618
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
619
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
620
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
621
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
622
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
623
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
624
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
625
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
626
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
627
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
628
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
629
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
630
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
631
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
632
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
633
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
634
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
635
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
636
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
637
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
638
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
639
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
640
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
641
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
642
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
643
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
644
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
645
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
646
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
647
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
648
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
649
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
650
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
651
+ "model.norm.weight": "model-00002-of-00003.safetensors"
652
+ }
653
+ }
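For reference, the `weight_map` above is what loaders consult to find the shard holding a given parameter. A minimal sketch of a manual lookup (assuming the shards have been downloaded locally; the directory path and parameter name below are illustrative only):

import json
from safetensors import safe_open

ckpt_dir = "./Helium1-VL-2B"  # hypothetical local snapshot directory
with open(f"{ckpt_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

# Find which shard stores one parameter, then read only that tensor.
name = "model.layers.5.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00001-of-00003.safetensors"
with safe_open(f"{ckpt_dir}/{shard}", framework="pt", device="cpu") as shard_file:
    weight = shard_file.get_tensor(name)
print(weight.shape)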
modeling_helium1_casa.py ADDED
@@ -0,0 +1,330 @@
1
+ from typing import Any, Callable
2
+ from typing import cast as type_cast
3
+
4
+ import torch
5
+ from transformers.cache_utils import DynamicCache
6
+ from transformers.configuration_utils import PretrainedConfig
7
+ from transformers.generation.utils import GenerateOutput
8
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
9
+ Qwen2_5_VisionTransformerPretrainedModel,
10
+ )
11
+
12
+ from .image_encoder import Qwen25VLEncoder
13
+ from .configuration_helium1_casa import Helium1CASAConfig
14
+ from .language_helium1_casa import (
15
+ CausalHeliumOutput,
16
+ Helium1CASAAttention,
17
+ Helium1ForCausalLM,
18
+ Helium1RMSNorm,
19
+ )
20
+
21
+
22
+ def meta_project(
23
+ logits: torch.Tensor | list[torch.Tensor],
24
+ projector: torch.nn.Module,
25
+ norm: torch.nn.Module | None = None,
26
+ ) -> torch.Tensor | list[torch.Tensor]:
27
+ """Projection operation that handles both tensors and list of tensors
28
+
29
+ Outputs either an (N, S, D) tensor (same-resolution images) or a list of N (S, D) tensors (where
30
+ S can be a different sequence length per image)
31
+ """
32
+ split_sizes: list[int] | None = None
33
+ if not isinstance(logits, torch.Tensor):
34
+ split_sizes = [_x.shape[0] for _x in logits]
35
+ logits = torch.cat(logits, dim=0)[None, :, :]
36
+ logits = type_cast(torch.Tensor, logits)
37
+ logits = projector(logits)
38
+
39
+ assert isinstance(logits, torch.Tensor)
40
+ if norm is not None:
41
+ logits = norm(logits)
42
+ if split_sizes is not None:
43
+ return list(torch.split(type_cast(torch.Tensor, logits[0]), split_sizes, dim=0))
44
+ return logits
45
+
46
+
47
+ class ImageProjection(torch.nn.Module):
48
+ """Takes in a batch or sequence of images and returns embeddings
49
+ which are then fed to the LM.
50
+
51
+ :param config: Model configuration (e.g. a Helium1CASAConfig)
52
+ :param lm_model_dim: Output dimension (number of channels) for this module
53
+ """
54
+
55
+ def __init__(self, config: PretrainedConfig, lm_model_dim: int) -> None:
56
+ super().__init__()
57
+ self.config = config
58
+ self.out_dim = lm_model_dim
59
+ visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
60
+
61
+ self.enc = Qwen25VLEncoder(visual=visual)
62
+ # Projection layer
63
+ self.proj_extra = self.init_proj_module()
64
+ # Output normalizations
65
+ self.norm_extra = Helium1RMSNorm(self.out_dim)
66
+
67
+ def init_proj_module(self) -> torch.nn.Module:
68
+ """Init the project module for the inserted and/or cross-attended image tokens"""
69
+ if self.config.vision_config.out_dim == self.out_dim:
70
+ return torch.nn.Identity()
71
+ return torch.nn.Linear(self.config.vision_config.out_dim, self.out_dim)
72
+
73
+ def forward(
74
+ self, x: torch.Tensor | list[torch.Tensor]
75
+ ) -> dict[
76
+ str,
77
+ torch.Tensor | list[torch.Tensor],
78
+ ]:
79
+ """Image embedding mapping
80
+
81
+ :param x: Either a tensor with shape (Bi, C, H, W) or a list of Bi tensors
82
+ with shape (C, H, W) (or (H, W, C) in the case of Qwen)
83
+
84
+ :return: Either a tensor with shape (num_total_images, S, D) or, if images
85
+ can have different seq length, a list of `num_total_images` Tensors with shape
86
+ (S, D)
87
+ """
88
+
89
+ # Apply image encoder
90
+ og_dtype = x[0].dtype
91
+ encoded = self.enc(x)["image_embeds"]
92
+ encoded = [_x.to(og_dtype) for _x in encoded]
93
+ if all(x.shape[0] == encoded[0].shape[0] for x in encoded):
94
+ encoded = torch.stack(encoded, dim=0)
95
+
96
+ # Extra projection
97
+ image_embeds = meta_project(encoded, self.proj_extra, self.norm_extra)
98
+
99
+ # Apply different projection for extra vs cross attended tokens
100
+ return {"image_embeds": image_embeds}
101
+
102
+
103
+ class V2Helium1(Helium1ForCausalLM): # pyright: ignore[reportIncompatibleMethodOverride]
104
+ config_class = Helium1CASAConfig
105
+
106
+ def __init__(self, config: Helium1CASAConfig, **kwargs: Any) -> None:
107
+ del kwargs
108
+ super().__init__(config)
109
+ self.image_prefix = ImageProjection(config=config, lm_model_dim=self.token_dim)
110
+
111
+ def get_device(self) -> str:
112
+ """Return the device type of the model"""
113
+ return next(self.parameters()).device.type
114
+
115
+ @property
116
+ def token_dim(self) -> int:
117
+ """Returns the number of dimensions for the token representation"""
118
+ return self.config.hidden_size
119
+
120
+ @property
121
+ def rotary_embed(self) -> Callable:
122
+ """Returns the rotary embedding function of the underlying model"""
123
+ return self.model.rotary_emb
124
+
125
+ def _update_model_kwargs_for_generation(
126
+ self,
127
+ outputs: Any,
128
+ model_kwargs: dict[str, Any],
129
+ is_encoder_decoder: bool = False,
130
+ num_new_tokens: int = 1,
131
+ ):
132
+ """This is required to handle multiple gen calls for subtitles"""
133
+ # Call parent to get default updates
134
+ model_kwargs = super()._update_model_kwargs_for_generation(
135
+ outputs, model_kwargs, is_encoder_decoder, num_new_tokens
136
+ )
137
+ # Used by prepare_inputs_for_generation
138
+ model_kwargs["__is_first_gen_call__"] = False
139
+ return model_kwargs
140
+
141
+ def prepare_inputs_for_generation( # pyright: ignore[reportIncompatibleMethodOverride]
142
+ self,
143
+ input_ids: torch.Tensor,
144
+ past_key_values: DynamicCache | None = None,
145
+ **kwargs: Any,
146
+ ):
147
+ __is_first_gen_call__ = kwargs.get("__is_first_gen_call__", True)
148
+ if past_key_values is not None and (
149
+ kwargs.get("cache_position") is None
150
+ or type_cast(torch.Tensor, kwargs.get("cache_position")).shape[0] == 0
151
+ ):
152
+ # We're continuing from a cached state
153
+ past_length = past_key_values._seen_tokens
154
+ kwargs["cache_position"] = torch.arange(
155
+ past_length,
156
+ past_length + (input_ids.shape[1] if __is_first_gen_call__ else 1),
157
+ dtype=torch.long,
158
+ device=input_ids.device,
159
+ )
160
+
161
+ return super().prepare_inputs_for_generation(
162
+ type_cast(torch.LongTensor, input_ids),
163
+ past_key_values=past_key_values,
164
+ **kwargs,
165
+ )
166
+
167
+ def prepare_multimodal_inputs(
168
+ self,
169
+ # text only training
170
+ input_ids: torch.Tensor | None = None,
171
+ inputs_embeds: torch.Tensor | None = None,
172
+ attention_mask: torch.Tensor | None = None,
173
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
174
+ labels: torch.Tensor | None = None,
175
+ # image values
176
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
177
+ pre_image_tokens: list[int] | None = None,
178
+ post_image_tokens: list[int] | None = None,
179
+ **_kwargs: Any,
180
+ ) -> dict:
181
+ """Get a batch data mixing text and image data"""
182
+ del _kwargs
183
+
184
+ processed_inputs = {
185
+ "input_ids": input_ids,
186
+ "inputs_embeds": inputs_embeds,
187
+ "labels": labels,
188
+ "attention_mask": attention_mask,
189
+ "image_embeds_insertion_points": image_embeds_insertion_points,
190
+ }
191
+ if pixel_values is not None:
192
+ processed_inputs.update(self.image_prefix(pixel_values))
193
+ assert "image_embeds" in processed_inputs
194
+ assert (
195
+ isinstance(processed_inputs["image_embeds"], torch.Tensor)
196
+ and processed_inputs["image_embeds"].ndim == 3
197
+ ) or (
198
+ isinstance(processed_inputs["image_embeds"], list)
199
+ and all(_x.ndim == 2 for _x in processed_inputs["image_embeds"])
200
+ )
201
+
202
+ # Add kwargs necessary to compute cu_seqlens windows for CASA
203
+ processed_inputs["casa_windows_info"] = {
204
+ "num_post_image_tokens": 0 if post_image_tokens is None else len(post_image_tokens),
205
+ "num_pre_image_tokens": 0 if pre_image_tokens is None else len(pre_image_tokens),
206
+ }
207
+
208
+ return processed_inputs
209
+
210
+ def forward( # pyright: ignore[reportIncompatibleMethodOverride]
211
+ self,
212
+ input_ids: torch.Tensor | None = None,
213
+ inputs_embeds: torch.Tensor | None = None,
214
+ attention_mask: torch.Tensor | None = None,
215
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
216
+ return_loss: bool = True,
217
+ labels: torch.Tensor | None = None,
218
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
219
+ pre_image_tokens: list[int] | None = None,
220
+ post_image_tokens: list[int] | None = None,
221
+ **kwargs: Any,
222
+ ) -> CausalHeliumOutput:
223
+ """Multi modal forward pass"""
224
+ assert input_ids is not None or inputs_embeds is not None
225
+
226
+ if self.training:
227
+ assert return_loss is True, (
228
+ "Helium models always compute its own labels/losses in train mode"
229
+ )
230
+
231
+ # Case 1: For first generation call we need to compute pixel values and CASA states
232
+ if kwargs.get("__is_first_gen_call__", True):
233
+ processed_inputs = self.prepare_multimodal_inputs(
234
+ input_ids=input_ids,
235
+ inputs_embeds=inputs_embeds,
236
+ attention_mask=attention_mask,
237
+ image_embeds_insertion_points=image_embeds_insertion_points,
238
+ pixel_values=pixel_values,
239
+ labels=labels,
240
+ pre_image_tokens=pre_image_tokens,
241
+ post_image_tokens=post_image_tokens,
242
+ )
243
+ processed_inputs.pop("inputs_embeds", None)
244
+ else:
245
+ processed_inputs = {
246
+ "inputs_embeds": self.model.embed_tokens(input_ids),
247
+ "attention_mask": attention_mask,
248
+ }
249
+
250
+ # For Helium prefix, we need to update the positions by the number
251
+ # of image tokens inserted in the first call
252
+ if (
253
+ not self.config.casa_attention
254
+ and (cp := kwargs.get("cache_position", None)) is not None
255
+ and pixel_values is not None
256
+ ):
257
+ start = kwargs["cache_position"][0].item()
258
+ num_image_tokens = (pixel_values[0].shape[0] * pixel_values[0].shape[1]) // 4
259
+ num_tokens = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] # type: ignore
260
+ kwargs["cache_position"] = torch.arange(
261
+ start + (0 if kwargs.get("__is_first_gen_call__", True) else num_image_tokens),
262
+ start + num_tokens + num_image_tokens,
263
+ dtype=cp.dtype,
264
+ device=cp.device,
265
+ )
266
+
267
+ kwargs.pop("__is_first_gen_call__", True)
268
+ out = super().forward(
269
+ **processed_inputs, # type: ignore
270
+ **kwargs,
271
+ )
272
+
273
+ return out
274
+
275
+ @torch.no_grad()
276
+ def generate_from_image( # pyright: ignore[reportInconsistentOverload,reportIncompatibleMethodOverride]
277
+ self,
278
+ input_ids: torch.Tensor | None = None,
279
+ inputs_embeds: torch.Tensor | None = None,
280
+ attention_mask: torch.Tensor | None = None,
281
+ image_embeds_insertion_points: list[torch.Tensor] | None = None,
282
+ pixel_values: torch.Tensor | list[torch.Tensor] | None = None,
283
+ reset_streaming: bool = True,
284
+ **kwargs: Any,
285
+ ) -> "GenerateOutput | torch.LongTensor":
286
+ assert input_ids is not None and inputs_embeds is None, (
287
+ "Input IDs must be provided for generation"
288
+ )
289
+
290
+ # init self-attention KVCache
291
+ if kwargs.get("past_key_values", None) is None:
292
+ kwargs["past_key_values"] = DynamicCache()
293
+
294
+ # To avoid generate warning
295
+ if kwargs.get("pad_token_id", None) is None:
296
+ kwargs["pad_token_id"] = kwargs.get("eos_token_id", None)
297
+ if isinstance(kwargs["pad_token_id"], (list, tuple)):
298
+ kwargs["pad_token_id"] = kwargs["pad_token_id"][0]
299
+
300
+ self.start_casa_streaming_states()
301
+ outputs = self.generate(
302
+ input_ids,
303
+ attention_mask=attention_mask,
304
+ pixel_values=pixel_values,
305
+ image_embeds_insertion_points=image_embeds_insertion_points,
306
+ use_cache=True,
307
+ **kwargs,
308
+ )
309
+ if reset_streaming:
310
+ self.reset_casa_streaming_states()
311
+ return outputs
312
+
313
+ def reset_casa_streaming_states(self, clean_cache: bool = True) -> None:
314
+ def __reset__(m: torch.nn.Module):
315
+ if isinstance(m, Helium1CASAAttention):
316
+ m._set_streaming(False, ())
317
+ m.reset_streaming()
318
+ if clean_cache:
319
+ del m.streaming_state.k
320
+ del m.streaming_state.v
321
+ del m.streaming_state.casa_handler
322
+
323
+ self.apply(__reset__)
324
+
325
+ def start_casa_streaming_states(self) -> None:
326
+ def __start__(m: torch.nn.Module):
327
+ if isinstance(m, Helium1CASAAttention):
328
+ m._set_streaming(True, ())
329
+
330
+ self.apply(__start__)
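Putting `V2Helium1` together with the processor defined below, generation roughly looks like the following sketch. It assumes the repository's `auto_map` resolves the remote-code classes (see the main model card for the supported instructions); the repository id, image path, and generation settings are illustrative only.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "kyutai/Helium1-VL-2B"  # illustrative; use this repository's actual id
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16
).to("cuda").eval()

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": "example.jpg"},  # a path or a PIL.Image
        {"type": "text", "text": "Describe this image."},
    ],
}]
# tokenize_messages returns input_ids, attention_mask, pixel_values and
# image_embeds_insertion_points, which generate_from_image consumes directly.
batch = processor.tokenize_messages(messages).to("cuda", dtype=torch.bfloat16)
output_ids = model.generate_from_image(**batch, max_new_tokens=128)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))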
processing.py ADDED
@@ -0,0 +1,505 @@
1
+ # pylint: disable=no-member # avoid weird pylint warnings from SentencePieceProcessor
2
+ """Text and Image processor for CASA models using Qwen2.5_VL image encoder"""
3
+
4
+ from math import ceil
5
+ from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast, overload
6
+ from typing import cast as type_cast
7
+
8
+ import torch
9
+ import torchvision.transforms.v2 as T
10
+ from einops import rearrange
11
+ from PIL import Image
12
+ from torchvision.transforms import InterpolationMode
13
+ from torchvision.transforms.functional import to_tensor as pil_to_tensor
14
+ from torchvision.transforms.v2 import functional as F
15
+ from transformers.image_processing_utils import BaseImageProcessor
16
+ from transformers.processing_utils import ProcessorMixin
17
+
18
+ if TYPE_CHECKING:
19
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
20
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
+
22
+
23
+ ImageMessage = TypedDict(
24
+ "ImageMessage",
25
+ {
26
+ "type": Literal["image"],
27
+ "image": str | Image.Image | None,
28
+ },
29
+ )
30
+
31
+ TextMessage = TypedDict(
32
+ "TextMessage",
33
+ {
34
+ "type": Literal["text"],
35
+ "text": str,
36
+ },
37
+ )
38
+
39
+ MessageContent = list[ImageMessage | TextMessage]
40
+
41
+ Message = TypedDict(
42
+ "Message",
43
+ {
44
+ "role": Literal["system", "user", "assistant"],
45
+ "content": MessageContent,
46
+ },
47
+ )
48
+
49
+ ProcessorInput = list[list[Message]] | list[Message]
50
+
51
+ __INTERP_NAME_TO_MODE__ = {
52
+ "nearest": InterpolationMode.NEAREST,
53
+ "bilinear": InterpolationMode.BILINEAR,
54
+ "bicubic": InterpolationMode.BICUBIC,
55
+ "lanczos": InterpolationMode.LANCZOS,
56
+ }
57
+
58
+ __INTERP_INT_TO_MODE__ = {
59
+ 0: InterpolationMode.NEAREST,
60
+ 2: InterpolationMode.BILINEAR,
61
+ 3: InterpolationMode.BICUBIC,
62
+ 4: InterpolationMode.BOX,
63
+ 5: InterpolationMode.HAMMING,
64
+ 1: InterpolationMode.LANCZOS,
65
+ }
66
+
67
+
68
+ @overload
69
+ def universal_resize(
70
+ img: Image.Image,
71
+ size: tuple[int, int],
72
+ interpolation: str | InterpolationMode | int = "bilinear",
73
+ antialias: bool = True,
74
+ ) -> Image.Image: ...
75
+ @overload
76
+ def universal_resize(
77
+ img: torch.Tensor,
78
+ size: tuple[int, int],
79
+ interpolation: str | InterpolationMode | int = "bilinear",
80
+ antialias: bool = True,
81
+ ) -> torch.Tensor: ...
82
+ def universal_resize(
83
+ img: Image.Image | torch.Tensor,
84
+ size: tuple[int, int],
85
+ interpolation: str | InterpolationMode | int = "bilinear",
86
+ antialias: bool = True,
87
+ ) -> Image.Image | torch.Tensor:
88
+ """Resize that works for PIL.Image, CHW tensor, or BCHW tensor"""
89
+ if isinstance(interpolation, str):
90
+ interpolation = __INTERP_NAME_TO_MODE__[interpolation]
91
+ elif isinstance(interpolation, int):
92
+ interpolation = __INTERP_INT_TO_MODE__[interpolation]
93
+
94
+ return F.resize(
95
+ img, size, interpolation=type_cast(InterpolationMode, interpolation), antialias=antialias
96
+ )
97
+
98
+
99
+ @overload
100
+ def convert_to_rgb(img: Image.Image) -> Image.Image: ...
101
+ @overload
102
+ def convert_to_rgb(img: torch.Tensor) -> torch.Tensor: ...
103
+ def convert_to_rgb(img: Image.Image | torch.Tensor) -> Image.Image | torch.Tensor:
104
+ """Convert any image to RGB in a way that does not throw PIL warning"""
105
+ if isinstance(img, torch.Tensor):
106
+ return img
107
+ if img.mode == "RGB": # no changes
108
+ return img
109
+ if img.mode == "P": # palette images need to be converted to RGBA first
110
+ return img.convert("RGBA").convert("RGB")
111
+ return img.convert("RGB")
112
+
113
+
114
+ class QwenImageProcessor(BaseImageProcessor):
115
+ """Resizing for the Qwen2.5VL encoder. Note that the normalization is
116
+ handled in the image_encoder in the model forward"""
117
+
118
+ def __init__(
119
+ self,
120
+ img_size: int = 448,
121
+ interpolation: Literal["bicubic", "bilinear", "nearest", "nearest_exact"] = "bicubic",
122
+ max_ratio: int = 10,
123
+ round_to_patch_size: int = 56,
124
+ use_fast: bool = True,
125
+ **kwargs: Any,
126
+ ) -> None:
127
+ # this will also be used in V2llms to determine whether to remove
128
+ # the temporal conv
129
+ self._num_target_channels = 588
130
+ self._merge_size = 2
131
+ self._patch_size = 14
132
+ super().__init__(
133
+ use_fast=use_fast,
134
+ do_normalize=False,
135
+ **kwargs,
136
+ )
137
+ self.img_size = img_size
138
+ self.interpolation = interpolation
139
+ self.max_ratio = max_ratio
140
+ self.round_to_patch_size = round_to_patch_size
141
+
142
+ def resize_transform(
143
+ self, img: Image.Image | torch.Tensor, img_size: int | None = None
144
+ ) -> Image.Image | torch.Tensor:
145
+ if img_size is None:
146
+ img_size = self.img_size
147
+ max_area = img_size**2
148
+ if isinstance(img, Image.Image):
149
+ img = convert_to_rgb(img)
150
+ w_og, h_og = img.size
151
+ else:
152
+ h_og, w_og = img.shape[-2:]
153
+ w, h = w_og, h_og
154
+
155
+ # Qwen requires max ratio of 10 between max and min sizes
156
+ if self.max_ratio > 0:
157
+ w, h = max(w, h // self.max_ratio), max(h, w // self.max_ratio)
158
+
159
+ # resize to max area
160
+ current_area = w * h
161
+ if current_area > max_area:
162
+ scale = (max_area / current_area) ** 0.5
163
+ w, h = int(w * scale), int(h * scale)
164
+
165
+ # resize to patch size
166
+ if self.round_to_patch_size > 0:
167
+ w = ceil(w / self.round_to_patch_size) * self.round_to_patch_size
168
+ h = ceil((h / self.round_to_patch_size)) * self.round_to_patch_size
169
+
170
+ # resize
171
+ if w != w_og or h != h_og:
172
+ img = universal_resize(img, (h, w), self.interpolation)
173
+ if isinstance(img, torch.Tensor):
174
+ img = T.ToDtype(torch.float32, scale=True)(T.ToImage()(img))
175
+ return img
176
+
177
+ def __process_one__(
178
+ self, video_or_img: Image.Image | torch.Tensor, img_size: int | None = None
179
+ ) -> torch.Tensor:
180
+ """Same operation as __process_one_with_processor__ but without going through numpy"""
181
+ video_or_img = self.resize_transform(video_or_img, img_size)
182
+ if isinstance(video_or_img, Image.Image):
183
+ video_or_img = pil_to_tensor(video_or_img)
184
+ assert isinstance(video_or_img, torch.Tensor)
185
+ if video_or_img.ndim == 3:
186
+ video_or_img = video_or_img[None]
187
+ assert video_or_img.ndim == 4 and video_or_img.shape[1] == 3, (
188
+ f"Invalid shape {video_or_img.shape}."
189
+ )
190
+ t, c, h, w = video_or_img.shape
191
+ p = self._patch_size
192
+ m = self._merge_size
193
+
194
+ # Convert to RGB
195
+ if c == 1:
196
+ video_or_img = video_or_img.expand((-1, 3, -1, -1))
197
+ if c == 4:
198
+ video_or_img = video_or_img[:, :3]
199
+ c = video_or_img.shape[1]
200
+ assert c == 3, "Expecting RGB image in QwenNormalize"
201
+
202
+ # Reshape to t h w c' format
203
+ h, w = video_or_img.shape[2] // p, video_or_img.shape[3] // p
204
+ rearrange_dict = dict(p1=p, p2=p, m1=m, m2=m)
205
+
206
+ video_or_img = rearrange(
207
+ video_or_img,
208
+ "t c (h m1 p1) (w m2 p2) -> (t h w m1 m2) (c p1 p2)",
209
+ **rearrange_dict,
210
+ )
211
+ assert video_or_img.shape[-1] == self._num_target_channels, (
212
+ f"{video_or_img.shape[-1]} != {self._num_target_channels}"
213
+ )
214
+ video_or_img = video_or_img.view((-1, h, w, self._num_target_channels))
215
+
216
+ return video_or_img
217
+
218
+ @overload
219
+ def process_images(
220
+ self, image: Image.Image | torch.Tensor, img_size: int | None = None
221
+ ) -> torch.Tensor: ...
222
+ @overload
223
+ def process_images(
224
+ self, image: list[Image.Image] | list[torch.Tensor], img_size: int | None = None
225
+ ) -> list[torch.Tensor]: ...
226
+ def process_images(
227
+ self,
228
+ image: Image.Image | torch.Tensor | list[Image.Image] | list[torch.Tensor],
229
+ img_size: int | None = None,
230
+ ) -> torch.Tensor | list[torch.Tensor]:
231
+ if isinstance(image, list):
232
+ return [self.__process_one__(_x, img_size) for _x in image]
233
+ return self.__process_one__(image, img_size)
234
+
235
+
236
+ class ProcessorOutput(dict):
237
+ input_ids: torch.Tensor
238
+ attention_mask: torch.Tensor
239
+ image_embeds_insertion_points: list[torch.Tensor] | None
240
+ pixel_values: torch.Tensor | list[torch.Tensor] | None
241
+
242
+ def to(
243
+ self, device: torch.device | str, dtype: torch.dtype = torch.bfloat16
244
+ ) -> "ProcessorOutput":
245
+ return ProcessorOutput(
246
+ {
247
+ "input_ids": self["input_ids"].to(device),
248
+ "attention_mask": self["attention_mask"].to(device),
249
+ "image_embeds_insertion_points": self["image_embeds_insertion_points"],
250
+ "pixel_values": (
251
+ self["pixel_values"].to(dtype).to(device)
252
+ if isinstance(self["pixel_values"], torch.Tensor)
253
+ else [x.to(dtype).to(device) for x in self["pixel_values"]]
254
+ if self["pixel_values"] is not None
255
+ else None
256
+ ),
257
+ }
258
+ )
259
+
260
+
261
+ class BaseProcessor(ProcessorMixin):
262
+ def __init__(
263
+ self,
264
+ tokenizer: "PreTrainedTokenizerFast | Qwen2Tokenizer",
265
+ pre_image_tokens: tuple[int, ...] = (),
266
+ post_image_tokens: tuple[int, ...] = (),
267
+ system_start_tokens: tuple[int, ...] = (),
268
+ system_end_tokens: tuple[int, ...] = (),
269
+ user_start_tokens: tuple[int, ...] = (),
270
+ user_end_tokens: tuple[int, ...] = (),
271
+ asst_start_tokens: tuple[int, ...] = (),
272
+ asst_end_tokens: tuple[int, ...] = (),
273
+ allow_system_prompt: bool = True,
274
+ pad_token: int = 0,
275
+ bos_token: int | None = None,
276
+ ) -> None:
277
+ self.pre_image_tokens = list(pre_image_tokens)
278
+ self.post_image_tokens = list(post_image_tokens)
279
+ self.system_start_tokens = list(system_start_tokens)
280
+ self.system_end_tokens = list(system_end_tokens)
281
+ self.user_start_tokens = list(user_start_tokens)
282
+ self.user_end_tokens = list(user_end_tokens)
283
+ self.asst_start_tokens = list(asst_start_tokens)
284
+ self.asst_end_tokens = list(asst_end_tokens)
285
+ self._allow_system_prompt = allow_system_prompt
286
+ self.tokenizer = tokenizer
287
+ self._image_processor = None
288
+ self._pad_token = pad_token
289
+ self.bos_token = bos_token
290
+
291
+ @property
292
+ def image_processor(self) -> QwenImageProcessor:
293
+ assert self._image_processor is not None
294
+ return self._image_processor
295
+
296
+ def _process_content(
297
+ self,
298
+ message_content: MessageContent,
299
+ role: Literal["system", "user", "assistant"],
300
+ tokenized_messages: list[torch.Tensor],
301
+ insertion_points: list[int],
302
+ image_list: list[torch.Tensor | None],
303
+ token_count: int,
304
+ img_size: int | None = None,
305
+ **kwargs: Any,
306
+ ) -> int:
307
+ mapping = {
308
+ "user": (self.user_start_tokens, self.user_end_tokens),
309
+ "assistant": (self.asst_start_tokens, self.asst_end_tokens),
310
+ "system": (self.system_start_tokens, self.system_end_tokens),
311
+ }
312
+ if role.lower() not in mapping:
313
+ raise ValueError(f"Unknown role '{role}' encountered in messages.")
314
+ start_tokens, end_tokens = mapping[role.lower()]
315
+ # 1) Add the start tokens
316
+ if start_tokens:
317
+ tokenized_messages.append(torch.Tensor(start_tokens).flatten().to(torch.long))
318
+ token_count += len(start_tokens)
319
+ # 2) Process the message content one by one (potentially interleaved image and text)
320
+ for part in message_content:
321
+ elt_type = part["type"]
322
+ if elt_type == "image":
323
+ part = cast(ImageMessage, part)
324
+ self._process_image_message(
325
+ part,
326
+ tokenized_messages,
327
+ image_list,
328
+ img_size=img_size,
329
+ )
330
+ token_count += len(self.pre_image_tokens)
331
+ insertion_points.append(token_count)
332
+ token_count += len(self.post_image_tokens)
333
+ else:
334
+ part = cast(TextMessage, part)
335
+ self._process_text_message(
336
+ part["text"],
337
+ role=role,
338
+ token_list=tokenized_messages,
339
+ **kwargs,
340
+ )
341
+ token_count += tokenized_messages[-1].size(0)
342
+ # 3) Add the end tokens
343
+ if end_tokens:
344
+ tokenized_messages.append(torch.Tensor(end_tokens).flatten().to(torch.long))
345
+ token_count += len(end_tokens)
346
+ return token_count
347
+
348
+ def _process_text_message(
349
+ self,
350
+ message: str,
351
+ role: Literal["system", "user", "assistant"],
352
+ token_list: list[torch.Tensor],
353
+ **kwargs: Any,
354
+ ) -> None:
355
+ if role.lower() == "system" and not self._allow_system_prompt:
356
+ raise ValueError("System prompts are not allowed in this tokenizer configuration.")
357
+ tokens = self.tokenizer.encode(
358
+ message, add_special_tokens=False, return_tensors="pt", **kwargs
359
+ )
360
+ tokens = cast(torch.Tensor, tokens)
361
+ token_list.append(tokens.flatten().to(torch.long))
362
+
363
+ def _process_image_message(
364
+ self,
365
+ message: ImageMessage,
366
+ token_list: list[torch.Tensor],
367
+ image_list: list[torch.Tensor | None],
368
+ img_size: int | None = None,
369
+ ) -> None:
370
+ img = message["image"]
371
+ if img is None:
372
+ image_list.append(None)
373
+ else:
374
+ image_list.append(
375
+ self.image_processor.process_images(
376
+ self._load_image(img), img_size=img_size
377
+ ).squeeze(0)
378
+ )
379
+ if self.pre_image_tokens:
380
+ token_list.append(torch.Tensor(self.pre_image_tokens).flatten().to(torch.long))
381
+
382
+ if self.post_image_tokens:
383
+ token_list.append(torch.Tensor(self.post_image_tokens).flatten().to(torch.long))
384
+
385
+ def _load_image(self, image_path_or_image: str | Image.Image) -> Image.Image:
386
+ if isinstance(image_path_or_image, str):
387
+ return Image.open(image_path_or_image).convert("RGB")
388
+ return image_path_or_image
389
+
390
+ def _maybe_pad(self, tokens: torch.Tensor, pad_len: int, pad_value: int) -> torch.Tensor:
391
+ return torch.nn.functional.pad(
392
+ tokens,
393
+ (0, pad_len) if self.tokenizer.padding_side == "right" else (pad_len, 0),
394
+ value=pad_value,
395
+ )
396
+
397
+ def pad_tokenized_messages(
398
+ self,
399
+ tokenized_messages_batch: list[torch.Tensor],
400
+ image_insertion_points_batch: list[torch.Tensor] | None = None,
401
+ ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor] | None]:
402
+ max_len = max(len(x) for x in tokenized_messages_batch)
403
+ if image_insertion_points_batch is not None and self.tokenizer.padding_side == "left":
404
+ image_insertion_points_batch = [
405
+ x + max_len - len(tokenized_messages_batch[idx])
406
+ for idx, x in enumerate(image_insertion_points_batch)
407
+ ]
408
+ input_ids = torch.stack(
409
+ [
410
+ self._maybe_pad(s, max_len - s.size(0), self._pad_token)
411
+ for s in tokenized_messages_batch
412
+ ],
413
+ dim=0,
414
+ )
415
+ attention_mask = torch.stack(
416
+ [
417
+ self._maybe_pad(torch.ones_like(s), max_len - s.size(0), 0)
418
+ for s in tokenized_messages_batch
419
+ ],
420
+ dim=0,
421
+ )
422
+ return input_ids, attention_mask, image_insertion_points_batch
423
+
424
+ def tokenize_messages(
425
+ self,
426
+ messages: ProcessorInput,
427
+ suppress_bos_token: bool = False,
428
+ **kwargs: Any,
429
+ ) -> ProcessorOutput | None:
430
+ """Tokenize a batch of messages into token IDs suitable for Helium1 CASA model.
431
+
432
+ Args:
433
+ messages (list[list[dict[str, str]]] | list[dict[str, str]]): Batch of message lists (or single list of messages),
434
+ where each message is a dictionary with 'role' and 'content' keys.
435
+ continue_final_message (bool, optional): If True, the final message in each list will not have an end token added.
436
+ Defaults to False.
437
+ suppress_bos_token (bool, optional): If True, the beginning-of-sequence token will not be added.
438
+ Defaults to False.
439
+ **kwargs: Additional keyword arguments passed to the underlying encode method.
440
+ """
441
+ if not messages:
442
+ return None
443
+ if isinstance(messages[0], dict):
444
+ messages = [messages] # type: ignore[assignment]
445
+
446
+ messages = cast(list[list[Message]], messages)
447
+ image_insertion_points_batch = []
448
+ tokenized_messages_batch = []
449
+ image_list: list[torch.Tensor | None] = []
450
+ for msgs in messages:
451
+ # msgs.append({
452
+ # "role": "assistant",
453
+ # "content": [{"type": "text", "text": ""}]
454
+ # })
455
+ tokenized_messages = []
456
+ if not suppress_bos_token and self.bos_token is not None:
457
+ tokenized_messages.append(torch.tensor([self.bos_token], dtype=torch.long))
458
+ insertion_points = []
459
+ token_count = 0
460
+ for msg in msgs:
461
+ token_count = self._process_content(
462
+ msg["content"],
463
+ role=msg["role"],
464
+ tokenized_messages=tokenized_messages,
465
+ insertion_points=insertion_points,
466
+ image_list=image_list,
467
+ token_count=token_count,
468
+ **kwargs,
469
+ )
470
+ tokenized_messages_batch.append(torch.cat(tokenized_messages, dim=0).to(torch.long))
471
+ image_insertion_points_batch.append(torch.tensor(insertion_points, dtype=torch.long))
472
+
473
+ if msgs and self.asst_end_tokens and msgs[-1]["role"].lower() == "assistant":
474
+ # Remove the assistant end tokens from the final message
475
+ end_token_len = len(self.asst_end_tokens)
476
+ tokenized_messages_batch[-1] = tokenized_messages_batch[-1][:-end_token_len]
477
+ if msgs and self.asst_start_tokens and msgs[-1]["role"].lower() == "user":
478
+ # Append the assistant start tokens so generation continues as the assistant
479
+ end_token_len = len(self.asst_end_tokens)
480
+ tokenized_messages_batch[-1] = torch.cat(
481
+ [
482
+ tokenized_messages_batch[-1],
483
+ torch.Tensor(self.asst_start_tokens).to(torch.long),
484
+ ]
485
+ )
486
+
487
+ input_ids, attention_mask, image_embeds_insertion_points = self.pad_tokenized_messages(
488
+ tokenized_messages_batch, image_insertion_points_batch
489
+ )
490
+
491
+ if image_list:
492
+ assert sum(img is None for img in image_list) % len(image_list) == 0, (
493
+ "Either all or no image must be None."
494
+ )
495
+ pixel_values: None | torch.Tensor | list[torch.Tensor]
496
+ if image_list[0] is None:
497
+ pixel_values = None
498
+ else:
499
+ pixel_values = cast(list[torch.Tensor], image_list)
500
+ return ProcessorOutput(
501
+ input_ids=input_ids,
502
+ image_embeds_insertion_points=image_embeds_insertion_points,
503
+ attention_mask=attention_mask,
504
+ pixel_values=pixel_values,
505
+ )
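As a rough illustration of the message schema and of what `tokenize_messages` returns (a sketch assuming a `processor` instance loaded as in the earlier generation example; the image is a synthetic placeholder):

from PIL import Image

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": Image.new("RGB", (640, 480))},
        {"type": "text", "text": "What is in this picture?"},
    ],
}]
batch = processor.tokenize_messages(messages)
print(batch["input_ids"].shape)        # (batch, padded_seq_len) token ids
print(batch["attention_mask"].shape)   # same shape, 0 over padding
print(batch["image_embeds_insertion_points"])  # one tensor of insertion indices per sample
print([pv.shape for pv in batch["pixel_values"]])  # one (grid_h, grid_w, channels) tensor per image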
processing_helium1_casa.py ADDED
@@ -0,0 +1,37 @@
1
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
2
+
3
+ from .processing import BaseProcessor, QwenImageProcessor
4
+
5
+
6
+ class Helium1CASAProcessor(BaseProcessor):
7
+ attributes = ["tokenizer"]
8
+ tokenizer_class = "PreTrainedTokenizerFast"
9
+
10
+ def __init__(
11
+ self,
12
+ tokenizer: PreTrainedTokenizerFast,
13
+ pre_image_tokens: tuple[int, ...] = tuple(),
14
+ post_image_tokens: tuple[int, ...] = tuple(),
15
+ system_start_tokens: tuple[int, ...] = tuple(),
16
+ system_end_tokens: tuple[int, ...] = tuple(),
17
+ user_start_tokens: tuple[int, ...] = (104,),
18
+ user_end_tokens: tuple[int, ...] = (105,),
19
+ asst_start_tokens: tuple[int, ...] = (102,),
20
+ asst_end_tokens: tuple[int, ...] = (103,),
21
+ bos_token: int = 1,
22
+ image_size: int = 896,
23
+ ):
24
+ super().__init__(
25
+ tokenizer=tokenizer,
26
+ pre_image_tokens=pre_image_tokens,
27
+ post_image_tokens=post_image_tokens,
28
+ system_start_tokens=system_start_tokens,
29
+ system_end_tokens=system_end_tokens,
30
+ user_start_tokens=user_start_tokens,
31
+ user_end_tokens=user_end_tokens,
32
+ asst_start_tokens=asst_start_tokens,
33
+ asst_end_tokens=asst_end_tokens,
34
+ allow_system_prompt=False,
35
+ bos_token=bos_token,
36
+ )
37
+ self._image_processor = QwenImageProcessor(img_size=image_size)
processor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_helium1_casa.Helium1CASAProcessor"
4
+ },
5
+ "bos_token": 1,
6
+ "image_size": 896,
7
+ "post_image_tokens": [],
8
+ "pre_image_tokens": [],
9
+ "processor_class": "Helium1CASAProcessor"
10
+ }
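This config is what `AutoProcessor.from_pretrained(..., trust_remote_code=True)` uses (via `auto_map`) to build the processor. Constructing it by hand is roughly equivalent to the sketch below, assuming the repository files are importable locally; the repo id is illustrative.

from transformers import AutoTokenizer
from processing_helium1_casa import Helium1CASAProcessor  # file shown above

repo_id = "kyutai/Helium1-VL-2B"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(repo_id)
processor = Helium1CASAProcessor(
    tokenizer=tokenizer,
    pre_image_tokens=(),   # "pre_image_tokens": []
    post_image_tokens=(),  # "post_image_tokens": []
    bos_token=1,           # "bos_token": 1
    image_size=896,        # "image_size": 896
)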
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90cea6d2a04d6c89a9904853c22aac0c342fc193a75048f4cbee4f98b9c835d8
3
+ size 70505
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "additional_special_tokens": [
4
+ "<|im_sp_00|>",
5
+ "<|im_sp_01|>",
6
+ "<|im_sp_02|>",
7
+ "<|im_sp_94|>",
8
+ "<|im_sp_95|>",
9
+ "<|im_sp_96|>",
10
+ "<|im_sp_97|>",
11
+ "<|im_sp_98|>",
12
+ "<|im_sp_99|>"
13
+ ]
14
+ }
utils.py ADDED
@@ -0,0 +1,116 @@
1
+ # pylint: disable=protected-access
2
+ """Utils to handle CASA layers construction"""
3
+
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, fields
6
+ from typing import Any, Callable, Generic, TypeVar
7
+
8
+ import torch
9
+
10
+
11
+ def delta_w_factory(
12
+ org_lin: torch.nn.Linear, new_lin: torch.nn.Linear
13
+ ) -> Callable[[torch.Tensor], torch.Tensor]:
14
+ """Factory for building linear op where the weights are the sum of two layers' weights"""
15
+
16
+ def _delta_w_fwd(input: torch.Tensor) -> torch.Tensor:
17
+ nonlocal org_lin, new_lin
18
+ bias = None if org_lin.bias is None else org_lin.bias + new_lin.bias
19
+ return torch.nn.functional.linear(input, org_lin.weight + new_lin.weight, bias)
20
+
21
+ return _delta_w_fwd
22
+
23
+
24
+ @dataclass
25
+ class StreamingState:
26
+ """Streaming State used by CASA layers at inference to save
27
+ e.g. the offset, the KV Cache and other persistent states"""
28
+
29
+ offset: int = 0
30
+
31
+ def _is_valid_field(self, key: str) -> bool:
32
+ return key in {x.name for x in fields(self)}
33
+
34
+ def _init_field(self, key: str) -> None:
35
+ """Init function for non-arggment dependent defauls"""
36
+ assert self._is_valid_field(key)
37
+ if key == "offset":
38
+ self.offset = 0
39
+ else:
40
+ # for fields which should be set explicitly and cannot be auto-initialized
41
+ setattr(self, key, None)
42
+
43
+ def init(self) -> None:
44
+ for key in [x.name for x in fields(self)]:
45
+ self._init_field(key)
46
+
47
+ def _reset_field(self, name: str) -> None:
48
+ """Resets the given field"""
49
+ self._init_field(name)
50
+
51
+ def reset(self) -> None:
52
+ for f in fields(self):
53
+ self._reset_field(f.name)
54
+
55
+ def _get_field(self, f: str) -> Any:
56
+ """Get field and init if not"""
57
+ assert self._is_valid_field(f)
58
+ if getattr(self, f) is None:
59
+ self._init_field(f)
60
+ return getattr(self, f)
61
+
62
+ def _set_field(self, f: str, value: Any) -> None:
63
+ assert self._is_valid_field(f)
64
+ setattr(self, f, value)
65
+
66
+
67
+ StreamingStateT = TypeVar("StreamingStateT", bound=StreamingState)
68
+
69
+
70
+ class StreamingModule(torch.nn.Module, Generic[StreamingStateT]): # pylint: disable=abstract-method
71
+ """Overrides Audiocraft's Streaming modules with additional small utils"""
72
+
73
+ def __init__(self, state_class: type) -> None:
74
+ torch.nn.Module.__init__(self)
75
+ self.is_streaming: bool = False
76
+ self.enable_viz: tuple[str, ...] = ()
77
+ self._streaming_state: StreamingStateT = state_class()
78
+
79
+ @property
80
+ def streaming_state(self) -> StreamingStateT:
81
+ return self._streaming_state
82
+
83
+ def _apply_named_streaming(self, fn: Callable):
84
+ """Apply function to all streaming modules"""
85
+ for name, module in self.named_modules():
86
+ if isinstance(module, StreamingModule):
87
+ fn(name, module)
88
+
89
+ def reset_streaming(self):
90
+ """Reset the streaming state."""
91
+
92
+ def _reset(_: str, module: StreamingModule):
93
+ module._streaming_state.reset()
94
+
95
+ self._apply_named_streaming(_reset)
96
+
97
+ def _set_streaming(self, streaming: bool, viz: tuple[str, ...] = ()):
98
+ """Set all streaming modules in streaming mode"""
99
+
100
+ def _set_streaming(_, module: StreamingModule) -> None:
101
+ module.is_streaming = streaming
102
+ module.enable_viz = viz
103
+ if streaming:
104
+ module.streaming_state.init()
105
+
106
+ self._apply_named_streaming(_set_streaming)
107
+
108
+ @contextmanager
109
+ def streaming(self, stream: bool = True, viz: tuple[str, ...] = ()):
110
+ """Context manager to enter streaming mode. Reset streaming state on exit."""
111
+ self._set_streaming(stream, viz)
112
+ try:
113
+ yield
114
+ finally:
115
+ self._set_streaming(False, ())
116
+ self.reset_streaming()
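To show how the streaming plumbing is meant to be used, here is a toy sketch built on the classes above (the `CountingLayer` module is made up for illustration; real CASA attention layers keep their KV cache and offsets in the streaming state). It assumes this file is importable as `utils`.

import torch

from utils import StreamingModule, StreamingState


class CountingLayer(StreamingModule[StreamingState]):
    """Toy module (illustration only): tracks how many positions it has consumed."""

    def __init__(self) -> None:
        super().__init__(state_class=StreamingState)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.is_streaming:
            # The offset persists across calls while streaming is active.
            self.streaming_state.offset += x.shape[-1]
        return x


layer = CountingLayer()
with layer.streaming():
    for chunk in torch.randn(4, 8).split(2, dim=-1):  # four chunks of width 2
        layer(chunk)
    print(layer.streaming_state.offset)  # 8
print(layer.streaming_state.offset)      # 0: streaming() resets the state on exit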