File size: 10,609 Bytes

6a109bf

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Text encoder implementation in PyTorch."""

import typing as t

import numpy as np
import sentencepiece as spm
import torch
from torch import nn
import torch.nn.functional as F


class Tokenizer(object):
  """A simple tokenizer using SentencePiece."""

  def __init__(self, tokenizer_path: str):
    self.sp = spm.SentencePieceProcessor(model_file=tokenizer_path)
    # Match tensorflow_text.SentencepieceTokenizer(add_bos=False, add_eos=False)
    self.sp.SetEncodeExtraOptions("")
    # Explicitly disable BOS/EOS to match the reference Colab implementation.
    self._add_bos = False
    self._add_eos = False

  def tokenize(self, input_texts, max_len=64):
    if isinstance(input_texts, str):
      input_texts = [input_texts]
    batch_ids = [
        self.sp.encode(t.lower(), add_bos=self._add_bos, add_eos=self._add_eos)
        for t in input_texts
    ]
    tokens = np.zeros((len(batch_ids), max_len), dtype=np.int64)
    for i, ids in enumerate(batch_ids):
      length = min(len(ids), max_len)
      tokens[i, :length] = ids[:length]
    is_padding = (tokens == 0).astype(np.int32)
    return tokens, is_padding


class PositionalEmbedding(nn.Module):
  """Generates position embedding for a given 1-d sequence.

  Attributes:
    min_timescale: Start of the geometric index. Determines the periodicity of
      the added signal.
    max_timescale: End of the geometric index. Determines the frequency of the
      added signal.
    embedding_dim: Dimension of the embedding to be generated.
  """

  min_timescale: int = 1
  max_timescale: int = 10_000
  embedding_dim: int = 0

  def __init__(self, embedding_dim: int):
    super().__init__()
    self.embedding_dim = embedding_dim

  def __call__(self, seq_length: int = None, position: torch.tensor = None):
    """Generates a torch.tensor of sinusoids with different frequencies.

    Args:
      seq_length: an optional Python int defining the output sequence length.
        if the `position` argument is specified.
      position:   [B, seq_length], optional position for each token in the
        sequence, only required when the sequence is packed.

    Returns:
      [B, seqlen, D] if `position` is specified, else [1, seqlen, D]
    """
    if position is None:
      assert seq_length is not None
      # [1, seqlen]
      position = torch.arange(seq_length, dtype=torch.float32)[None, :]
    else:
      assert position.ndim == 2, position.shape

    num_timescales = self.embedding_dim // 2
    log_timescale_increment = torch.log(
        torch.tensor(float(self.max_timescale) / float(self.min_timescale))
    ) / torch.maximum(
        torch.tensor(num_timescales, dtype=torch.float32) - 1, torch.tensor(1)
    )
    inv_timescales = self.min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float32)
        * -log_timescale_increment
    )
    scaled_time = position[:, :, None] * inv_timescales[None, None, :]
    signal = torch.cat((torch.sin(scaled_time), torch.cos(scaled_time)), dim=2)
    # Force usage of `np` rather than `jnp` to compute static values at trace
    # time.
    signal = F.pad(signal, (0, self.embedding_dim % 2, 0, 0, 0, 0))
    return signal


class MlpBlockWithMask(nn.Module):
  """Transformer MLP / feed-forward block that supports masking."""

  def __init__(
      self,
      mlp_dim: int,
      d_model: int,
      use_bias: bool = True,
      dtype: torch.dtype = torch.float32,
      activation_fn: nn.Module = nn.GELU,
  ):
    super().__init__()

    self.mlp_dim = mlp_dim
    self.d_model = d_model
    self.use_bias = use_bias
    self.dtype = dtype
    self.activation_fn = activation_fn

    self.c_fc = nn.Linear(
        in_features=self.d_model,
        out_features=self.mlp_dim,
        dtype=self.dtype,
        bias=self.use_bias,
    )
    self.c_proj = nn.Linear(
        in_features=self.mlp_dim,
        out_features=self.d_model,
        dtype=self.dtype,
        bias=self.use_bias,
    )

  def __call__(
      self, inputs: torch.Tensor, mlp_mask: torch.Tensor
  ) -> torch.Tensor:
    """Applies Transformer MlpBlock with mask module."""
    x = self.c_fc(inputs)
    x = self.activation_fn()(x)
    x = x * mlp_mask[..., None]  # First masking.
    x = self.c_proj(x)
    x = x * mlp_mask[..., None]  # Second masking.
    return x


class ResidualAttentionBlock(nn.Module):
  """Transformer residual attention block."""

  def __init__(
      self,
      d_model: int,
      n_head: int,
      mlp_dim: int,
      dtype: torch.dtype = torch.float32,
  ):
    super().__init__()
    self.d_model = d_model
    self.n_head = n_head
    self.mlp_dim = mlp_dim
    self.dtype = dtype

    self.attn = nn.MultiheadAttention(d_model, n_head, dtype=self.dtype)
    self.ln_1 = nn.LayerNorm(d_model, dtype=self.dtype)
    self.mlp = MlpBlockWithMask(
        self.mlp_dim,
        d_model,
        use_bias=True,
        dtype=self.dtype,
        activation_fn=nn.ReLU,
    )
    self.ln_2 = nn.LayerNorm(d_model, dtype=self.dtype)

  def attention(self, x: torch.Tensor, mask: torch.Tensor):
    attn_mask = (
        mask[:, None, None, :]
        .repeat(1, self.n_head, x.shape[0], 1)
        .flatten(0, 1)
    )
    attn_mask[attn_mask == 0] = float('-inf')
    attn_mask[attn_mask == 1] = 0
    return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]

  def forward(self, x: torch.Tensor, mask: torch.Tensor):
    x = x + self.attention(self.ln_1(x), mask.permute(1, 0))
    x = x + self.mlp(self.ln_2(x), mask)
    return x, mask


class SequentialMultiInput(nn.Sequential):
  """Sequential module that can take multiple inputs."""

  def forward(self, *inputs):
    for module in self._modules.values():
      if isinstance(inputs, tuple):
        inputs = module(*inputs)
      else:
        inputs = module(inputs)
    return inputs


class Transformer(nn.Module):
  """Transformer implementation."""

  def __init__(
      self,
      width: int,
      layers: int,
      heads: int,
      mlp_dim: int,
      dtype: torch.dtype = torch.float32,
  ):
    super().__init__()
    self.width = width
    self.layers = layers
    self.heads = heads
    self.mlp_dim = mlp_dim
    self.dtype = dtype

    self.resblocks = SequentialMultiInput(*[
        ResidualAttentionBlock(self.width, self.heads, self.mlp_dim, self.dtype)
        for _ in range(self.layers)
    ])

  def forward(self, x: torch.Tensor, mask: torch.Tensor):
    return self.resblocks(x, mask)[0]


class GlobalAvgPooling(nn.Module):
  """Performs a simple global pooling over the input with optional paddings.

  Attributes:
    pooling_dims: A list of dims to perform pooling over.
    keepdims: If True, keep dimension of inputs after pooling.
  """

  pooling_dims: t.Sequence[int]
  epsilon: float = 1e-8

  def __init__(
      self, pooling_dims: t.Sequence[int], epsilon: float = 1e-8
  ):
    super().__init__()
    self.pooling_dims = pooling_dims
    self.epsilon = epsilon

    if not all([p_dims >= 0 for p_dims in self.pooling_dims]):
      raise ValueError('pooling_dims must be non-negative integers.')

  def __call__(
      self,
      inputs: torch.tensor,
      compatible_paddings: torch.tensor,
  ):
    """Applies global average spatial pooling to inputs.

    Args:
      inputs: An input tensor.
      compatible_paddings: paddings of inputs with shapes compatible with
        inputs, e.g. compatible_paddings with shape [B, 1] for inputs with shape
        [B, D].

    Returns:
      Output tensor with global pooling applied.
    """
    padded_value = torch.zeros_like(inputs)
    padded_value = torch.ones_like(inputs) * padded_value
    inputs = torch.where(compatible_paddings > 0, padded_value, inputs)
    valid_inputs = (
        torch.sum(
            1.0 - compatible_paddings,
            self.pooling_dims,
            keepdims=True,
            dtype=inputs.dtype,
        )
        + self.epsilon
    )
    inputs_sum = torch.sum(inputs, self.pooling_dims, keepdims=True)
    outputs = torch.divide(inputs_sum, valid_inputs).type(inputs.dtype)
    outputs = torch.squeeze(outputs, axis=self.pooling_dims)
    return outputs


class TextEncoder(nn.Module):
  """Text encoder implementation."""

  def __init__(
      self,
      config: t.Dict[str, int],
      vocab_size: int,
      dtype: torch.dtype = torch.float32,
      scale_sqrt_depth: bool = True,
  ):
    super().__init__()
    self.vocab_size = vocab_size
    self.dtype = dtype
    self.scale_sqrt_depth = scale_sqrt_depth

    # The text tower layers are fixed independent of vision tower size.
    self.transformer_layers = config['num_layers']
    self.embedding_dim = config['hidden_size']
    self.transformer_width = config['hidden_size']
    self.mlp_dim = config['mlp_dim']
    self.transformer_heads = config['num_heads']

    self.token_embedding = nn.Embedding(
        self.vocab_size, self.embedding_dim, dtype=self.dtype
    )
    self.pos_embedder = PositionalEmbedding(embedding_dim=self.embedding_dim)
    self.transformer = Transformer(
        width=self.transformer_width,
        layers=self.transformer_layers,
        heads=self.transformer_heads,
        mlp_dim=self.mlp_dim,
        dtype=self.dtype,
    )
    self.pooling = GlobalAvgPooling(pooling_dims=[1])
    self.ln_final = nn.LayerNorm(self.transformer_width, dtype=self.dtype)

  def __call__(
      self,
      ids: torch.tensor,
      paddings: torch.tensor,
  ):
    """Applies TextEncoder module."""
    _, seq_length = ids.shape
    mask = (paddings == 0).type(torch.float32)
    mask = mask.permute(1, 0)  # NL -> LN
    x = self.token_embedding(ids)
    if self.scale_sqrt_depth:
      x = x * (self.embedding_dim**0.5)
    x = x + self.pos_embedder(seq_length=seq_length).to(x.device)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x, mask)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x = self.ln_final(x)
    x = self.pooling(x, compatible_paddings=paddings[:, :, None])
    return x