IrohXu committed
Commit de06036 · 1 Parent(s): 487b94e

Update Weight

Files changed (4)
  1. config.json +53 -0
  2. configuration_sapiens.py +127 -0
  3. model.safetensors +3 -0
  4. modeling_sapiens.py +621 -0
config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "architectures": [
+     "SapiensGaitModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_sapiens.SapiensGaitConfig",
+     "AutoModel": "modeling_sapiens.SapiensGaitModel"
+   },
+   "model_type": "sapiens_gait",
+   "image_size": [
+     1024,
+     768
+   ],
+   "patch_size": 16,
+   "patch_padding": 2,
+   "image_mean": [
+     123.675,
+     116.28,
+     103.53
+   ],
+   "image_std": [
+     58.395,
+     57.12,
+     57.375
+   ],
+   "in_channels": 3,
+   "embed_dims": 1024,
+   "num_layers": 24,
+   "num_heads": 16,
+   "feedforward_channels": 4096,
+   "drop_rate": 0.0,
+   "drop_path_rate": 0.0,
+   "qkv_bias": true,
+   "num_keypoints": 133,
+   "deconv_out_channels": [
+     768,
+     768
+   ],
+   "deconv_kernel_sizes": [
+     4,
+     4
+   ],
+   "conv_out_channels": [
+     768,
+     768
+   ],
+   "conv_kernel_sizes": [
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.0"
+ }
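Because config.json wires AutoConfig and AutoModel to the custom classes via auto_map, the checkpoint can be loaded directly with trust_remote_code. A minimal sketch, assuming the repo id is a placeholder for the actual Hub path:

    from transformers import AutoConfig, AutoModel

    repo = "IrohXu/sapiens-gait"  # placeholder; substitute the real repo id or local path
    config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
    model = AutoModel.from_pretrained(repo, trust_remote_code=True)
    print(config.embed_dims, config.num_layers)  # 1024, 24 for this checkpoint (sapiens_0.3b)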
configuration_sapiens.py ADDED
@@ -0,0 +1,127 @@
+ """Sapiens Gait Model Configuration (Pure HuggingFace, no OpenMMLab dependency)."""
+
+ from transformers import PretrainedConfig
+
+ # Pre-defined architecture variants
+ SAPIENS_ARCH_ZOO = {
+     "sapiens_0.3b": {
+         "embed_dims": 1024,
+         "num_layers": 24,
+         "num_heads": 16,
+         "feedforward_channels": 4096,
+     },
+     "sapiens_0.6b": {
+         "embed_dims": 1280,
+         "num_layers": 32,
+         "num_heads": 16,
+         "feedforward_channels": 5120,
+     },
+     "sapiens_1b": {
+         "embed_dims": 1536,
+         "num_layers": 40,
+         "num_heads": 24,
+         "feedforward_channels": 6144,
+     },
+     "sapiens_2b": {
+         "embed_dims": 1920,
+         "num_layers": 48,
+         "num_heads": 32,
+         "feedforward_channels": 7680,
+     },
+ }
+
+
+ class SapiensGaitConfig(PretrainedConfig):
+     """Configuration class for the Sapiens Gait pose estimation model.
+
+     This configuration stores all architecture parameters needed to build
+     the Sapiens model natively in PyTorch/HuggingFace without any OpenMMLab
+     dependency.
+
+     Args:
+         arch (str, optional): Architecture variant name. One of
+             "sapiens_0.3b", "sapiens_0.6b", "sapiens_1b", "sapiens_2b".
+             If provided, overrides embed_dims/num_layers/num_heads/feedforward_channels.
+         image_size (list[int]): Input image size as [height, width].
+             Defaults to [1024, 768].
+         patch_size (int): Patch size for the ViT backbone. Defaults to 16.
+         in_channels (int): Number of input image channels. Defaults to 3.
+         embed_dims (int): Embedding dimension. Defaults to 1920 (sapiens_2b).
+         num_layers (int): Number of transformer layers. Defaults to 48.
+         num_heads (int): Number of attention heads. Defaults to 32.
+         feedforward_channels (int): Hidden dim of the FFN. Defaults to 7680.
+         drop_rate (float): Dropout rate. Defaults to 0.0.
+         drop_path_rate (float): Stochastic depth rate. Defaults to 0.0.
+         qkv_bias (bool): Whether to use bias in the QKV projection. Defaults to True.
+         patch_padding (int): Padding for the patch embedding conv. Defaults to 2.
+         num_keypoints (int): Number of output keypoints. Defaults to 17.
+         deconv_out_channels (list[int]): Output channels for deconv layers.
+         deconv_kernel_sizes (list[int]): Kernel sizes for deconv layers.
+         conv_out_channels (list[int]): Output channels for conv layers in the head.
+         conv_kernel_sizes (list[int]): Kernel sizes for conv layers in the head.
+         image_mean (list[float]): Normalization mean (RGB).
+         image_std (list[float]): Normalization std (RGB).
+     """
+
+     model_type = "sapiens_gait"
+
+     def __init__(
+         self,
+         arch=None,
+         image_size=None,
+         patch_size=16,
+         in_channels=3,
+         embed_dims=1920,
+         num_layers=48,
+         num_heads=32,
+         feedforward_channels=7680,
+         drop_rate=0.0,
+         drop_path_rate=0.0,
+         qkv_bias=True,
+         patch_padding=2,
+         num_keypoints=17,
+         deconv_out_channels=None,
+         deconv_kernel_sizes=None,
+         conv_out_channels=None,
+         conv_kernel_sizes=None,
+         image_mean=None,
+         image_std=None,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         # Apply architecture preset if specified
+         if arch is not None:
+             if arch not in SAPIENS_ARCH_ZOO:
+                 raise ValueError(
+                     f"Unknown arch '{arch}'. Choose from: {list(SAPIENS_ARCH_ZOO.keys())}"
+                 )
+             preset = SAPIENS_ARCH_ZOO[arch]
+             embed_dims = preset["embed_dims"]
+             num_layers = preset["num_layers"]
+             num_heads = preset["num_heads"]
+             feedforward_channels = preset["feedforward_channels"]
+
+         # Backbone (ViT) parameters
+         self.image_size = image_size if image_size is not None else [1024, 768]
+         self.patch_size = patch_size
+         self.in_channels = in_channels
+         self.embed_dims = embed_dims
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.feedforward_channels = feedforward_channels
+         self.drop_rate = drop_rate
+         self.drop_path_rate = drop_path_rate
+         self.qkv_bias = qkv_bias
+         self.patch_padding = patch_padding
+
+         # Head parameters
+         self.num_keypoints = num_keypoints
+         self.deconv_out_channels = list(deconv_out_channels) if deconv_out_channels is not None else [768, 768]
+         self.deconv_kernel_sizes = list(deconv_kernel_sizes) if deconv_kernel_sizes is not None else [4, 4]
+         self.conv_out_channels = list(conv_out_channels) if conv_out_channels is not None else [768, 768]
+         self.conv_kernel_sizes = list(conv_kernel_sizes) if conv_kernel_sizes is not None else [1, 1]
+
+         # Preprocessing (for reference; user applies externally)
+         self.image_mean = image_mean if image_mean is not None else [123.675, 116.28, 103.53]
+         self.image_std = image_std if image_std is not None else [58.395, 57.12, 57.375]
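The arch presets let you switch variants without retyping the backbone dimensions. A quick sketch of the override behavior, grounded in the code above:

    from configuration_sapiens import SapiensGaitConfig

    # When arch is set, the preset overrides embed_dims / num_layers /
    # num_heads / feedforward_channels; other arguments pass through.
    cfg = SapiensGaitConfig(arch="sapiens_0.3b", num_keypoints=133)
    assert (cfg.embed_dims, cfg.num_layers, cfg.num_heads) == (1024, 24, 16)

    cfg_big = SapiensGaitConfig(arch="sapiens_2b")
    assert cfg_big.feedforward_channels == 7680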
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3276793c19e48030c1770ff356e2b2361ec4c10b6e6b4f26d1210ca55dc1044
+ size 1318227164
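This is a Git LFS pointer, not the weights themselves; the oid is the SHA-256 of the real file. After pulling the LFS object, an integrity check could look like this sketch (the local path is an assumption):

    import hashlib

    path = "model.safetensors"  # hypothetical local path to the pulled LFS object
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    print(h.hexdigest() == "a3276793c19e48030c1770ff356e2b2361ec4c10b6e6b4f26d1210ca55dc1044")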
modeling_sapiens.py ADDED
@@ -0,0 +1,621 @@
+ """
+ Sapiens Gait Model — Pure HuggingFace Transformers implementation.
+
+ No dependency on mmengine, mmcv, mmpose, or mmpretrain.
+ Weight key names are designed to exactly match the original OpenMMLab
+ checkpoint layout so existing safetensors can be loaded directly.
+
+ Architecture:
+     SapiensGaitModel (PreTrainedModel)
+     └── backbone (SapiensTopdownPoseEstimator)
+         ├── backbone (SapiensVisionTransformer)
+         │   ├── patch_embed.projection (Conv2d)
+         │   ├── pos_embed (Parameter)
+         │   ├── layers[i].ln1 (LayerNorm)
+         │   ├── layers[i].attn.qkv (Linear)
+         │   ├── layers[i].attn.proj (Linear)
+         │   ├── layers[i].ln2 (LayerNorm)
+         │   ├── layers[i].ffn.layers (Sequential)
+         │   └── ln1 (final LayerNorm)
+         └── head (SapiensHeatmapHead)
+             ├── deconv_layers (Sequential[ConvTranspose2d, InstanceNorm2d, SiLU, ...])
+             ├── conv_layers (Sequential[Conv2d, InstanceNorm2d, SiLU, ...])
+             └── final_layer (Conv2d 1x1)
+ """
+
+ import math
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import ModelOutput
+
+ try:
+     from .configuration_sapiens import SapiensGaitConfig
+ except ImportError:
+     from configuration_sapiens import SapiensGaitConfig
+
+
+ # ---------------------------------------------------------------------------
+ # Utility: Stochastic Depth (DropPath)
+ # ---------------------------------------------------------------------------
+ class DropPath(nn.Module):
+     """Drop paths (Stochastic Depth) per sample."""
+
+     def __init__(self, drop_prob: float = 0.0):
+         super().__init__()
+         self.drop_prob = drop_prob
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         if not self.training or self.drop_prob == 0.0:
+             return x
+         keep_prob = 1.0 - self.drop_prob
+         shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+         random_tensor = torch.rand(shape, dtype=x.dtype, device=x.device)
+         random_tensor = torch.floor_(random_tensor + keep_prob)
+         return x.div(keep_prob) * random_tensor
+
+     def extra_repr(self) -> str:
+         return f"drop_prob={self.drop_prob}"
+
+
+ # ---------------------------------------------------------------------------
+ # Utility: Resize positional embedding via interpolation
+ # ---------------------------------------------------------------------------
+ def resize_pos_embed(
+     pos_embed: torch.Tensor,
+     src_shape: Tuple[int, int],
+     dst_shape: Tuple[int, int],
+     mode: str = "bicubic",
+     num_extra_tokens: int = 0,
+ ) -> torch.Tensor:
+     """Resize positional embedding from *src_shape* to *dst_shape*.
+
+     Works on a (1, N, C) tensor where N = num_extra_tokens + H*W.
+     """
+     if src_shape == dst_shape:
+         return pos_embed
+
+     extra_tokens = pos_embed[:, :num_extra_tokens] if num_extra_tokens > 0 else None
+     patch_pos_embed = pos_embed[:, num_extra_tokens:]  # (1, H_s*W_s, C)
+
+     src_h, src_w = src_shape
+     dst_h, dst_w = dst_shape
+     C = patch_pos_embed.shape[-1]
+
+     # (1, H_s*W_s, C) -> (1, C, H_s, W_s)
+     patch_pos_embed = patch_pos_embed.reshape(1, src_h, src_w, C).permute(0, 3, 1, 2).float()
+     # Interpolate to (1, C, H_d, W_d)
+     patch_pos_embed = F.interpolate(patch_pos_embed, size=(dst_h, dst_w), mode=mode, align_corners=False)
+     # (1, C, H_d, W_d) -> (1, H_d*W_d, C)
+     patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, C)
+
+     if extra_tokens is not None:
+         patch_pos_embed = torch.cat([extra_tokens.float(), patch_pos_embed], dim=1)
+
+     return patch_pos_embed.to(pos_embed.dtype)
+
+
+ # ---------------------------------------------------------------------------
+ # Patch Embedding
+ # ---------------------------------------------------------------------------
+ class SapiensPatchEmbed(nn.Module):
+     """Image-to-patch embedding using a single Conv2d.
+
+     Matches the mmcv ``PatchEmbed`` weight layout::
+
+         patch_embed.projection.weight (embed_dims, in_channels, kH, kW)
+         patch_embed.projection.bias (embed_dims,)
+     """
+
+     def __init__(
+         self,
+         in_channels: int,
+         embed_dims: int,
+         kernel_size: int,
+         stride: int,
+         padding: int,
+         input_size: Tuple[int, int],
+     ):
+         super().__init__()
+         self.projection = nn.Conv2d(
+             in_channels,
+             embed_dims,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+         )
+         # Pre-compute the initial output resolution (used for pos_embed sizing)
+         h, w = input_size
+         out_h = (h + 2 * padding - kernel_size) // stride + 1
+         out_w = (w + 2 * padding - kernel_size) // stride + 1
+         self.init_out_size = (out_h, out_w)
+
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]:
+         x = self.projection(x)  # (B, C, H_out, W_out)
+         out_size = (x.shape[2], x.shape[3])
+         x = x.flatten(2).transpose(1, 2)  # (B, N, C)
+         return x, out_size
+
+
+ # ---------------------------------------------------------------------------
+ # Multi-Head Self-Attention (with fused QKV)
+ # ---------------------------------------------------------------------------
+ class SapiensAttention(nn.Module):
+     """Multi-head self-attention with fused QKV linear.
+
+     Weight layout::
+
+         attn.qkv.weight (3*embed_dims, embed_dims)
+         attn.qkv.bias (3*embed_dims,)
+         attn.proj.weight (embed_dims, embed_dims)
+         attn.proj.bias (embed_dims,)
+     """
+
+     def __init__(
+         self,
+         embed_dims: int,
+         num_heads: int,
+         attn_drop: float = 0.0,
+         proj_drop: float = 0.0,
+         drop_path_rate: float = 0.0,
+         qkv_bias: bool = True,
+     ):
+         super().__init__()
+         self.embed_dims = embed_dims
+         self.num_heads = num_heads
+         self.head_dims = embed_dims // num_heads
+         self.scale = self.head_dims ** -0.5
+
+         self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(embed_dims, embed_dims)
+         self.proj_drop = nn.Dropout(proj_drop)
+         self.out_drop = DropPath(drop_path_rate)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, self.head_dims)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = qkv.unbind(0)  # each (B, heads, N, head_dim)
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         x = self.out_drop(x)
+         return x
+
+
+ # ---------------------------------------------------------------------------
+ # Feed-Forward Network (matches mmcv FFN sequential layout)
+ # ---------------------------------------------------------------------------
+ class SapiensFFN(nn.Module):
+     """Two-layer MLP with GELU activation.
+
+     The internal ``self.layers`` is structured as::
+
+         Sequential(
+             Sequential(Linear, GELU, Dropout),  # index 0
+             Linear,                             # index 1
+             Dropout,                            # index 2
+         )
+
+     This ensures the weight keys are::
+
+         ffn.layers.0.0.weight (fc1 weight)
+         ffn.layers.0.0.bias (fc1 bias)
+         ffn.layers.1.weight (fc2 weight)
+         ffn.layers.1.bias (fc2 bias)
+     """
+
+     def __init__(
+         self,
+         embed_dims: int,
+         feedforward_channels: int,
+         ffn_drop: float = 0.0,
+         drop_path_rate: float = 0.0,
+     ):
+         super().__init__()
+         self.layers = nn.Sequential(
+             nn.Sequential(
+                 nn.Linear(embed_dims, feedforward_channels),
+                 nn.GELU(),
+                 nn.Dropout(ffn_drop),
+             ),
+             nn.Linear(feedforward_channels, embed_dims),
+             nn.Dropout(ffn_drop),
+         )
+         self.dropout_layer = DropPath(drop_path_rate)
+
+     def forward(self, x: torch.Tensor, identity: Optional[torch.Tensor] = None) -> torch.Tensor:
+         out = self.layers(x)
+         if identity is None:
+             identity = x
+         return identity + self.dropout_layer(out)
+
+
+ # ---------------------------------------------------------------------------
+ # Transformer Encoder Layer
+ # ---------------------------------------------------------------------------
+ class SapiensTransformerLayer(nn.Module):
+     """Pre-norm Transformer encoder layer.
+
+     Architecture::
+
+         x = x + attn(ln1(x))
+         x = ffn(ln2(x), identity=x)  # residual handled inside FFN
+     """
+
+     def __init__(
+         self,
+         embed_dims: int,
+         num_heads: int,
+         feedforward_channels: int,
+         drop_rate: float = 0.0,
+         attn_drop_rate: float = 0.0,
+         drop_path_rate: float = 0.0,
+         qkv_bias: bool = True,
+     ):
+         super().__init__()
+         self.ln1 = nn.LayerNorm(embed_dims, eps=1e-6)
+         self.attn = SapiensAttention(
+             embed_dims=embed_dims,
+             num_heads=num_heads,
+             attn_drop=attn_drop_rate,
+             proj_drop=drop_rate,
+             drop_path_rate=drop_path_rate,
+             qkv_bias=qkv_bias,
+         )
+         self.ln2 = nn.LayerNorm(embed_dims, eps=1e-6)
+         self.ffn = SapiensFFN(
+             embed_dims=embed_dims,
+             feedforward_channels=feedforward_channels,
+             ffn_drop=drop_rate,
+             drop_path_rate=drop_path_rate,
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = x + self.attn(self.ln1(x))
+         x = self.ffn(self.ln2(x), identity=x)
+         return x
+
+
+ # ---------------------------------------------------------------------------
+ # Vision Transformer Backbone
+ # ---------------------------------------------------------------------------
+ class SapiensVisionTransformer(nn.Module):
+     """Sapiens Vision Transformer backbone (no CLS token, feature-map output).
+
+     Key weight names::
+
+         patch_embed.projection.{weight,bias}
+         pos_embed
+         layers.{i}.ln1.{weight,bias}
+         layers.{i}.attn.qkv.{weight,bias}
+         layers.{i}.attn.proj.{weight,bias}
+         layers.{i}.ln2.{weight,bias}
+         layers.{i}.ffn.layers.0.0.{weight,bias}
+         layers.{i}.ffn.layers.1.{weight,bias}
+         ln1.{weight,bias}
+     """
+
+     def __init__(self, config: SapiensGaitConfig):
+         super().__init__()
+         self.embed_dims = config.embed_dims
+         self.num_layers = config.num_layers
+
+         # Patch embedding
+         self.patch_embed = SapiensPatchEmbed(
+             in_channels=config.in_channels,
+             embed_dims=config.embed_dims,
+             kernel_size=config.patch_size,
+             stride=config.patch_size,
+             padding=config.patch_padding,
+             input_size=tuple(config.image_size),
+         )
+         self.patch_resolution = self.patch_embed.init_out_size
+         num_patches = self.patch_resolution[0] * self.patch_resolution[1]
+
+         # Positional embedding (no CLS token)
+         self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, config.embed_dims))
+         self.drop_after_pos = nn.Dropout(p=config.drop_rate)
+
+         # Stochastic depth schedule
+         dpr = np.linspace(0, config.drop_path_rate, config.num_layers).tolist()
+
+         # Transformer encoder layers
+         self.layers = nn.ModuleList(
+             [
+                 SapiensTransformerLayer(
+                     embed_dims=config.embed_dims,
+                     num_heads=config.num_heads,
+                     feedforward_channels=config.feedforward_channels,
+                     drop_rate=config.drop_rate,
+                     attn_drop_rate=0.0,
+                     drop_path_rate=dpr[i],
+                     qkv_bias=config.qkv_bias,
+                 )
+                 for i in range(config.num_layers)
+             ]
+         )
+
+         # Final LayerNorm
+         self.ln1 = nn.LayerNorm(config.embed_dims, eps=1e-6)
+
+     # ---- Load hook: resize / strip CLS from saved pos_embed if needed -----
+     def _prepare_pos_embed(self, state_dict, prefix, *args, **kwargs):
+         name = prefix + "pos_embed"
+         if name not in state_dict:
+             return
+
+         ckpt_pe = state_dict[name]
+         model_pe = self.pos_embed
+
+         # If the checkpoint has one extra token (CLS) but the model doesn't -> strip it
+         if ckpt_pe.shape[1] == model_pe.shape[1] + 1:
+             state_dict[name] = ckpt_pe[:, 1:]
+             ckpt_pe = state_dict[name]
+         elif ckpt_pe.shape[1] != model_pe.shape[1] and ckpt_pe.shape[1] % 2 == 1:
+             # An odd number of tokens likely means a CLS token is present
+             state_dict[name] = ckpt_pe[:, 1:]
+             ckpt_pe = state_dict[name]
+
+         # If the spatial resolution differs -> interpolate
+         if ckpt_pe.shape != model_pe.shape:
+             num_ckpt_patches = ckpt_pe.shape[1]
+             ckpt_h = ckpt_w = int(math.sqrt(num_ckpt_patches))
+             if ckpt_h * ckpt_w != num_ckpt_patches:
+                 # Non-square: try to infer the grid from the aspect ratio.
+                 # Fallback: assume the same aspect ratio as the model.
+                 ratio = self.patch_resolution[0] / self.patch_resolution[1]
+                 ckpt_h = int(math.sqrt(num_ckpt_patches * ratio))
+                 ckpt_w = num_ckpt_patches // ckpt_h
+             state_dict[name] = resize_pos_embed(
+                 ckpt_pe,
+                 (ckpt_h, ckpt_w),
+                 self.patch_resolution,
+                 mode="bicubic",
+                 num_extra_tokens=0,
+             )
+
+     def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
+         B = x.shape[0]
+
+         # Patch embedding
+         x, patch_resolution = self.patch_embed(x)
+
+         # Add the (possibly interpolated) positional embedding
+         x = x + resize_pos_embed(
+             self.pos_embed,
+             self.patch_resolution,
+             patch_resolution,
+             mode="bicubic",
+             num_extra_tokens=0,
+         )
+         x = self.drop_after_pos(x)
+
+         hidden_states = []
+         # Transformer layers
+         for layer in self.layers:
+             x = layer(x)
+             hidden_states.append(x)  # save each layer's output for downstream use
+
+         # Final norm
+         x = self.ln1(x)
+
+         # Reshape to a 2-D feature map: (B, N, C) -> (B, C, H, W)
+         x_mapping = x.reshape(B, *patch_resolution, -1).permute(0, 3, 1, 2)
+
+         outputs = {
+             "feat_1d_tokens": x,  # (B, N, C)
+             "feat_2d_tokens": x_mapping,  # (B, C, H, W)
+             "hidden_states": hidden_states,  # list of (B, N, C), one per layer
+         }
+
+         return outputs
+
+
+ # ---------------------------------------------------------------------------
+ # Heatmap Head (deconv upsampler)
+ # ---------------------------------------------------------------------------
+ class SapiensHeatmapHead(nn.Module):
+     """Simple-Baselines-style heatmap head with deconv + conv layers.
+
+     Uses ``InstanceNorm2d + SiLU`` (matching the Sapiens default ``use_silu=True``).
+
+     Key weight names::
+
+         deconv_layers.{0,3,...}.weight (ConvTranspose2d, no bias)
+         conv_layers.{0,3,...}.{weight,bias} (Conv2d)
+         final_layer.{weight,bias} (Conv2d 1×1)
+     """
+
+     def __init__(self, config: SapiensGaitConfig):
+         super().__init__()
+
+         in_channels = config.embed_dims
+
+         # --- Deconv (transposed-conv) upsampling layers ---
+         deconv_layers: List[nn.Module] = []
+         for out_ch, ks in zip(config.deconv_out_channels, config.deconv_kernel_sizes):
+             if ks == 4:
+                 pad, opad = 1, 0
+             elif ks == 3:
+                 pad, opad = 1, 1
+             elif ks == 2:
+                 pad, opad = 0, 0
+             else:
+                 raise ValueError(f"Unsupported deconv kernel size {ks}")
+             deconv_layers.append(
+                 nn.ConvTranspose2d(in_channels, out_ch, kernel_size=ks, stride=2, padding=pad, output_padding=opad, bias=False)
+             )
+             deconv_layers.append(nn.InstanceNorm2d(out_ch))
+             deconv_layers.append(nn.SiLU(inplace=True))
+             in_channels = out_ch
+         self.deconv_layers = nn.Sequential(*deconv_layers)
+
+         # --- 1×1 (or N×N) conv refinement layers ---
+         conv_layers: List[nn.Module] = []
+         for out_ch, ks in zip(config.conv_out_channels, config.conv_kernel_sizes):
+             pad = (ks - 1) // 2
+             conv_layers.append(
+                 nn.Conv2d(in_channels, out_ch, kernel_size=ks, stride=1, padding=pad)
+             )
+             conv_layers.append(nn.InstanceNorm2d(out_ch))
+             conv_layers.append(nn.SiLU(inplace=True))
+             in_channels = out_ch
+         self.conv_layers = nn.Sequential(*conv_layers)
+
+         # --- Final projection to keypoint heatmaps ---
+         self.final_layer = nn.Conv2d(in_channels, config.num_keypoints, kernel_size=1)
+
+     def forward(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]]) -> torch.Tensor:
+         x = feats[-1] if isinstance(feats, (list, tuple)) else feats  # last feature map, or the tensor itself
+         x = self.deconv_layers(x)
+         x = self.conv_layers(x)
+         x = self.final_layer(x)
+         return x
+
+
+ # ---------------------------------------------------------------------------
+ # Top-Down Pose Estimator (backbone + head wrapper)
+ # ---------------------------------------------------------------------------
+ class SapiensTopdownPoseEstimator(nn.Module):
+     """Wraps the ViT backbone and heatmap head.
+
+     Named ``backbone`` and ``head`` to match the original OpenMMLab key prefix:
+         backbone.backbone.… -> self.backbone.…
+         backbone.head.…     -> self.head.…
+     """
+
+     def __init__(self, config: SapiensGaitConfig):
+         super().__init__()
+         self.backbone = SapiensVisionTransformer(config)
+         self.head = SapiensHeatmapHead(config)
+
+     def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
+         outputs = self.backbone(x)
+         feats_2d_tokens = outputs["feat_2d_tokens"]
+         feats_1d_tokens = outputs["feat_1d_tokens"]
+         hidden_states = outputs["hidden_states"]
+         heatmaps = self.head(feats_2d_tokens)
+
+         outputs = {
+             "heatmaps": heatmaps,
+             "last_hidden_state": feats_1d_tokens,
+             "hidden_states": hidden_states,
+         }
+
+         return outputs
+
+
+ # ---------------------------------------------------------------------------
+ # HuggingFace PreTrainedModel
+ # ---------------------------------------------------------------------------
+ class SapiensGaitModel(PreTrainedModel):
+     """Sapiens pose-estimation model as a HuggingFace ``PreTrainedModel``.
+
+     This implementation is **completely independent** of OpenMMLab and only
+     requires ``torch`` and ``transformers``.
+
+     Usage::
+
+         from transformers import AutoModel, AutoConfig
+
+         config = AutoConfig.from_pretrained("path/to/sapiens_gait_fixed", trust_remote_code=True)
+         model = AutoModel.from_pretrained("path/to/sapiens_gait_fixed", trust_remote_code=True)
+
+         # pixel_values: (B, 3, H, W), already normalised with config.image_mean / image_std
+         out = model(pixel_values)
+         keypoints = out["keypoints"]  # (B, K, 2) normalised to [0, 1]
+         scores = out["scores"]        # (B, K)
+         heatmaps = out["heatmaps"]    # (B, K, Hm, Wm)
+
+     .. note::
+         Input images should be RGB and normalised with::
+
+             pixel = (pixel - mean) / std
+
+         using ``config.image_mean`` and ``config.image_std`` (in 0-255 scale).
+     """
+
+     config_class = SapiensGaitConfig
+
+     def __init__(self, config: SapiensGaitConfig):
+         super().__init__(config)
+         self.backbone = SapiensTopdownPoseEstimator(config)
+
+         # Register the pos_embed resize hook so from_pretrained handles
+         # checkpoints with a different spatial resolution gracefully.
+         self.backbone.backbone._register_load_state_dict_pre_hook(
+             self.backbone.backbone._prepare_pos_embed
+         )
+
+         # Initialize weights (only for freshly created models; from_pretrained
+         # overwrites with checkpoint values).
+         self.post_init()
+
+     def _init_weights(self, module: nn.Module):
+         """Initialize weights following the original Sapiens convention."""
+         if isinstance(module, nn.Linear):
+             nn.init.xavier_uniform_(module.weight)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Conv2d):
+             nn.init.normal_(module.weight, std=0.001)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.ConvTranspose2d):
+             nn.init.normal_(module.weight, std=0.001)
+         elif isinstance(module, nn.LayerNorm):
+             nn.init.ones_(module.weight)
+             nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.InstanceNorm2d):
+             if module.weight is not None:
+                 nn.init.ones_(module.weight)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+
+     def forward(
+         self,
+         pixel_values: torch.Tensor,
+         return_heatmaps: bool = True,
+     ) -> Dict[str, torch.Tensor]:
+         """
+         Args:
+             pixel_values: (B, 3, H, W) normalised input images.
+             return_heatmaps: whether to include raw heatmaps in the output.
+
+         Returns:
+             dict with keys ``keypoints`` (B, K, 2), ``scores`` (B, K),
+             ``last_hidden_state`` (B, N, C), and optionally ``heatmaps`` (B, K, Hm, Wm).
+         """
+         outputs = self.backbone(pixel_values)
+         heatmaps = outputs["heatmaps"]
+         feats_1d_tokens = outputs["last_hidden_state"]
+
+         # Decode keypoints from the heatmaps via per-channel argmax
+         B, K, H, W = heatmaps.shape
+         heatmaps_flat = heatmaps.view(B, K, -1)
+         max_scores, idx = torch.max(heatmaps_flat, dim=-1)
+
+         preds_x = (idx % W).float() / W
+         preds_y = (idx // W).float() / H
+
+         keypoints = torch.stack([preds_x, preds_y], dim=-1)
+
+         out = {"keypoints": keypoints, "scores": max_scores, "last_hidden_state": feats_1d_tokens}
+         if return_heatmaps:
+             out["heatmaps"] = heatmaps
+
+         return out
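Putting the pieces together, here is a minimal end-to-end inference sketch. The checkpoint path and test image are placeholders, and the use of PIL for loading is an assumption; the preprocessing follows config.image_mean / config.image_std in 0-255 scale, as the model docstring specifies:

    import numpy as np
    import torch
    from PIL import Image
    from transformers import AutoConfig, AutoModel

    path = "path/to/sapiens_gait_fixed"  # placeholder checkpoint directory
    config = AutoConfig.from_pretrained(path, trust_remote_code=True)
    model = AutoModel.from_pretrained(path, trust_remote_code=True).eval()

    # Resize to (H, W) = config.image_size, then normalise in 0-255 scale.
    h, w = config.image_size
    img = Image.open("person.jpg").convert("RGB").resize((w, h))
    arr = np.asarray(img, dtype=np.float32)
    arr = (arr - np.array(config.image_mean)) / np.array(config.image_std)
    pixel_values = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).float()

    with torch.no_grad():
        out = model(pixel_values)

    # Keypoints are normalised to [0, 1]; scale back to input-pixel coordinates.
    kpts_px = out["keypoints"] * torch.tensor([w, h])
    print(kpts_px.shape, out["scores"].shape)  # (1, 133, 2), (1, 133) for this checkpoint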