"""Sapiens Gait Model Configuration (Pure HuggingFace, no OpenMMLab dependency).""" from transformers import PretrainedConfig # Pre-defined architecture variants SAPIENS_ARCH_ZOO = { "sapiens_0.3b": { "embed_dims": 1024, "num_layers": 24, "num_heads": 16, "feedforward_channels": 4096, }, "sapiens_0.6b": { "embed_dims": 1280, "num_layers": 32, "num_heads": 16, "feedforward_channels": 5120, }, "sapiens_1b": { "embed_dims": 1536, "num_layers": 40, "num_heads": 24, "feedforward_channels": 6144, }, "sapiens_2b": { "embed_dims": 1920, "num_layers": 48, "num_heads": 32, "feedforward_channels": 7680, }, } class SapiensGaitConfig(PretrainedConfig): """Configuration class for Sapiens Gait pose estimation model. This configuration stores all architecture parameters needed to build the Sapiens model natively in PyTorch/HuggingFace without any OpenMMLab dependency. Args: arch (str, optional): Architecture variant name. One of "sapiens_0.3b", "sapiens_0.6b", "sapiens_1b", "sapiens_2b". If provided, overrides embed_dims/num_layers/num_heads/feedforward_channels. image_size (list[int]): Input image size as [height, width]. Defaults to [1024, 768]. patch_size (int): Patch size for the ViT backbone. Defaults to 16. in_channels (int): Number of input image channels. Defaults to 3. embed_dims (int): Embedding dimension. Defaults to 1920 (sapiens_2b). num_layers (int): Number of transformer layers. Defaults to 48. num_heads (int): Number of attention heads. Defaults to 32. feedforward_channels (int): Hidden dim of FFN. Defaults to 7680. drop_rate (float): Dropout rate. Defaults to 0.0. drop_path_rate (float): Stochastic depth rate. Defaults to 0.0. qkv_bias (bool): Whether to use bias in QKV projection. Defaults to True. patch_padding (int): Padding for patch embedding conv. Defaults to 2. num_keypoints (int): Number of output keypoints. Defaults to 17. deconv_out_channels (list[int]): Output channels for deconv layers. deconv_kernel_sizes (list[int]): Kernel sizes for deconv layers. conv_out_channels (list[int]): Output channels for conv layers in head. conv_kernel_sizes (list[int]): Kernel sizes for conv layers in head. image_mean (list[float]): Normalization mean (RGB). image_std (list[float]): Normalization std (RGB). """ model_type = "sapiens_gait" def __init__( self, arch=None, image_size=None, patch_size=16, in_channels=3, embed_dims=1920, num_layers=48, num_heads=32, feedforward_channels=7680, drop_rate=0.0, drop_path_rate=0.0, qkv_bias=True, patch_padding=2, num_keypoints=17, deconv_out_channels=None, deconv_kernel_sizes=None, conv_out_channels=None, conv_kernel_sizes=None, image_mean=None, image_std=None, **kwargs, ): super().__init__(**kwargs) # Apply architecture preset if specified if arch is not None: if arch not in SAPIENS_ARCH_ZOO: raise ValueError( f"Unknown arch '{arch}'. Choose from: {list(SAPIENS_ARCH_ZOO.keys())}" ) preset = SAPIENS_ARCH_ZOO[arch] embed_dims = preset["embed_dims"] num_layers = preset["num_layers"] num_heads = preset["num_heads"] feedforward_channels = preset["feedforward_channels"] # Backbone (ViT) parameters self.image_size = image_size if image_size is not None else [1024, 768] self.patch_size = patch_size self.in_channels = in_channels self.embed_dims = embed_dims self.num_layers = num_layers self.num_heads = num_heads self.feedforward_channels = feedforward_channels self.drop_rate = drop_rate self.drop_path_rate = drop_path_rate self.qkv_bias = qkv_bias self.patch_padding = patch_padding # Head parameters self.num_keypoints = num_keypoints self.deconv_out_channels = list(deconv_out_channels) if deconv_out_channels is not None else [768, 768] self.deconv_kernel_sizes = list(deconv_kernel_sizes) if deconv_kernel_sizes is not None else [4, 4] self.conv_out_channels = list(conv_out_channels) if conv_out_channels is not None else [768, 768] self.conv_kernel_sizes = list(conv_kernel_sizes) if conv_kernel_sizes is not None else [1, 1] # Preprocessing (for reference; user applies externally) self.image_mean = image_mean if image_mean is not None else [123.675, 116.28, 103.53] self.image_std = image_std if image_std is not None else [58.395, 57.12, 57.375]