"""Sapiens Gait Model Configuration (Pure HuggingFace, no OpenMMLab dependency)."""

from transformers import PretrainedConfig

# Pre-defined architecture variants, keyed by variant name.
# Each table row is:
#   (name, embed_dims, num_layers, num_heads, feedforward_channels)
SAPIENS_ARCH_ZOO = {
    name: {
        "embed_dims": embed_dims,
        "num_layers": num_layers,
        "num_heads": num_heads,
        "feedforward_channels": feedforward_channels,
    }
    for name, embed_dims, num_layers, num_heads, feedforward_channels in (
        ("sapiens_0.3b", 1024, 24, 16, 4096),
        ("sapiens_0.6b", 1280, 32, 16, 5120),
        ("sapiens_1b", 1536, 40, 24, 6144),
        ("sapiens_2b", 1920, 48, 32, 7680),
    )
}


class SapiensGaitConfig(PretrainedConfig):
    """Configuration class for Sapiens Gait pose estimation model.

    This configuration stores all architecture parameters needed to build
    the Sapiens model natively in PyTorch/HuggingFace without any OpenMMLab
    dependency.

    Every sequence-valued argument (``image_size``, ``deconv_*``, ``conv_*``,
    ``image_mean``, ``image_std``) is copied into a new ``list`` before being
    stored, so the config never aliases an object owned by the caller and
    tuples are normalised to lists.

    Args:
        arch (str, optional): Architecture variant name. One of
            "sapiens_0.3b", "sapiens_0.6b", "sapiens_1b", "sapiens_2b".
            If provided, overrides embed_dims/num_layers/num_heads/feedforward_channels.
            The name itself is not stored on the config; only the resolved
            values are.
        image_size (list[int]): Input image size as [height, width].
            Defaults to [1024, 768].
        patch_size (int): Patch size for the ViT backbone. Defaults to 16.
        in_channels (int): Number of input image channels. Defaults to 3.
        embed_dims (int): Embedding dimension. Defaults to 1920 (sapiens_2b).
        num_layers (int): Number of transformer layers. Defaults to 48.
        num_heads (int): Number of attention heads. Defaults to 32.
        feedforward_channels (int): Hidden dim of FFN. Defaults to 7680.
        drop_rate (float): Dropout rate. Defaults to 0.0.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.0.
        qkv_bias (bool): Whether to use bias in QKV projection. Defaults to True.
        patch_padding (int): Padding for patch embedding conv. Defaults to 2.
        num_keypoints (int): Number of output keypoints. Defaults to 17.
        deconv_out_channels (list[int]): Output channels for deconv layers.
            Defaults to [768, 768].
        deconv_kernel_sizes (list[int]): Kernel sizes for deconv layers.
            Defaults to [4, 4].
        conv_out_channels (list[int]): Output channels for conv layers in head.
            Defaults to [768, 768].
        conv_kernel_sizes (list[int]): Kernel sizes for conv layers in head.
            Defaults to [1, 1].
        image_mean (list[float]): Normalization mean (RGB).
            Defaults to [123.675, 116.28, 103.53].
        image_std (list[float]): Normalization std (RGB).
            Defaults to [58.395, 57.12, 57.375].
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``arch`` is given but is not a key of ``SAPIENS_ARCH_ZOO``.
    """

    model_type = "sapiens_gait"

    @staticmethod
    def _as_list(value, default):
        """Return a fresh list copy of ``value``, or of ``default`` when ``value`` is None."""
        return list(default if value is None else value)

    def __init__(
        self,
        arch=None,
        image_size=None,
        patch_size=16,
        in_channels=3,
        embed_dims=1920,
        num_layers=48,
        num_heads=32,
        feedforward_channels=7680,
        drop_rate=0.0,
        drop_path_rate=0.0,
        qkv_bias=True,
        patch_padding=2,
        num_keypoints=17,
        deconv_out_channels=None,
        deconv_kernel_sizes=None,
        conv_out_channels=None,
        conv_kernel_sizes=None,
        image_mean=None,
        image_std=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Apply architecture preset if specified; the preset takes precedence
        # over any explicitly passed backbone dimensions.
        if arch is not None:
            if arch not in SAPIENS_ARCH_ZOO:
                raise ValueError(
                    f"Unknown arch '{arch}'. Choose from: {list(SAPIENS_ARCH_ZOO.keys())}"
                )
            preset = SAPIENS_ARCH_ZOO[arch]
            embed_dims = preset["embed_dims"]
            num_layers = preset["num_layers"]
            num_heads = preset["num_heads"]
            feedforward_channels = preset["feedforward_channels"]

        # Backbone (ViT) parameters
        # image_size is copied (not aliased) like every other sequence below.
        self.image_size = self._as_list(image_size, [1024, 768])
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.embed_dims = embed_dims
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.feedforward_channels = feedforward_channels
        self.drop_rate = drop_rate
        self.drop_path_rate = drop_path_rate
        self.qkv_bias = qkv_bias
        self.patch_padding = patch_padding

        # Head parameters
        self.num_keypoints = num_keypoints
        self.deconv_out_channels = self._as_list(deconv_out_channels, [768, 768])
        self.deconv_kernel_sizes = self._as_list(deconv_kernel_sizes, [4, 4])
        self.conv_out_channels = self._as_list(conv_out_channels, [768, 768])
        self.conv_kernel_sizes = self._as_list(conv_kernel_sizes, [1, 1])

        # Preprocessing (for reference; user applies externally)
        self.image_mean = self._as_list(image_mean, [123.675, 116.28, 103.53])
        self.image_std = self._as_list(image_std, [58.395, 57.12, 57.375])