| |
| |
| |
| |
| |
| from typing import Optional |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
# Module-level logger obtained from the transformers logging utilities, named after this module.
logger = logging.get_logger(__name__)
|
|
|
|
class DINOv3ViTConfig(PretrainedConfig):
    r"""
    Configuration class holding the hyperparameters of a [`DINOv3ViTModel`]. A DINOv3 model is instantiated from the
    arguments stored here, which together define the model architecture. Leaving every argument at its default
    yields a configuration similar to that of the DINOv3
    [facebook/dinov3-vits16-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-vits16-pretrain-lvd1689m)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Refer to
    the [`PretrainedConfig`] documentation for more information.

    Args:
        patch_size (`int`, *optional*, defaults to 16):
            Size (resolution) of a single patch.
        hidden_size (`int`, *optional*, defaults to 384):
            Dimensionality of the encoder layers and of the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            How many hidden layers the Transformer encoder has.
        num_attention_heads (`int`, *optional*, defaults to 6):
            How many attention heads each attention layer of the Transformer encoder has.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            Non-linear activation (given as a function or a string) used in the encoder and pooler. When a string
            is passed, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio applied to the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated_normal_initializer used to initialize all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the layer normalization layers.
        rope_theta (`float`, *optional*, defaults to 100.0):
            Base period of the RoPE embeddings.
        image_size (`int`, *optional*, defaults to 224):
            Size (resolution) of each input image.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        query_bias (`bool`, *optional*, defaults to `True`):
            Whether the query projection has a bias term.
        key_bias (`bool`, *optional*, defaults to `False`):
            Whether the key projection has a bias term.
        value_bias (`bool`, *optional*, defaults to `True`):
            Whether the value projection has a bias term.
        proj_bias (`bool`, *optional*, defaults to `True`):
            Whether the output projection has a bias term.
        mlp_bias (`bool`, *optional*, defaults to `True`):
            Whether the MLP layers have bias terms.
        layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value used for layer scale.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Per-sample stochastic depth rate (when applied in the main path of residual layers).
        use_gated_mlp (`bool`, *optional*, defaults to `False`):
            Whether the feedforward network is the SwiGLU variant.
        num_register_tokens (`int`, *optional*, defaults to 0):
            Number of register tokens.
        pos_embed_shift (`float`, *optional*):
            Magnitude of the random shift applied to position embedding coordinates, drawn in [-shift, shift].
            Only applied in training mode, and only if not `None`.
        pos_embed_jitter (`float`, *optional*):
            Magnitude of the random jitter applied to position embedding coordinates, drawn log-uniformly in
            [1/jitter, jitter]. Only applied in training mode, and only if not `None`.
        pos_embed_rescale (`float`, *optional*, defaults to 2.0):
            Magnitude of the random rescaling applied to position embedding coordinates, drawn log-uniformly in
            [1/rescale, rescale]. Only applied in training mode, and only if not `None`.

    Example:

    ```python
    >>> from transformers import DINOv3ViTConfig, DINOv3ViTModel

    >>> # Build a configuration with the default (ViT-small style) values
    >>> config = DINOv3ViTConfig()

    >>> # Build a randomly initialized model from that configuration
    >>> model = DINOv3ViTModel(config)

    >>> # Read the configuration back from the model
    >>> config = model.config
    ```"""

    model_type = "dinov3_vit"

    def __init__(
        self,
        patch_size: int = 16,
        hidden_size: int = 384,
        intermediate_size: int = 1536,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        hidden_act: str = "gelu",
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-5,
        rope_theta: float = 100.0,
        image_size: int = 224,
        num_channels: int = 3,
        query_bias: bool = True,
        key_bias: bool = False,
        value_bias: bool = True,
        proj_bias: bool = True,
        mlp_bias: bool = True,
        layerscale_value: float = 1.0,
        drop_path_rate: float = 0.0,
        use_gated_mlp: bool = False,
        num_register_tokens: int = 0,
        pos_embed_shift: Optional[float] = None,
        pos_embed_jitter: Optional[float] = None,
        pos_embed_rescale: Optional[float] = 2.0,
        **kwargs,
    ):
        # Remaining keyword arguments are handed to the base configuration before
        # any of the model-specific fields below are stored.
        super().__init__(**kwargs)

        # --- Input geometry ---
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels

        # --- Encoder dimensions ---
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # --- Activation, dropout, initialization and normalization ---
        self.hidden_act = hidden_act
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        # --- Residual-branch options and feed-forward variant ---
        self.layerscale_value = layerscale_value
        self.drop_path_rate = drop_path_rate
        self.use_gated_mlp = use_gated_mlp

        # --- Rotary position embedding base period ---
        self.rope_theta = rope_theta

        # --- Bias switches for the attention projections and the MLP ---
        self.query_bias = query_bias
        self.key_bias = key_bias
        self.value_bias = value_bias
        self.proj_bias = proj_bias
        self.mlp_bias = mlp_bias

        # --- Register tokens ---
        self.num_register_tokens = num_register_tokens

        # --- Position-embedding coordinate augmentations (documented above as training-only) ---
        self.pos_embed_shift = pos_embed_shift
        self.pos_embed_jitter = pos_embed_jitter
        self.pos_embed_rescale = pos_embed_rescale
|
|
|
|
# Names exported by a star-import of this module; the config class is the only symbol listed.
__all__ = ["DINOv3ViTConfig"]