vectorllm-hf / configuration_dinov3_vit.py

Upload folder using huggingface_hub

a04bbbc verified 20 days ago

7 kB

	# --------------------------------------------------------
	# InternVL
	# Copyright (c) 2024 OpenGVLab
	# Licensed under The MIT License [see LICENSE for details]
	# --------------------------------------------------------
	from typing import Optional

	from transformers.configuration_utils import PretrainedConfig
	from transformers.utils import logging

	# Module-level logger obtained through the `transformers.utils.logging` helper.
	logger = logging.get_logger(__name__)


	class DINOv3ViTConfig(PretrainedConfig):
	    r"""
	    Configuration holder for a DINOv3 ViT model ([`DINOv3ViTModel`]).

	    The arguments below describe the model architecture. Building the configuration with every argument left at its
	    default gives a configuration similar to that of the DINOv3
	    [facebook/dinov3-vits16-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-vits16-pretrain-lvd1689m)
	    architecture.

	    This class derives from [`PretrainedConfig`]; configuration objects can be used to control the model outputs.
	    See the [`PretrainedConfig`] documentation for more information.

	    Args:
	        patch_size (`int`, optional, defaults to 16):
	            Size (resolution) of each patch.
	        hidden_size (`int`, optional, defaults to 384):
	            Dimensionality of the encoder layers and of the pooler layer.
	        intermediate_size (`int`, optional, defaults to 1536):
	            Dimensionality of the "intermediate" (i.e., feed-forward) layer.
	        num_hidden_layers (`int`, optional, defaults to 12):
	            How many hidden layers the Transformer encoder has.
	        num_attention_heads (`int`, optional, defaults to 6):
	            How many attention heads each attention layer of the Transformer encoder has.
	        hidden_act (`str` or `function`, optional, defaults to `"gelu"`):
	            Non-linear activation (function or string) used in the encoder and pooler. When given as a string,
	            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
	        attention_dropout (`float`, optional, defaults to 0.0):
	            Dropout ratio applied to the attention probabilities.
	        initializer_range (`float`, optional, defaults to 0.02):
	            Standard deviation of the truncated_normal_initializer used to initialize all weight matrices.
	        layer_norm_eps (`float`, optional, defaults to 1e-05):
	            Epsilon used by the layer normalization layers.
	        rope_theta (`float`, optional, defaults to 100.0):
	            Base period of the RoPE embeddings.
	        image_size (`int`, optional, defaults to 224):
	            Size (resolution) of each image.
	        num_channels (`int`, optional, defaults to 3):
	            Number of input channels.
	        query_bias (`bool`, optional, defaults to `True`):
	            Whether the query projection gets a bias.
	        key_bias (`bool`, optional, defaults to `False`):
	            Whether the key projection gets a bias.
	        value_bias (`bool`, optional, defaults to `True`):
	            Whether the value projection gets a bias.
	        proj_bias (`bool`, optional, defaults to `True`):
	            Whether the output projection gets a bias.
	        mlp_bias (`bool`, optional, defaults to `True`):
	            Whether the MLP layers get a bias.
	        layerscale_value (`float`, optional, defaults to 1.0):
	            Initial value used for layer scale.
	        drop_path_rate (`float`, optional, defaults to 0.0):
	            Per-sample stochastic depth rate (when applied in the main path of residual layers).
	        use_gated_mlp (`bool`, optional, defaults to `False`):
	            Whether to use the SwiGLU feedforward neural network.
	        num_register_tokens (`int`, optional, defaults to 0):
	            Number of register tokens.
	        pos_embed_shift (`float`, optional):
	            If not `None`, position embedding coordinates are randomly shifted in [-shift, shift];
	            training mode only.
	        pos_embed_jitter (`float`, optional):
	            If not `None`, position embedding coordinates are randomly jittered by a log-uniform value in
	            [1/jitter, jitter]; training mode only.
	        pos_embed_rescale (`float`, optional, defaults to 2.0):
	            If not `None`, position embedding coordinates are randomly rescaled by a log-uniform value in
	            [1/rescale, rescale]; training mode only.

	    Example:

	    ```python
	    >>> from transformers import DINOv3ViTConfig, DINOv3ViTModel

	    >>> # Initializing a DINOv3 ViT-small style configuration
	    >>> config = DINOv3ViTConfig()

	    >>> # Initializing a model (with random weights) from the config
	    >>> model = DINOv3ViTModel(config)

	    >>> # Accessing the model config
	    >>> config = model.config
	    ```"""

	    model_type = "dinov3_vit"

	    def __init__(
	        self,
	        patch_size: int = 16,
	        hidden_size: int = 384,
	        intermediate_size: int = 1536,
	        num_hidden_layers: int = 12,
	        num_attention_heads: int = 6,
	        hidden_act: str = "gelu",
	        attention_dropout: float = 0.0,
	        initializer_range: float = 0.02,
	        layer_norm_eps: float = 1e-5,
	        rope_theta: float = 100.0,
	        image_size: int = 224,
	        num_channels: int = 3,
	        query_bias: bool = True,
	        key_bias: bool = False,
	        value_bias: bool = True,
	        proj_bias: bool = True,
	        mlp_bias: bool = True,
	        layerscale_value: float = 1.0,
	        drop_path_rate: float = 0.0,
	        use_gated_mlp: bool = False,
	        num_register_tokens: int = 0,
	        # training-time position-embedding augmentations
	        pos_embed_shift: Optional[float] = None,
	        pos_embed_jitter: Optional[float] = None,
	        pos_embed_rescale: Optional[float] = 2.0,
	        **kwargs,
	    ):
	        # Every keyword not named in the signature is forwarded to the base class.
	        super().__init__(**kwargs)

	        # Input image and patch description.
	        self.image_size = image_size
	        self.patch_size = patch_size
	        self.num_channels = num_channels

	        # Encoder dimensions and activation.
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        self.num_hidden_layers = num_hidden_layers
	        self.num_attention_heads = num_attention_heads
	        self.hidden_act = hidden_act

	        # Dropout, initialization, normalization and residual-path settings.
	        self.attention_dropout = attention_dropout
	        self.initializer_range = initializer_range
	        self.layer_norm_eps = layer_norm_eps
	        self.layerscale_value = layerscale_value
	        self.drop_path_rate = drop_path_rate

	        # Feed-forward variant and rotary position embedding base period.
	        self.use_gated_mlp = use_gated_mlp
	        self.rope_theta = rope_theta

	        # Bias switches for the attention projections and the MLP layers.
	        self.query_bias = query_bias
	        self.key_bias = key_bias
	        self.value_bias = value_bias
	        self.proj_bias = proj_bias
	        self.mlp_bias = mlp_bias

	        self.num_register_tokens = num_register_tokens

	        # Training-time position-embedding augmentations (a value of `None` disables one).
	        self.pos_embed_shift = pos_embed_shift
	        self.pos_embed_jitter = pos_embed_jitter
	        self.pos_embed_rescale = pos_embed_rescale


	# Public names of this module: only the configuration class is exported.
	__all__ = ["DINOv3ViTConfig"]