from transformers import PretrainedConfig
from typing import Optional


class LMConfig(PretrainedConfig):
    model_type = "minimind"

    def __init__(
        self,
        dim: int = 512,
        n_layers: int = 8,
        n_heads: int = 8,
        n_kv_heads: int = 2,
        vocab_size: int = 6400,
        hidden_dim: Optional[int] = None,  # FFN width; derived from dim at model init when None
        multiple_of: int = 64,  # the derived FFN width is rounded up to a multiple of this
        norm_eps: float = 1e-5,
        max_seq_len: int = 8192,
        rope_theta: float = 1e6,  # RoPE base frequency; 1e6 is a float, so annotate it as one
        dropout: float = 0.0,
        flash_attn: bool = True,
        ####################################################
        # MoE-specific settings (ignored when use_moe=False)
        ####################################################
        use_moe: bool = False,
        num_experts_per_tok: int = 2,  # experts routed to each token
        n_routed_experts: int = 4,  # total number of routed experts
        n_shared_experts: int = 1,  # number of always-active shared experts
        scoring_func: str = 'softmax',  # routing score function
        aux_loss_alpha: float = 0.1,  # weight of the load-balancing auxiliary loss
        seq_aux: bool = True,  # compute the auxiliary loss at the sequence level
        norm_topk_prob: bool = True,  # renormalize the top-k routing probabilities
        **kwargs,
    ):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.dropout = dropout
        self.flash_attn = flash_attn
        # MoE-specific settings (ignored when use_moe=False)
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        self.norm_topk_prob = norm_topk_prob
        super().__init__(**kwargs)
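

# Minimal usage sketch: build a dense config and a MoE variant.
# The field values below are illustrative examples, not recommended settings.
if __name__ == "__main__":
    dense_cfg = LMConfig(dim=512, n_layers=8, max_seq_len=512)
    moe_cfg = LMConfig(use_moe=True, n_routed_experts=4, num_experts_per_tok=2)
    print(dense_cfg.model_type, dense_cfg.dim, moe_cfg.use_moe)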