| """GLAP (Generalized Language Audio Pretraining) configuration.""" |
|
|
from typing import Optional

from transformers import PretrainedConfig
|
|
|
|
class GlapConfig(PretrainedConfig):
    """Configuration for a GLAP (Generalized Language Audio Pretraining) model.

    Bundles the hyperparameters of the audio encoder (a patch-based
    transformer over spectrogram input) and the text encoder, plus the size
    of the shared embedding space both towers project into.

    Args:
        audio_embed_dim: Hidden size of the audio transformer.
        audio_depth: Number of transformer layers in the audio encoder.
        audio_num_heads: Attention heads per audio transformer layer.
        patch_size: ``[freq, time]`` patch size for spectrogram patching.
            Defaults to ``[64, 4]`` when ``None``.
        patch_stride: ``[freq, time]`` patch stride. Defaults to ``[64, 4]``
            when ``None``.
        target_length: Number of spectrogram time frames the audio encoder
            expects.
        sample_rate: Expected audio sample rate in Hz.
        text_vocab_size: Vocabulary size of the text encoder.
        text_model_dim: Hidden size of the text transformer.
        text_num_layers: Number of transformer layers in the text encoder.
        text_num_heads: Attention heads per text transformer layer.
        text_ffn_inner_dim: Inner (feed-forward) dimension of the text
            transformer MLP blocks.
        text_max_seq_len: Maximum text sequence length (including special
            positions).
        text_pad_idx: Token id used for padding in the text encoder.
        text_dropout_p: Dropout probability in the text encoder.
        embed_size: Dimension of the shared audio-text embedding space.
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "glap"

    def __init__(
        self,
        # Audio encoder
        audio_embed_dim: int = 768,
        audio_depth: int = 12,
        audio_num_heads: int = 12,
        patch_size: Optional[list] = None,
        patch_stride: Optional[list] = None,
        target_length: int = 1008,
        sample_rate: int = 16000,
        # Text encoder
        text_vocab_size: int = 256206,
        text_model_dim: int = 1024,
        text_num_layers: int = 24,
        text_num_heads: int = 16,
        text_ffn_inner_dim: int = 8192,
        text_max_seq_len: int = 514,
        text_pad_idx: int = 0,
        text_dropout_p: float = 0.1,
        # Shared projection
        embed_size: int = 1024,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_embed_dim = audio_embed_dim
        self.audio_depth = audio_depth
        self.audio_num_heads = audio_num_heads
        # Use an explicit `is None` test so that a caller-supplied empty
        # list is preserved rather than silently replaced by the default
        # (the previous `x or default` idiom conflated [] with None).
        self.patch_size = [64, 4] if patch_size is None else patch_size
        self.patch_stride = [64, 4] if patch_stride is None else patch_stride
        self.target_length = target_length
        self.sample_rate = sample_rate
        self.text_vocab_size = text_vocab_size
        self.text_model_dim = text_model_dim
        self.text_num_layers = text_num_layers
        self.text_num_heads = text_num_heads
        self.text_ffn_inner_dim = text_ffn_inner_dim
        self.text_max_seq_len = text_max_seq_len
        self.text_pad_idx = text_pad_idx
        self.text_dropout_p = text_dropout_p
        self.embed_size = embed_size
|
|