| { |
| "config": { |
| "audio_history_cfg": { |
| "attn_qk_norm": true, |
| "attn_scaled_cosine": false, |
| "clip_attn_logit": null, |
| "dropout_broadcast_dims": [ |
| -2 |
| ], |
| "dropout_rate": 0.0, |
| "droppath_rate": 0.0, |
| "dtype": "float32", |
| "emb_dim": 768, |
| "float32_attention_logits": true, |
| "head_dim": 64, |
| "latents_size": 16, |
| "layer_drop": 0.0, |
| "max_frames": 8, |
| "mlp_activations": [ |
| "gelu" |
| ], |
| "mlp_dim": 2048, |
| "num_heads": 12, |
| "num_layers": 2, |
| "resampler_type": "perceiver", |
| "xattention_index": [ |
| 0, |
| 1 |
| ], |
| "xattn_qk_norm": true, |
| "xattn_scaled_cosine": false |
| }, |
| "audio_vit_cfg": { |
| "default_input_size": [ |
| 256, |
| 128 |
| ], |
| "dropout_broadcast_dims": [], |
| "dropout_rate": 0.0, |
| "dtype": "float32", |
| "emb_dim": 768, |
| "float32_attention_logits": true, |
| "head_dim": 64, |
| "mlp_activations": [ |
| "gelu" |
| ], |
| "mlp_dim": 3072, |
| "num_heads": 12, |
| "num_layers": 11, |
| "patch_size": 16, |
| "pos_patch_size": 16, |
| "transpose_input": true, |
| "vit_embed": true |
| }, |
| "audio_vqgan": { |
| "act_fn": "relu", |
| "attention_dropout_rate": 0.0, |
| "checkpoint_path": "", |
| "decoder_head_dim": 64, |
| "decoder_hidden_size": 512, |
| "decoder_mlp_dim": 2048, |
| "decoder_num_heads": 8, |
| "decoder_num_layers": 8, |
| "default_input_size": [ |
| 128, |
| 256 |
| ], |
| "dropout_rate": 0.0, |
| "droppath_rate": 0.0, |
| "dtype": "float32", |
| "encoder_head_dim": 64, |
| "encoder_hidden_size": 512, |
| "encoder_mlp_dim": 2048, |
| "encoder_num_heads": 8, |
| "encoder_num_layers": 8, |
| "output_channel": 1, |
| "patch_size": [ |
| 8, |
| 8 |
| ], |
| "proj_dim": 32, |
| "use_bias": false, |
| "use_decoder": true, |
| "vocab_size": 8192 |
| }, |
| "freeze_vit": true, |
| "image_history_cfg": { |
| "attn_qk_norm": true, |
| "attn_scaled_cosine": false, |
| "clip_attn_logit": null, |
| "dropout_broadcast_dims": [ |
| -2 |
| ], |
| "dropout_rate": 0.0, |
| "droppath_rate": 0.0, |
| "dtype": "float32", |
| "emb_dim": 768, |
| "float32_attention_logits": true, |
| "head_dim": 64, |
| "latents_size": 32, |
| "layer_drop": 0.0, |
| "max_frames": 8, |
| "mlp_activations": [ |
| "gelu" |
| ], |
| "mlp_dim": 2048, |
| "num_heads": 12, |
| "num_layers": 2, |
| "resampler_type": "perceiver", |
| "xattention_index": [ |
| 0, |
| 1 |
| ], |
| "xattn_qk_norm": true, |
| "xattn_scaled_cosine": false |
| }, |
| "image_vit_cfg": { |
| "default_input_size": [ |
| 256, |
| 256 |
| ], |
| "dropout_broadcast_dims": [], |
| "dropout_rate": 0.0, |
| "dtype": "float32", |
| "emb_dim": 768, |
| "float32_attention_logits": true, |
| "head_dim": 64, |
| "mlp_activations": [ |
| "gelu" |
| ], |
| "mlp_dim": 3072, |
| "num_heads": 12, |
| "num_layers": 11, |
| "num_pos": 197, |
| "patch_size": 16, |
| "pos_patch_size": 16 |
| }, |
| "image_vqgan": { |
| "attn_resolutions": [ |
| 32 |
| ], |
| "ch": 128, |
| "ch_mult": [ |
| 1, |
| 2, |
| 2, |
| 4 |
| ], |
| "checkpoint_path": "", |
| "default_input_size": [ |
| 256, |
| 256 |
| ], |
| "double_z": false, |
| "dropout": 0, |
| "dtype": "float32", |
| "embed_dim": 4, |
| "in_channels": 3, |
| "n_embed": 16384, |
| "num_res_blocks": 2, |
| "out_ch": 3, |
| "patch_size": [ |
| 8, |
| 8 |
| ], |
| "resolution": 256, |
| "z_channels": 4 |
| }, |
| "input_modalities": [ |
| "text", |
| "image", |
| "image_history", |
| "audio", |
| "audio_history" |
| ], |
| "sequence_length": { |
| "audio_history_input_samples": 128, |
| "audio_input_samples": 128, |
| "image_history_input_samples": 256, |
| "image_input_samples": 576, |
| "is_training": true, |
| "num_frames": 4 |
| }, |
| "t5_config": { |
| "audio_history_pos_emb": "llama_rope", |
| "audio_patch_size": 16, |
| "audio_pos_emb": "llama_rope", |
| "audio_vit_patch_size": 16, |
| "audio_vocab_size": 8320, |
| "dalle_attn_mask": true, |
| "decoder_max_audio_length": 512, |
| "decoder_max_image_length": 1024, |
| "decoder_max_text_length": 512, |
| "decoder_xattention_internval": 1, |
| "default_audio_history_vit_size": [ |
| 256, |
| 128 |
| ], |
| "default_audio_size": [ |
| 256, |
| 128 |
| ], |
| "default_audio_vit_size": [ |
| 256, |
| 128 |
| ], |
| "default_image_history_vit_size": [ |
| 256, |
| 256 |
| ], |
| "default_image_size": [ |
| 256, |
| 256 |
| ], |
| "default_image_vit_size": [ |
| 384, |
| 384 |
| ], |
| "dropout_broadcast_dims": [ |
| -2 |
| ], |
| "dropout_rate": 0.0, |
| "dtype": "float32", |
| "dynamic_unk_mask": true, |
| "emb_dim": 1024, |
| "encoder_max_audio_length": 128, |
| "encoder_max_image_length": 576, |
| "encoder_max_text_length": 512, |
| "float32_attention_logits": true, |
| "head_dim": 64, |
| "image_history_pos_emb": "llama_rope", |
| "image_patch_size": 16, |
| "image_pos_emb": "llama_rope", |
| "image_tokenizer_type": "vqgan", |
| "image_vit_patch_size": 16, |
| "image_vocab_size": 16512, |
| "logits_via_embedding": true, |
| "mlp_activations": [ |
| "silu", |
| "linear" |
| ], |
| "mlp_dim": 2816, |
| "num_decoder_layers": 24, |
| "num_encoder_layers": 24, |
| "num_heads": 16, |
| "qk_norm": true, |
| "text_pos_emb": "llama_rope", |
| "vocab_size": 33280 |
| }, |
| "target_modalities": [ |
| "text", |
| "image", |
| "audio" |
| ], |
| "use_audio_history_vit": true, |
| "use_audio_vit": true, |
| "use_image_history_vit": true, |
| "use_image_vit": true |
| }, |
| "sequence_length": { |
| "audio_history_input_samples": 128, |
| "audio_input_samples": 128, |
| "image_history_input_samples": 256, |
| "image_input_samples": 576, |
| "is_training": true, |
| "num_frames": 4 |
| } |
| } |
|
|