uio2-dbg-preprocessor / preprocessor_config.json

Upload UnifiedIOPreprocessing

36db805 verified about 2 years ago

6.14 kB

	{
	"config": {
	"audio_history_cfg": {
	"attn_qk_norm": true,
	"attn_scaled_cosine": false,
	"clip_attn_logit": null,
	"dropout_broadcast_dims": [
	-2
	],
	"dropout_rate": 0.0,
	"droppath_rate": 0.0,
	"dtype": "float32",
	"emb_dim": 768,
	"float32_attention_logits": true,
	"head_dim": 64,
	"latents_size": 16,
	"layer_drop": 0.0,
	"max_frames": 8,
	"mlp_activations": [
	"gelu"
	],
	"mlp_dim": 2048,
	"num_heads": 12,
	"num_layers": 2,
	"resampler_type": "perceiver",
	"xattention_index": [
	0,
	1
	],
	"xattn_qk_norm": true,
	"xattn_scaled_cosine": false
	},
	"audio_vit_cfg": {
	"default_input_size": [
	256,
	128
	],
	"dropout_broadcast_dims": [],
	"dropout_rate": 0.0,
	"dtype": "float32",
	"emb_dim": 768,
	"float32_attention_logits": true,
	"head_dim": 64,
	"mlp_activations": [
	"gelu"
	],
	"mlp_dim": 3072,
	"num_heads": 12,
	"num_layers": 11,
	"patch_size": 16,
	"pos_patch_size": 16,
	"transpose_input": true,
	"vit_embed": true
	},
	"audio_vqgan": {
	"act_fn": "relu",
	"attention_dropout_rate": 0.0,
	"checkpoint_path": "",
	"decoder_head_dim": 64,
	"decoder_hidden_size": 512,
	"decoder_mlp_dim": 2048,
	"decoder_num_heads": 8,
	"decoder_num_layers": 8,
	"default_input_size": [
	128,
	256
	],
	"dropout_rate": 0.0,
	"droppath_rate": 0.0,
	"dtype": "float32",
	"encoder_head_dim": 64,
	"encoder_hidden_size": 512,
	"encoder_mlp_dim": 2048,
	"encoder_num_heads": 8,
	"encoder_num_layers": 8,
	"output_channel": 1,
	"patch_size": [
	8,
	8
	],
	"proj_dim": 32,
	"use_bias": false,
	"use_decoder": true,
	"vocab_size": 8192
	},
	"freeze_vit": true,
	"image_history_cfg": {
	"attn_qk_norm": true,
	"attn_scaled_cosine": false,
	"clip_attn_logit": null,
	"dropout_broadcast_dims": [
	-2
	],
	"dropout_rate": 0.0,
	"droppath_rate": 0.0,
	"dtype": "float32",
	"emb_dim": 768,
	"float32_attention_logits": true,
	"head_dim": 64,
	"latents_size": 32,
	"layer_drop": 0.0,
	"max_frames": 8,
	"mlp_activations": [
	"gelu"
	],
	"mlp_dim": 2048,
	"num_heads": 12,
	"num_layers": 2,
	"resampler_type": "perceiver",
	"xattention_index": [
	0,
	1
	],
	"xattn_qk_norm": true,
	"xattn_scaled_cosine": false
	},
	"image_vit_cfg": {
	"default_input_size": [
	256,
	256
	],
	"dropout_broadcast_dims": [],
	"dropout_rate": 0.0,
	"dtype": "float32",
	"emb_dim": 768,
	"float32_attention_logits": true,
	"head_dim": 64,
	"mlp_activations": [
	"gelu"
	],
	"mlp_dim": 3072,
	"num_heads": 12,
	"num_layers": 11,
	"num_pos": 197,
	"patch_size": 16,
	"pos_patch_size": 16
	},
	"image_vqgan": {
	"attn_resolutions": [
	32
	],
	"ch": 128,
	"ch_mult": [
	1,
	2,
	2,
	4
	],
	"checkpoint_path": "",
	"default_input_size": [
	256,
	256
	],
	"double_z": false,
	"dropout": 0,
	"dtype": "float32",
	"embed_dim": 4,
	"in_channels": 3,
	"n_embed": 16384,
	"num_res_blocks": 2,
	"out_ch": 3,
	"patch_size": [
	8,
	8
	],
	"resolution": 256,
	"z_channels": 4
	},
	"input_modalities": [
	"text",
	"image",
	"image_history",
	"audio",
	"audio_history"
	],
	"sequence_length": {
	"audio_history_input_samples": 128,
	"audio_input_samples": 128,
	"image_history_input_samples": 256,
	"image_input_samples": 576,
	"is_training": true,
	"num_frames": 4
	},
	"t5_config": {
	"audio_history_pos_emb": "llama_rope",
	"audio_patch_size": 16,
	"audio_pos_emb": "llama_rope",
	"audio_vit_patch_size": 16,
	"audio_vocab_size": 8320,
	"dalle_attn_mask": true,
	"decoder_max_audio_length": 512,
	"decoder_max_image_length": 1024,
	"decoder_max_text_length": 512,
	"decoder_xattention_internval": 1,
	"default_audio_history_vit_size": [
	256,
	128
	],
	"default_audio_size": [
	256,
	128
	],
	"default_audio_vit_size": [
	256,
	128
	],
	"default_image_history_vit_size": [
	256,
	256
	],
	"default_image_size": [
	256,
	256
	],
	"default_image_vit_size": [
	384,
	384
	],
	"dropout_broadcast_dims": [
	-2
	],
	"dropout_rate": 0.0,
	"dtype": "float32",
	"dynamic_unk_mask": true,
	"emb_dim": 1024,
	"encoder_max_audio_length": 128,
	"encoder_max_image_length": 576,
	"encoder_max_text_length": 512,
	"float32_attention_logits": true,
	"head_dim": 64,
	"image_history_pos_emb": "llama_rope",
	"image_patch_size": 16,
	"image_pos_emb": "llama_rope",
	"image_tokenizer_type": "vqgan",
	"image_vit_patch_size": 16,
	"image_vocab_size": 16512,
	"logits_via_embedding": true,
	"mlp_activations": [
	"silu",
	"linear"
	],
	"mlp_dim": 2816,
	"num_decoder_layers": 24,
	"num_encoder_layers": 24,
	"num_heads": 16,
	"qk_norm": true,
	"text_pos_emb": "llama_rope",
	"vocab_size": 33280
	},
	"target_modalities": [
	"text",
	"image",
	"audio"
	],
	"use_audio_history_vit": true,
	"use_audio_vit": true,
	"use_image_history_vit": true,
	"use_image_vit": true
	},
	"sequence_length": {
	"audio_history_input_samples": 128,
	"audio_input_samples": 128,
	"image_history_input_samples": 256,
	"image_input_samples": 576,
	"is_training": true,
	"num_frames": 4
	}
	}