{
  "_name_or_path": "",
  "architectures": [
    "MultiModalLLM_PT"
  ],
  "auto_map": {
    "AutoConfig": "model_config.VideoChatEConfig",
    "AutoModel": "modeling_videochate.MultiModalLLM_PT"
  },
  "model_config": {
    "bridge": {
      "extra_num_query_token": 64,
      "name": "qformer",
      "num_query_token": 32,
      "qformer_attention_probs_dropout_prob": 0.1,
      "qformer_drop_path_rate": 0.2,
      "qformer_hidden_dropout_prob": 0.1
    },
    "freeze_bridge": false,
    "freeze_llm": false,
    "freeze_vision_encoder": false,
    "llm": {
      "lora_alpha": 32,
      "lora_dropout": 0.1,
      "lora_r": 16,
      "name": "mistral_7b",
      "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
      "use_lora": true,
      "hidden_size": 4096
    },
    "loss": {
      "use_vision_regression_loss": false
    },
    "pretrained_paths": {},
    "vision_encoder": {
      "name": "vit_l14",
      "img_size": 224,
      "patch_size": 16,
      "d_model": 1024,
      "encoder_embed_dim": 1024,
      "encoder_depth": 24,
      "encoder_num_heads": 16,
      "drop_path_rate": 0.0,
      "num_frames": 16,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "checkpoint_num": 0,
      "return_index": -2,
      "vit_add_ln": true,
      "pretrained": null
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.38.0",
  "use_flash_attention": true,
  "use_cache": true,
  "build_decoder": true,
  "hidden_size": 4096
}
|