#!/bin/bash
# Worker bootstrap: install the local masp_models package (editable mode)
# before launching distributed training.

# Fail fast if the working tree is missing; otherwise `pip install -e .`
# would run against whatever directory we happen to be in.
cd /opt/tiger/masp_models || exit 1
pip install --upgrade pip
pip install -e .

echo "$PWD"
|
|
# METIS_WORKER_0_PORT may hold a comma-separated list of ports assigned to
# worker 0; the first entry is used as the distributed rendezvous port.
# `read -a` with IFS=',' replaces the old unquoted backtick/echo|tr pipeline,
# which word-split and glob-expanded the value.
IFS=',' read -r -a ports <<< "${METIS_WORKER_0_PORT}"
port=${ports[0]}

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"
|
|
# Verbose NCCL logging to aid debugging of multi-node communication issues.
export NCCL_DEBUG=INFO

# Positional args: $1 = environment tag, $2 = command tag. They are logged
# only; the launch command below does not consume them (NOTE(review):
# confirm whether they were meant to be forwarded). Default to empty so the
# script also works with no arguments under `set -u`.
env="${1:-}"
cmd="${2:-}"
echo "$env"
echo "$cmd"
|
|
# Launch multi-node pretraining via the DeepSpeed launcher.
# Topology (node count, GPUs per node, rank) comes from the Arnold
# scheduler environment; rendezvous uses worker 0's host and the first
# port parsed above. All expansions are quoted so empty/odd values fail
# loudly instead of silently dropping arguments.
deepspeed \
    --num_nodes="$ARNOLD_WORKER_NUM" \
    --num_gpus="$ARNOLD_WORKER_GPU" \
    --master_port="$port" \
    --master_addr "$METIS_WORKER_0_HOST" \
    llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path mistralai/Mistral-7B-Instruct-v0.1 \
    --version plain \
    --dataset_config /mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA/llava/configs/pretrain_data.yaml \
    --vision_tower HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit \
    --adapter_module_name naive_resampler \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_start_end True \
    --mm_use_patch_token False \
    --image_aspect_ratio pad \
    --num_token_per_image 64 \
    --num_query_token 64 \
    --bf16 True \
    --output_dir /mnt/bn/masp-nas/xiangchen/model/masp_models/checkpoints/llava-pretrain-siglip_resampler_64_projector \
    --group_by_modality_length True \
    --num_train_epochs 1 \
    --per_device_train_batch_size 32 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 1e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 4096 \
    --gradient_checkpointing True \
    --dataloader_num_workers 1 \
    --lazy_preprocess True \
    --report_to none