#!/usr/bin/env bash
#
# Launch scripts/vit_triplane_train.py through torchrun with the inference
# flags enabled (--inference True, --save_latent True), using one process and
# CUDA device 0. Evaluation data is read from ./assets/Objaverse/ and outputs
# go under ./logs/vae-reconstruction/objav/vae/infer-latents.
#
# Requires bash: flag groups are stored in arrays so that every argument is
# passed as exactly one word.
#
# Optional environment:
#   HOST_NODE_ADDR  value for torchrun's --rdzv-endpoint. When unset, an empty
#                   endpoint is passed (same as before this rewrite).
#                   NOTE(review): presumably torchrun then falls back to a
#                   local default endpoint - confirm for your torch version.

set -x
# Fail on typos in variable names; every variable used below is defined here
# or given an explicit default.
set -u

# ---------------------------------------------------------------------------
# Loss weights (forwarded as --<name> flags below)
# ---------------------------------------------------------------------------
lpips_lambda=2.0
ssim_lambda=0.
l1_lambda=0.
l2_lambda=1
kl_lambda=1.0e-06

# ---------------------------------------------------------------------------
# Launch / data-loading settings
# ---------------------------------------------------------------------------
NUM_GPUS=1
num_workers=3
batch_size=4
microbatch=4

# ---------------------------------------------------------------------------
# Resolution settings
# ---------------------------------------------------------------------------
image_size=128
image_size_encoder=256
patch_size=14
patch_rendering_resolution=56

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
data_dir=./assets/Objaverse/

DATASET_FLAGS=(
  --data_dir NONE
  --eval_data_dir "${data_dir}"
)

# ---------------------------------------------------------------------------
# Learning rates
# ---------------------------------------------------------------------------
conv_lr=2e-4
lr=1e-4

vit_decoder_lr=${lr}
encoder_lr=${conv_lr}
triplane_decoder_lr=${conv_lr}
super_resolution_lr=${conv_lr}

LR_FLAGS=(
  --encoder_lr "${encoder_lr}"
  --vit_decoder_lr "${vit_decoder_lr}"
  --triplane_decoder_lr "${triplane_decoder_lr}"
  --super_resolution_lr "${super_resolution_lr}"
  --lr "${lr}"
)

# ---------------------------------------------------------------------------
# Core flags
# NOTE(review): --iterations and --save_interval are passed again on the
# torchrun command line below (5000001 / 10000). Both occurrences are kept so
# the argument list is unchanged; with argparse-style parsing the later value
# would presumably win - confirm.
# ---------------------------------------------------------------------------
TRAIN_FLAGS=(
  --iterations 10001 --anneal_lr False
  --batch_size "${batch_size}" --save_interval 10000
  --microbatch "${microbatch}"
  --image_size_encoder "${image_size_encoder}"
  --dino_version mv-sd-dit
  --sr_training False
  --cls_token False
  --weight_decay 0.05
  --image_size "${image_size}"
  --kl_lambda "${kl_lambda}"
  --no_dim_up_mlp True
  --uvit_skip_encoder False
  --fg_mse True
  --bg_lamdba 1.0
  --lpips_delay_iter 100
  --sr_delay_iter 25000
  --kl_anneal True
  --symmetry_loss False
  --vae_p 2
  --plucker_embedding True
  --encoder_in_channels 10
  --arch_dit_decoder DiT2-B/2
  --sd_E_ch 64
  --sd_E_num_res_blocks 1
  --lrm_decoder False
  --resume_checkpoint checkpoints/objaverse/model_rec1680000.pt
)

# ---------------------------------------------------------------------------
# Output directory and architecture flags
# ---------------------------------------------------------------------------
logdir="./logs/vae-reconstruction/objav/vae/infer-latents"

# --logdir is also passed explicitly on the torchrun command line below with
# the same value; both occurrences are kept so the argument list is unchanged.
SR_TRAIN_FLAGS_v1_2XC=(
  --decoder_in_chans 32
  --out_chans 96
  --alpha_lambda 1.0
  --logdir "${logdir}"
  --arch_encoder vits
  --arch_decoder vitb
  --vit_decoder_wd 0.001
  --encoder_weight_decay 0.001
  --color_criterion mse
  --decoder_output_dim 3
  --ae_classname vit.vit_triplane.RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder_S
)

SR_TRAIN_FLAGS=("${SR_TRAIN_FLAGS_v1_2XC[@]}")

# ---------------------------------------------------------------------------
# Prepare the log directory
# ---------------------------------------------------------------------------
# Drop the previous runs/ subdirectory; ${logdir:?} aborts instead of
# expanding to "/runs" should logdir ever end up empty.
rm -rf -- "${logdir:?}/runs"
mkdir -p -- "${logdir}"/
# Keep a copy of this launcher next to the outputs.
cp -- "$0" "${logdir}"/

# ---------------------------------------------------------------------------
# Environment for the launched process
# ---------------------------------------------------------------------------
export LC_ALL=en_US.UTF-8

export OPENCV_IO_ENABLE_OPENEXR=1
export OMP_NUM_THREADS=12
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_IB_GID_INDEX=3
export CUDA_VISIBLE_DEVICES=0

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
torchrun --nproc_per_node="${NUM_GPUS}" \
  --nnodes=1 \
  --rdzv-endpoint="${HOST_NODE_ADDR:-}" \
  --rdzv_backend=c10d \
  scripts/vit_triplane_train.py \
  --trainer_name nv_rec_patch_mvE \
  --num_workers "${num_workers}" \
  "${TRAIN_FLAGS[@]}" \
  "${SR_TRAIN_FLAGS[@]}" \
  "${DATASET_FLAGS[@]}" \
  --lpips_lambda "${lpips_lambda}" \
  --overfitting False \
  --load_pretrain_encoder False \
  --iterations 5000001 \
  --save_interval 10000 \
  --eval_interval 250000000 \
  --decomposed True \
  --logdir "${logdir}" \
  --decoder_load_pretrained False \
  --cfg objverse_tuneray_aug_resolution_64_64_auto \
  --patch_size "${patch_size}" \
  --use_amp False \
  --eval_batch_size 4 \
  "${LR_FLAGS[@]}" \
  --l1_lambda "${l1_lambda}" \
  --l2_lambda "${l2_lambda}" \
  --ssim_lambda "${ssim_lambda}" \
  --depth_smoothness_lambda 0 \
  --use_conf_map False \
  --objv_dataset True \
  --depth_lambda 0.5 \
  --patch_rendering_resolution "${patch_rendering_resolution}" \
  --use_lmdb_compressed False \
  --use_lmdb False \
  --mv_input True \
  --inference True \
  --split_chunk_input False \
  --use_wds False \
  --four_view_for_latent True \
  --append_depth True \
  --save_latent True \
  --shuffle_across_cls True
|