Spaces:
Running on Zero
Running on Zero
File size: 2,251 Bytes
ddb382a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | #!/bin/bash
ckpt_dir="your_project_name"
log_dir="logs/$ckpt_dir"
dataset_config="ThinkSound/configs/multimodal_dataset_demo.json"
model_config="ThinkSound/configs/model_configs/thinksound.json"
pretransform_ckpt_path="ckpts/vae.ckpt"
#export MASTER_ADDR="10.32.3.240"
export MASTER_PORT="9511"
# pip install git+https://github.com/patrick-kidger/torchcubicspline.git
debug_mode="false"
node_rank=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--debug) debug_mode="true"; shift ;;
--node-rank) node_rank="$2"; shift; shift ;;
*) echo "Unknown parameter passed: $1"; exit 1 ;;
esac
done
export NODE_RANK=$node_rank
export WORLD_SIZE=8
mkdir demos
if [ "$debug_mode" != "true" ]; then
mkdir -p "$log_dir"
cp "$dataset_config" "$log_dir/"
cp "$model_config" "$log_dir/"
cp "$0" "$log_dir/"
fi
if [ "$debug_mode" == "true" ]; then
num_gpus=1
num_nodes=1
else
num_gpus=8
num_nodes=1
fi
echo "Training Configuration:"
echo "Checkpoint Directory: $ckpt_dir"
echo "Log Directory: $log_dir"
echo "Dataset Config: $dataset_config"
echo "Model Config: $model_config"
echo "Pretransform Checkpoint Path: $pretransform_ckpt_path"
echo "Num GPUs: $num_gpus"
echo "Num Nodes: $num_nodes"
echo "Batch Size: 32"
echo "Num Workers: 24"
echo "Node Rank: $node_rank"
if [ "$debug_mode" == "true" ]; then
nohup python train.py \
--dataset-config "$dataset_config" \
--model-config "$model_config" \
--name "$ckpt_dir" \
--save-dir "logs/" \
--pretransform-ckpt-path "$pretransform_ckpt_path" \
--checkpoint-every 2000 \
--num-gpus "$num_gpus" \
--num-nodes "$num_nodes" \
--batch-size 32 \
--num-workers 24
else
nohup python train.py \
--dataset-config "$dataset_config" \
--model-config "$model_config" \
--name "$ckpt_dir" \
--save-dir "logs/" \
--pretransform-ckpt-path "$pretransform_ckpt_path" \
--checkpoint-every 4000 \
--num-gpus "$num_gpus" \
--num-nodes "$num_nodes" \
--batch-size 32 \
--num-workers 24 \
> "$log_dir/train.log" 2>&1 &
echo "Training started. Logs can be found in $log_dir/train.log"
fi
|