File size: 2,251 Bytes
ddb382a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash

# --- experiment configuration --------------------------------------------
# Run name plus the config files that later get snapshotted into the log dir.
ckpt_dir="your_project_name"
log_dir="logs/${ckpt_dir}"
dataset_config="ThinkSound/configs/multimodal_dataset_demo.json"
model_config="ThinkSound/configs/model_configs/thinksound.json"
pretransform_ckpt_path="ckpts/vae.ckpt"

# Distributed rendezvous settings (MASTER_ADDR intentionally left disabled).
#export MASTER_ADDR="10.32.3.240"
export MASTER_PORT="9511"
# pip install git+https://github.com/patrick-kidger/torchcubicspline.git

# CLI defaults; overridden by --debug / --node-rank in the parse loop below.
debug_mode="false"
node_rank=0


# Parse command-line flags:
#   --debug          switch to the single-GPU debug configuration
#   --node-rank N    rank of this node within the multi-node job
while (( $# > 0 )); do
    case "$1" in
        --debug)
            debug_mode="true"
            shift
            ;;
        --node-rank)
            node_rank="$2"
            shift
            shift
            ;;
        *)
            echo "Unknown parameter passed: $1"
            exit 1
            ;;
    esac
done

# Expose the distributed-launch topology to the training process.
export NODE_RANK=$node_rank
export WORLD_SIZE=8

# Scratch directory for demo outputs. Use -p so re-running the script is
# idempotent: plain `mkdir demos` fails with "File exists" on every run
# after the first (and is inconsistent with the `mkdir -p` used below).
mkdir -p demos

# For real (non-debug) runs, snapshot the configs and this launch script
# into the log directory so each experiment records exactly how it started.
if [ "$debug_mode" != "true" ]; then
    mkdir -p "$log_dir"

    cp "$dataset_config" "$log_dir/"
    cp "$model_config" "$log_dir/"
    cp "$0" "$log_dir/"
fi



# Debug runs shrink to a single GPU; full runs use all 8 GPUs.
# Either way this script drives exactly one node.
if [[ "$debug_mode" == "true" ]]; then
    num_gpus=1
else
    num_gpus=8
fi
num_nodes=1


# Emit the effective run configuration so it appears at the top of the log.
printf '%s\n' \
    "Training Configuration:" \
    "Checkpoint Directory: $ckpt_dir" \
    "Log Directory: $log_dir" \
    "Dataset Config: $dataset_config" \
    "Model Config: $model_config" \
    "Pretransform Checkpoint Path: $pretransform_ckpt_path" \
    "Num GPUs: $num_gpus" \
    "Num Nodes: $num_nodes" \
    "Batch Size: 32" \
    "Num Workers: 24" \
    "Node Rank: $node_rank"


# Launch training. Debug runs checkpoint more often (2000 vs 4000 steps)
# and stay in the foreground; real runs are detached with nohup and log to
# $log_dir/train.log.
if [[ "$debug_mode" == "true" ]]; then
    checkpoint_every=2000
else
    checkpoint_every=4000
fi

# Shared argument list (order matches the values echoed above).
train_args=(
    --dataset-config "$dataset_config"
    --model-config "$model_config"
    --name "$ckpt_dir"
    --save-dir "logs/"
    --pretransform-ckpt-path "$pretransform_ckpt_path"
    --checkpoint-every "$checkpoint_every"
    --num-gpus "$num_gpus"
    --num-nodes "$num_nodes"
    --batch-size 32
    --num-workers 24
)

if [[ "$debug_mode" == "true" ]]; then
    nohup python train.py "${train_args[@]}"
else
    nohup python train.py "${train_args[@]}" \
        > "$log_dir/train.log" 2>&1 &

    echo "Training started. Logs can be found in $log_dir/train.log"
fi