| #!/bin/bash |
|
|
| a=$(echo $HOSTNAME | cut -c12-16) |
|
|
| CONFIG=$1 |
| JOB_NAME=${2:-"experiments"} |
| GPUS=${3:-8} |
| |
| partition=${4:-'local'} |
|
|
| GPUS_PER_NODE=${GPUS:-8} |
| if [ $GPUS_PER_NODE -ge 8 ]; then |
| GPUS_PER_NODE=8 |
| fi |
| CPUS_PER_TASK=${CPUS_PER_TASK:-4} |
| SRUN_ARGS=${SRUN_ARGS:-""} |
|
|
| PY_ARGS=${@:5} |
|
|
| WORK_DIR=${CONFIG//configs/work_dirs} |
| WORK_DIR=${WORK_DIR//.yaml//$JOB_NAME} |
| echo $WORK_DIR |
| mkdir -p $WORK_DIR |
| mkdir -p data/temp |
|
|
| |
| export DATA_PATH='/mnt/lustre/share_data/zhujinguo' |
|
|
| srun --partition=${partition} $SRUN_ARGS \ |
| --job-name=${JOB_NAME} -n$GPUS --gres=gpu:${GPUS_PER_NODE} \ |
| --ntasks-per-node=${GPUS_PER_NODE} \ |
| --kill-on-bad-exit=1 --cpus-per-task 12 \ |
| python -u main.py --num-gpus $GPUS \ |
| --config-file ${CONFIG} --init_method slurm --resume \ |
| ${PY_ARGS} OUTPUT_DIR $WORK_DIR |
|
|
|
|