#!/bin/bash
#
# End-to-end launcher for pi05 SO-100 training inside Docker.
#
# Steps: clone the model repo, build the training image, run the preflight
# script, download the dataset (optional), then start training.
#
# Required environment (must be non-empty):
#   HF_TOKEN        Hugging Face token; forwarded to every container.
#   WANDB_API_KEY   Weights & Biases key; forwarded to the training container.
#
# Optional environment:
#   DATASET_DIR     Absolute host path of the dataset
#                   (default: /ephemeral/community_dataset_v3).
#   REPO_DIR        Host path of the repo checkout
#                   (default: /workspace/pi05-so100-diverse).
#   NUM_GPUS        Forwarded to the training container (default: 1).
#   SKIP_DOWNLOAD   Set to 1 to skip the dataset download step.
#
# NOTE(review): every `docker run` below hands the image ONE string argument
# containing shell syntax, so the image's ENTRYPOINT presumably evaluates its
# argument with a shell (e.g. `bash -c`) -- confirm against the Dockerfile.

set -euo pipefail

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ -n "${HF_TOKEN:-}" ]] || die "ERROR: export HF_TOKEN first"
[[ -n "${WANDB_API_KEY:-}" ]] || die "ERROR: export WANDB_API_KEY first"
# Exported so that `docker run -e NAME` (no value) can pick the secrets up
# from the environment instead of exposing them on the docker command line.
export HF_TOKEN WANDB_API_KEY

DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
REPO_DIR="${REPO_DIR:-/workspace/pi05-so100-diverse}"
NUM_GPUS="${NUM_GPUS:-1}"

readonly IMAGE_NAME="pi05-training"
# Path of the repo as seen from inside the containers.
readonly CONTAINER_REPO_DIR="/workspace/pi05-so100-diverse"

# `docker run -v` treats a non-absolute source as a named volume, not a path.
[[ "$DATASET_DIR" == /* ]] \
  || die "ERROR: DATASET_DIR must be an absolute path: $DATASET_DIR"

# Step 1: clone the repo into REPO_DIR unless that directory already exists.
clone_repo() {
  echo "=== Step 1: Clone repo ==="
  if [[ ! -d "$REPO_DIR" ]]; then
    git clone https://huggingface.co/StrongRoboticsLab/pi05-so100-diverse "$REPO_DIR"
  else
    echo "Repo already cloned, skipping"
  fi
}

# Step 2: enter the repo and build the image unless it is already present.
# Globals: REPO_DIR is rewritten to its absolute form for later bind mounts.
build_image() {
  echo "=== Step 2: Build Docker image ==="
  cd "$REPO_DIR"
  REPO_DIR="$PWD"

  # Check the exact tag that `docker run` will resolve, not just any tag of
  # the repository; capturing the output also avoids a pipeline under pipefail.
  local image_id
  image_id="$(docker images --quiet "${IMAGE_NAME}:latest")"
  if [[ -z "$image_id" ]]; then
    docker build -t "$IMAGE_NAME" .
  else
    echo "Image already built, skipping"
  fi
}

# Step 3: run the repo's preflight script inside the image with GPU access.
run_preflight() {
  echo "=== Step 3: Preflight checks ==="
  docker run --rm --runtime=nvidia \
    -e HF_TOKEN \
    "$IMAGE_NAME" "bash ${CONTAINER_REPO_DIR}/preflight.sh"
}

# Step 4: download the dataset into DATASET_DIR unless SKIP_DOWNLOAD=1.
download_dataset() {
  if [[ "${SKIP_DOWNLOAD:-0}" == "1" ]]; then
    echo "=== Step 4: Skipped (SKIP_DOWNLOAD=1) ==="
    return 0
  fi

  echo "=== Step 4: Download dataset ==="
  mkdir -p "$DATASET_DIR"

  # Single-quoted on purpose: $DATASET_DIR and $HF_TOKEN are expanded (and
  # quoted) by the shell inside the container, so paths with spaces survive.
  # shellcheck disable=SC2016
  local download_cmd='huggingface-cli download --repo-type dataset HuggingFaceVLA/community_dataset_v3 --local-dir "$DATASET_DIR" --token "$HF_TOKEN"'

  # Mount the dataset directory itself at the same path inside the container.
  docker run --rm \
    -e HF_TOKEN \
    -e HF_XET_HIGH_PERFORMANCE=1 \
    -e DATASET_DIR="$DATASET_DIR" \
    -v "${DATASET_DIR}:${DATASET_DIR}" \
    "$IMAGE_NAME" "$download_cmd"
}

# Step 5: install Python 3.12 and the training dependencies inside a fresh
# container, then hand over to train_cloud.sh from the mounted repo.
start_training() {
  echo "=== Step 5: Start training ==="

  local -a mounts=(
    -v /ephemeral:/ephemeral
    -v "${REPO_DIR}:${CONTAINER_REPO_DIR}"
  )
  # A DATASET_DIR override outside /ephemeral would otherwise be invisible
  # to the training container.
  case "$DATASET_DIR" in
    /ephemeral | /ephemeral/*) ;;
    *) mounts+=(-v "${DATASET_DIR}:${DATASET_DIR}") ;;
  esac

  local train_cmd="apt-get update -qq && apt-get install -y -qq software-properties-common > /dev/null \
    && add-apt-repository ppa:deadsnakes/ppa -y > /dev/null && apt-get update -qq \
    && apt-get install -y -qq python3.12 python3.12-venv python3.12-dev > /dev/null \
    && python3.12 -m ensurepip > /dev/null \
    && cd ${CONTAINER_REPO_DIR}/lerobot && python3.12 -m pip install -q -e '.[pi]' \
    && python3.12 -m pip install -q accelerate wandb huggingface_hub \
    && python3.12 -m pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
    && cd ${CONTAINER_REPO_DIR} \
    && bash train_cloud.sh"

  docker run --rm --runtime=nvidia \
    --ipc=host \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    -e HF_TOKEN \
    -e WANDB_API_KEY \
    -e NUM_GPUS="$NUM_GPUS" \
    -e DATASET_DIR="$DATASET_DIR" \
    "${mounts[@]}" \
    "$IMAGE_NAME" "$train_cmd"
}

main() {
  clone_repo
  build_image
  run_preflight
  download_dataset
  start_training
}

main "$@"
|