#!/bin/bash
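# One-click bootstrap: clones the repo, builds the Docker image, runs preflight
# checks, downloads the dataset, and starts training.
# Usage: HF_TOKEN=xxx WANDB_API_KEY=xxx bash bootstrap.sh
# Optional environment overrides: DATASET_DIR, REPO_DIR, NUM_GPUS,
# SKIP_DOWNLOAD=1 (skip the dataset download step).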
# Strict mode: abort on the first failing command and on any unset variable.
set -eu

# require_env NAME
# Exits with status 1, printing an error on stderr, when the environment
# variable called NAME is unset or empty.
require_env() {
  # ${!1:-} is an indirect lookup of the variable named by $1; the empty
  # default keeps the check itself from tripping `set -u`.
  if [ -z "${!1:-}" ]; then
    echo "ERROR: export $1 first" >&2
    exit 1
  fi
}

# Both credentials are forwarded into containers by later steps, so fail fast
# before any clone, build or download work is started.
require_env HF_TOKEN
require_env WANDB_API_KEY

# Tunables; each one can be overridden from the caller's environment.
DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
REPO_DIR="${REPO_DIR:-/workspace/pi05-so100-diverse}"
NUM_GPUS="${NUM_GPUS:-1}"
# Step 1: fetch the project repository, unless a directory already exists at
# $REPO_DIR (an existing directory is reused as-is; it is not updated).
echo "=== Step 1: Clone repo ==="
if [[ -d "$REPO_DIR" ]]; then
  echo "Repo already cloned, skipping"
else
  git clone https://huggingface.co/StrongRoboticsLab/pi05-so100-diverse "$REPO_DIR"
fi
# Step 2: build the training image, using $REPO_DIR as the build context,
# unless it is already present locally.
echo "=== Step 2: Build Docker image ==="
cd "$REPO_DIR"
# `docker image inspect` succeeds only when the exact reference used by the
# `docker run pi05-training` commands (i.e. pi05-training:latest) exists
# locally. Listing the repository with `docker images` would also match images
# that only carry other tags, causing the build to be skipped even though the
# subsequent runs cannot resolve the image.
if docker image inspect pi05-training > /dev/null 2>&1; then
  echo "Image already built, skipping"
else
  docker build -t pi05-training .
fi
# Step 3: run the preflight script inside the freshly built image, with GPU
# access and the Hugging Face token available to it.
echo "=== Step 3: Preflight checks ==="
preflight_opts=(
  --rm
  --runtime=nvidia
  -e "HF_TOKEN=$HF_TOKEN"
)
docker run "${preflight_opts[@]}" pi05-training "bash /workspace/pi05-so100-diverse/preflight.sh"
# Step 4: download the dataset into $DATASET_DIR on the host by running
# huggingface-cli inside the training image. Set SKIP_DOWNLOAD=1 to bypass.
if [ "${SKIP_DOWNLOAD:-0}" != "1" ]; then
  echo "=== Step 4: Download dataset ==="
  mkdir -p -- "$DATASET_DIR"
  # The dataset's parent directory is bind-mounted at the same path inside the
  # container, so --local-dir names the same location on both sides.
  dataset_parent="$(dirname -- "$DATASET_DIR")"
  # DATASET_DIR is handed over as an environment variable and expanded, quoted,
  # by the shell inside the container (the same way HF_TOKEN is), rather than
  # being pasted into the command string where whitespace or shell
  # metacharacters in the path would be re-parsed.
  docker run --rm \
    -e HF_TOKEN="$HF_TOKEN" \
    -e HF_XET_HIGH_PERFORMANCE=1 \
    -e DATASET_DIR="$DATASET_DIR" \
    -v "$dataset_parent:$dataset_parent" \
    pi05-training "huggingface-cli download \
      --repo-type dataset \
      HuggingFaceVLA/community_dataset_v3 \
      --local-dir \"\$DATASET_DIR\" \
      --token \"\$HF_TOKEN\""
else
  echo "=== Step 4: Skipped (SKIP_DOWNLOAD=1) ==="
fi
# Step 5: inside a GPU container, install Python 3.12 and the training
# dependencies, then run train_cloud.sh from the bind-mounted repository.
echo "=== Step 5: Start training ==="

# Bind mounts for the training container: the /ephemeral scratch area and the
# repository checkout (mounted at the path the in-container commands cd into).
train_mounts=(
  -v /ephemeral:/ephemeral
  -v "$REPO_DIR:/workspace/pi05-so100-diverse"
)
# DATASET_DIR is configurable, but only /ephemeral used to be mounted, so a
# dataset stored elsewhere on the host was invisible to the container.
case "$DATASET_DIR/" in
  /ephemeral/*) ;;  # already reachable through the /ephemeral mount
  /*) train_mounts+=(-v "$DATASET_DIR:$DATASET_DIR") ;;  # same path on both sides
  *) ;;             # relative path: left to resolve inside the container
esac

docker run --rm --runtime=nvidia \
  --ipc=host \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  -e HF_TOKEN="$HF_TOKEN" \
  -e WANDB_API_KEY="$WANDB_API_KEY" \
  -e NUM_GPUS="$NUM_GPUS" \
  -e DATASET_DIR="$DATASET_DIR" \
  "${train_mounts[@]}" \
  pi05-training "apt-get update -qq && apt-get install -y -qq software-properties-common > /dev/null \
&& add-apt-repository ppa:deadsnakes/ppa -y > /dev/null && apt-get update -qq \
&& apt-get install -y -qq python3.12 python3.12-venv python3.12-dev > /dev/null \
&& python3.12 -m ensurepip > /dev/null \
&& cd /workspace/pi05-so100-diverse/lerobot && python3.12 -m pip install -q -e '.[pi]' \
&& python3.12 -m pip install -q accelerate wandb huggingface_hub \
&& python3.12 -m pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
&& cd /workspace/pi05-so100-diverse \
&& bash train_cloud.sh"