#!/bin/bash
# One-click bootstrap: builds Docker image, downloads dataset, starts training.
#
# Usage:
#   HF_TOKEN=xxx WANDB_API_KEY=xxx bash bootstrap.sh
#
# Required environment:
#   HF_TOKEN        passed (-e) to the preflight, download and training containers
#   WANDB_API_KEY   passed (-e) to the training container
#
# Optional environment:
#   DATASET_DIR     dataset directory (default: /ephemeral/community_dataset_v3)
#   REPO_DIR        repo checkout directory (default: /workspace/pi05-so100-diverse)
#   NUM_GPUS        passed (-e) to the training container (default: 1)
#   SKIP_DOWNLOAD   set to 1 to skip Step 4 (dataset download)
#
# NOTE(review): every `docker run` below hands the image ONE string argument
# that contains shell syntax (`&&`, `$VAR`), so the image's ENTRYPOINT
# presumably runs it through a shell — confirm against the Dockerfile.
set -euo pipefail

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ${VAR:-} keeps these checks working under `set -u` when the variable is unset.
[[ -n "${HF_TOKEN:-}" ]] || die "ERROR: export HF_TOKEN first"
[[ -n "${WANDB_API_KEY:-}" ]] || die "ERROR: export WANDB_API_KEY first"

DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
REPO_DIR="${REPO_DIR:-/workspace/pi05-so100-diverse}"
NUM_GPUS="${NUM_GPUS:-1}"

echo "=== Step 1: Clone repo ==="
if [[ ! -d "$REPO_DIR" ]]; then
  git clone https://huggingface.co/StrongRoboticsLab/pi05-so100-diverse "$REPO_DIR"
else
  echo "Repo already cloned, skipping"
fi

echo "=== Step 2: Build Docker image ==="
cd "$REPO_DIR"
# Capture the ID list instead of piping into `grep -q`: with pipefail, an
# early-exiting grep could make the pipeline look failed and trigger a
# needless rebuild.
image_id="$(docker images pi05-training --format '{{.ID}}')"
if [[ -z "$image_id" ]]; then
  docker build -t pi05-training .
else
  echo "Image already built, skipping"
fi

echo "=== Step 3: Preflight checks ==="
# NOTE(review): no bind mount here, so preflight.sh must already exist at this
# path inside the image — confirm the Dockerfile copies the repo in.
docker run --rm --runtime=nvidia \
  -e HF_TOKEN="$HF_TOKEN" \
  pi05-training "bash /workspace/pi05-so100-diverse/preflight.sh"

if [[ "${SKIP_DOWNLOAD:-0}" != "1" ]]; then
  echo "=== Step 4: Download dataset ==="
  mkdir -p -- "$DATASET_DIR"
  # The parent directory is mounted at the same path inside the container so
  # the download lands in the host's $DATASET_DIR.
  dataset_parent="$(dirname -- "$DATASET_DIR")"
  # DATASET_DIR and HF_TOKEN are expanded *inside* the container (note the
  # escaped \$) and quoted there, so paths containing whitespace survive.
  docker run --rm \
    -e HF_TOKEN="$HF_TOKEN" \
    -e HF_XET_HIGH_PERFORMANCE=1 \
    -e DATASET_DIR="$DATASET_DIR" \
    -v "$dataset_parent:$dataset_parent" \
    pi05-training "huggingface-cli download \
      --repo-type dataset \
      HuggingFaceVLA/community_dataset_v3 \
      --local-dir \"\$DATASET_DIR\" \
      --token \"\$HF_TOKEN\""
else
  echo "=== Step 4: Skipped (SKIP_DOWNLOAD=1) ==="
fi

echo "=== Step 5: Start training ==="
# Bind mounts for the training container. /ephemeral and the repo checkout are
# always mounted. A DATASET_DIR overridden to an absolute path outside
# /ephemeral gets its own mount so the container can see the data downloaded in
# Step 4; paths under /ephemeral are already covered (and mounting /ephemeral
# twice would be rejected by Docker as a duplicate mount point).
training_mounts=(-v /ephemeral:/ephemeral)
case "$DATASET_DIR" in
  /ephemeral | /ephemeral/*) ;;
  /*) training_mounts+=(-v "$DATASET_DIR:$DATASET_DIR") ;;
esac
training_mounts+=(-v "$REPO_DIR:/workspace/pi05-so100-diverse")

# The in-container command installs Python 3.12 from the deadsnakes PPA,
# installs lerobot (editable, with the 'pi' extra) plus accelerate, wandb,
# huggingface_hub and the cu124 PyTorch wheels, then runs train_cloud.sh from
# the mounted repo.
docker run --rm --runtime=nvidia \
  --ipc=host \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  -e HF_TOKEN="$HF_TOKEN" \
  -e WANDB_API_KEY="$WANDB_API_KEY" \
  -e NUM_GPUS="$NUM_GPUS" \
  -e DATASET_DIR="$DATASET_DIR" \
  "${training_mounts[@]}" \
  pi05-training "apt-get update -qq && apt-get install -y -qq software-properties-common > /dev/null \
    && add-apt-repository ppa:deadsnakes/ppa -y > /dev/null && apt-get update -qq \
    && apt-get install -y -qq python3.12 python3.12-venv python3.12-dev > /dev/null \
    && python3.12 -m ensurepip > /dev/null \
    && cd /workspace/pi05-so100-diverse/lerobot && python3.12 -m pip install -q -e '.[pi]' \
    && python3.12 -m pip install -q accelerate wandb huggingface_hub \
    && python3.12 -m pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
    && cd /workspace/pi05-so100-diverse \
    && bash train_cloud.sh"