File size: 2,659 Bytes
a8eb6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d47bd9
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
# One-click bootstrap: builds Docker image, downloads dataset, starts training.
# Usage: HF_TOKEN=xxx WANDB_API_KEY=xxx bash bootstrap.sh
#
# Optional env overrides: DATASET_DIR, REPO_DIR, NUM_GPUS, SKIP_DOWNLOAD=1.

# Fail fast: abort on any error, on unset variables, and on failures anywhere
# in a pipeline (plain `set -e` misses failures in all but a pipe's last stage).
set -euo pipefail

# Required credentials. ${VAR:-} keeps the checks themselves from tripping
# `set -u` when the variable is entirely unset; diagnostics go to stderr.
if [ -z "${HF_TOKEN:-}" ]; then echo "ERROR: export HF_TOKEN first" >&2; exit 1; fi
if [ -z "${WANDB_API_KEY:-}" ]; then echo "ERROR: export WANDB_API_KEY first" >&2; exit 1; fi

# Tunables, each overridable from the caller's environment.
DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
REPO_DIR="${REPO_DIR:-/workspace/pi05-so100-diverse}"
NUM_GPUS="${NUM_GPUS:-1}"

echo "=== Step 1: Clone repo ==="
# Clone only on first run; an existing checkout is reused as-is.
if [ -d "$REPO_DIR" ]; then
    echo "Repo already cloned, skipping"
else
    git clone https://huggingface.co/StrongRoboticsLab/pi05-so100-diverse "$REPO_DIR"
fi

echo "=== Step 2: Build Docker image ==="
cd "$REPO_DIR"
# Rebuild only when no image ID is reported for the pi05-training tag.
if docker images pi05-training --format '{{.ID}}' | grep -q .; then
    echo "Image already built, skipping"
else
    docker build -t pi05-training .
fi

echo "=== Step 3: Preflight checks ==="
# Run the repo's preflight script inside the freshly built image.
preflight_args=(
    --rm --runtime=nvidia
    -e HF_TOKEN="$HF_TOKEN"
    pi05-training "bash /workspace/pi05-so100-diverse/preflight.sh"
)
docker run "${preflight_args[@]}"

if [ "${SKIP_DOWNLOAD:-0}" != "1" ]; then
    echo "=== Step 4: Download dataset ==="
    mkdir -p "$DATASET_DIR"
    # Mount the dataset's parent directory at the same path inside the
    # container so the download lands directly on the host filesystem.
    # dirname is quoted (SC2086 fix): the original unquoted $DATASET_DIR
    # would word-split/glob if the path ever contained spaces.
    dataset_parent="$(dirname "$DATASET_DIR")"
    docker run --rm \
        -e HF_TOKEN="$HF_TOKEN" \
        -e HF_XET_HIGH_PERFORMANCE=1 \
        -v "$dataset_parent:$dataset_parent" \
        pi05-training "huggingface-cli download \
            --repo-type dataset \
            HuggingFaceVLA/community_dataset_v3 \
            --local-dir $DATASET_DIR \
            --token \$HF_TOKEN"
else
    echo "=== Step 4: Skipped (SKIP_DOWNLOAD=1) ==="
fi

echo "=== Step 5: Start training ==="
# Launch training in the image. --ipc=host plus the memlock/stack ulimits are
# the usual settings for PyTorch dataloader shared memory and pinned buffers.
# /ephemeral (dataset) and the repo checkout are bind-mounted so the container
# sees the host's copies; NUM_GPUS/DATASET_DIR are forwarded for train_cloud.sh.
# The single string argument relies on the image's entrypoint executing it as a
# shell command line (presumably ENTRYPOINT ["bash","-c"] — confirm in the
# Dockerfile). The embedded chain installs Python 3.12 from the deadsnakes PPA,
# pip-installs lerobot (editable, [pi] extra), accelerate/wandb/huggingface_hub,
# and CUDA 12.4 PyTorch wheels before handing off to train_cloud.sh — the &&
# chain aborts the container at the first failed step.
docker run --rm --runtime=nvidia \
    --ipc=host \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    -e HF_TOKEN="$HF_TOKEN" \
    -e WANDB_API_KEY="$WANDB_API_KEY" \
    -e NUM_GPUS="$NUM_GPUS" \
    -e DATASET_DIR="$DATASET_DIR" \
    -v /ephemeral:/ephemeral \
    -v "$REPO_DIR:/workspace/pi05-so100-diverse" \
    pi05-training "apt-get update -qq && apt-get install -y -qq software-properties-common > /dev/null \
        && add-apt-repository ppa:deadsnakes/ppa -y > /dev/null && apt-get update -qq \
        && apt-get install -y -qq python3.12 python3.12-venv python3.12-dev > /dev/null \
        && python3.12 -m ensurepip > /dev/null \
        && cd /workspace/pi05-so100-diverse/lerobot && python3.12 -m pip install -q -e '.[pi]' \
        && python3.12 -m pip install -q accelerate wandb huggingface_hub \
        && python3.12 -m pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
        && cd /workspace/pi05-so100-diverse \
        && bash train_cloud.sh"