mishig/autoresearch-cache / setup_cache.py
download
raw
1.45 kB
"""
One-time setup: prepare tokenizer and upload to HF bucket.
Run on a cheap CPU job:
hf jobs uv run \
--flavor cpu-basic \
--timeout 20m \
--namespace mishig \
--secrets HF_TOKEN \
setup_cache.py
"""
import os
import subprocess
import sys
import tarfile
import urllib.request
REPO_URL = "https://github.com/karpathy/autoresearch/archive/refs/heads/master.tar.gz"
REPO_DIR = "autoresearch-master"
BUCKET = "hf://buckets/mishig/autoresearch-cache"

# Download the repo snapshot, extract it, then drop the tarball so the
# job's limited scratch disk isn't wasted on an archive we no longer need.
print("Downloading repo...")
archive_path = "repo.tar.gz"
urllib.request.urlretrieve(REPO_URL, archive_path)
with tarfile.open(archive_path) as tar:
    # filter="data" (3.12+, backported) rejects unsafe members such as
    # absolute paths or links escaping the extraction tree.
    tar.extractall(filter="data")
os.remove(archive_path)  # fix: archive was previously left behind on disk
os.chdir(REPO_DIR)
# Prepare data: let the repo's own prepare.py fetch the shards and train
# the tokenizer; check=True aborts this job if the subprocess fails.
print("Running prepare.py...")
prepare_cmd = ["uv", "run", "prepare.py", "--num-shards", "10"]
subprocess.run(prepare_cmd, check=True)
# Upload every regular file from the local tokenizer cache to the HF bucket
# under the "tokenizer/" prefix, using the Python API.
tokenizer_dir = os.path.expanduser("~/.cache/autoresearch/tokenizer")
print("Uploading tokenizer to bucket...")
from huggingface_hub import batch_bucket_files

files_to_upload = []
for fname in os.listdir(tokenizer_dir):
    local_path = os.path.join(tokenizer_dir, fname)
    # Skip subdirectories; only plain files are uploaded.
    if os.path.isfile(local_path):
        files_to_upload.append((local_path, f"tokenizer/{fname}"))
        print(f" Uploading {fname}")
if not files_to_upload:
    # fix: fail loudly instead of issuing an empty batch upload when
    # prepare.py produced no tokenizer files.
    sys.exit(f"No tokenizer files found in {tokenizer_dir}")
batch_bucket_files("mishig/autoresearch-cache", add=files_to_upload)
print("Done! Tokenizer cached at", f"{BUCKET}/tokenizer")

Xet Storage Details

Size:
1.45 kB
·
Xet hash:
eaf108d826e2ee345643dfa2331e33d2e29afcd642a6878b67ddb081fb29b09b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.