mishig/autoresearch-cache / setup_cache.py
download
raw
1.45 kB
"""
One-time setup: prepare tokenizer and upload to HF bucket.
Run on a cheap CPU job:
hf jobs uv run \
--flavor cpu-basic \
--timeout 20m \
--namespace mishig \
--secrets HF_TOKEN \
setup_cache.py
"""
import os
import subprocess
import sys
import tarfile
import urllib.request
REPO_URL = "https://github.com/karpathy/autoresearch/archive/refs/heads/master.tar.gz"
REPO_DIR = "autoresearch-master"
BUCKET = "hf://buckets/mishig/autoresearch-cache"

# Download the repo snapshot, extract it, then drop the tarball so the
# job's limited scratch disk isn't wasted on an archive we no longer need.
print("Downloading repo...")
archive_path = "repo.tar.gz"
urllib.request.urlretrieve(REPO_URL, archive_path)
with tarfile.open(archive_path) as tar:
    # filter="data" (3.12+, backported) rejects unsafe members such as
    # absolute paths or links escaping the extraction tree.
    tar.extractall(filter="data")
os.remove(archive_path)  # fix: archive was previously left behind on disk
os.chdir(REPO_DIR)
# Prepare data: let the repo's own prepare.py fetch the shards and train
# the tokenizer; check=True aborts this job if the subprocess fails.
print("Running prepare.py...")
prepare_cmd = ["uv", "run", "prepare.py", "--num-shards", "10"]
subprocess.run(prepare_cmd, check=True)
# Upload every regular file from the local tokenizer cache to the HF bucket
# under the "tokenizer/" prefix, using the Python API.
tokenizer_dir = os.path.expanduser("~/.cache/autoresearch/tokenizer")
print("Uploading tokenizer to bucket...")
from huggingface_hub import batch_bucket_files

files_to_upload = []
for fname in os.listdir(tokenizer_dir):
    local_path = os.path.join(tokenizer_dir, fname)
    # Skip subdirectories; only plain files are uploaded.
    if os.path.isfile(local_path):
        files_to_upload.append((local_path, f"tokenizer/{fname}"))
        print(f" Uploading {fname}")
if not files_to_upload:
    # fix: fail loudly instead of issuing an empty batch upload when
    # prepare.py produced no tokenizer files.
    sys.exit(f"No tokenizer files found in {tokenizer_dir}")
batch_bucket_files("mishig/autoresearch-cache", add=files_to_upload)
print("Done! Tokenizer cached at", f"{BUCKET}/tokenizer")

Xet Storage Details

Size:
1.45 kB
·
Xet hash:
eaf108d826e2ee345643dfa2331e33d2e29afcd642a6878b67ddb081fb29b09b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.