Buckets:
| """ | |
| One-time setup: prepare tokenizer and upload to HF bucket. | |
| Run on a cheap CPU job: | |
| hf jobs uv run \ | |
| --flavor cpu-basic \ | |
| --timeout 20m \ | |
| --namespace mishig \ | |
| --secrets HF_TOKEN \ | |
| setup_cache.py | |
| """ | |
| import os | |
| import subprocess | |
| import sys | |
| import tarfile | |
| import urllib.request | |
REPO_URL = "https://github.com/karpathy/autoresearch/archive/refs/heads/master.tar.gz"
REPO_DIR = "autoresearch-master"
# Single source of truth for the bucket id; BUCKET is the hf:// URL form of it,
# so the upload target and the location reported at the end cannot drift apart.
BUCKET_ID = "mishig/autoresearch-cache"
BUCKET = f"hf://buckets/{BUCKET_ID}"
ARCHIVE_NAME = "repo.tar.gz"
TOKENIZER_DIR = "~/.cache/autoresearch/tokenizer"


def fetch_repo():
    """Download the repo tarball into the current directory, unpack it, and chdir into it."""
    print("Downloading repo...")
    urllib.request.urlretrieve(REPO_URL, ARCHIVE_NAME)
    with tarfile.open(ARCHIVE_NAME) as tar:
        # The "data" extraction filter refuses unsafe members (e.g. absolute
        # paths or links that escape the destination directory).
        tar.extractall(filter="data")
    os.chdir(REPO_DIR)


def prepare_data():
    """Run the repo's prepare.py (download shards + train tokenizer).

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    print("Running prepare.py...")
    subprocess.run(["uv", "run", "prepare.py", "--num-shards", "10"], check=True)


def collect_tokenizer_files(tokenizer_dir):
    """Return ``(local_path, remote_path)`` pairs for every regular file in *tokenizer_dir*.

    Sub-directories are skipped. Entries are sorted by file name so the upload
    order is deterministic. Remote paths are placed under ``tokenizer/``.

    Raises:
        FileNotFoundError: if *tokenizer_dir* does not exist.
    """
    uploads = []
    for fname in sorted(os.listdir(tokenizer_dir)):
        local_path = os.path.join(tokenizer_dir, fname)
        if os.path.isfile(local_path):
            uploads.append((local_path, f"tokenizer/{fname}"))
    return uploads


def upload_tokenizer(tokenizer_dir):
    """Upload every file found in *tokenizer_dir* to the bucket using the Python API.

    Raises:
        SystemExit: if the directory contains no files, so that a run which
            uploaded nothing is not reported as a success.
    """
    print("Uploading tokenizer to bucket...")
    files_to_upload = collect_tokenizer_files(tokenizer_dir)
    if not files_to_upload:
        raise SystemExit(
            f"No tokenizer files found in {tokenizer_dir}; nothing was uploaded."
        )

    # Deferred import: huggingface_hub is only needed for this final step.
    from huggingface_hub import batch_bucket_files

    for local_path, _remote_path in files_to_upload:
        print(f" Uploading {os.path.basename(local_path)}")
    batch_bucket_files(BUCKET_ID, add=files_to_upload)


def main():
    """Run the one-time setup: fetch the repo, prepare data, upload the tokenizer."""
    fetch_repo()
    prepare_data()
    upload_tokenizer(os.path.expanduser(TOKENIZER_DIR))
    print("Done! Tokenizer cached at", f"{BUCKET}/tokenizer")


if __name__ == "__main__":
    main()
Xet Storage Details
- Size: 1.45 kB
- Xet hash: eaf108d826e2ee345643dfa2331e33d2e29afcd642a6878b67ddb081fb29b09b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.