File size: 2,654 Bytes
78a0ca9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | import os
import argparse
import subprocess
from typing import Literal, Optional
def submit_hf_job(
    image: str = "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    flavor: str = "a10g-small",
    command: str = "python3 train.py",
    timeout: str = "2h",
    secrets: Optional[list] = None
):
    """Submit a job to Hugging Face Jobs by invoking ``hf jobs run``.

    Args:
        image: Container image, passed as the first positional argument
            after the options.
        flavor: Value forwarded to ``--flavor``.
        command: Command to run inside the job, written as a shell-style
            string. It is tokenised with ``shlex.split`` so a quoted
            argument such as ``--task 'Complex Reasoning'`` stays a single
            argv entry instead of being broken on the inner space.
        timeout: Value forwarded to ``--timeout``.
        secrets: Extra secret names; each one is passed with its own
            ``--secrets`` flag after the always-present ``HF_TOKEN``.

    Raises:
        ValueError: If ``command`` contains unbalanced quotes.
        subprocess.CalledProcessError: If the CLI exits with a non-zero
            status (``check=True``).
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level imports do not include shlex.
    import shlex

    print(f"Submitting job to Hugging Face (Flavor: {flavor})")

    cmd = [
        "hf", "jobs", "run",
        "--flavor", flavor,
        "--timeout", timeout,
        "--secrets", "HF_TOKEN",
    ]
    for secret_name in secrets or []:
        cmd.extend(["--secrets", secret_name])

    # Everything after the image is the command executed in the container.
    cmd.append(image)
    cmd.extend(shlex.split(command))

    # Quote each token so the echoed line reflects the real argv boundaries
    # and can be pasted back into a shell unchanged.
    rendered = " ".join(shlex.quote(token) for token in cmd)
    print(f"Executing: {rendered}")

    # A list argv (no shell=True) means no shell re-interprets the tokens.
    subprocess.run(cmd, check=True)
def submit_kaggle_job(
    script_path: str,
    competition: Optional[str] = None,
    dataset_path: Optional[str] = None
):
    """Announce the steps of a Kaggle kernel submission (stub only).

    Nothing is sent to Kaggle: the function prints which script it was
    given and the two steps a real submission would perform, then returns
    ``None``.

    Args:
        script_path: Path of the training script, echoed in the first
            message.
        competition: Accepted but not used by the current body.
        dataset_path: Accepted but not used by the current body.
    """
    # General training on Kaggle goes through a pushed kernel/notebook
    # rather than a competition submission. A real implementation would
    # write kernel-metadata.json next to the script and then run
    # `kaggle kernels push -p <kernel dir>`; until that directory is
    # generated, only the intended steps are reported.
    progress_messages = (
        f"Submitting script {script_path} to Kaggle...",
        "Step 1: Generate kernel-metadata.json",
        "Step 2: kaggle kernels push -p .",
    )
    for message in progress_messages:
        print(message)
if __name__ == "__main__":
    # Default training invocation for --command; the adjacent literals
    # concatenate into one single-line command string.
    default_command = (
        "python3 skills/qwen-trainer/scripts/train.py"
        " --model Qwen/Qwen3.5-7B"
        " --method grpo"
        " --use_agentic"
        " --task 'Complex Reasoning'"
        " --num_synthetic 100"
    )

    cli = argparse.ArgumentParser(
        description="Unified Job Submission for Qwen Trainer"
    )
    cli.add_argument("--platform", required=True, choices=["hf", "kaggle"])
    cli.add_argument(
        "--flavor", type=str, default="a10g-small", help="HF Job flavor"
    )
    cli.add_argument(
        "--image",
        type=str,
        default="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    )
    cli.add_argument("--command", type=str, default=default_command)
    cli.add_argument("--timeout", type=str, default="2h")
    options = cli.parse_args()

    if options.platform == "kaggle":
        # The Kaggle path ignores the HF-specific options and always
        # targets the trainer script.
        submit_kaggle_job("skills/qwen-trainer/scripts/train.py")
    elif options.platform == "hf":
        submit_hf_job(
            image=options.image,
            flavor=options.flavor,
            command=options.command,
            timeout=options.timeout,
        )
|