# mindchain's picture
# Upload folder using huggingface_hub
# 78a0ca9 verified
#!/usr/bin/env python3
import argparse
import subprocess
import os
import sys
def _build_parser():
    """Build the argparse parser with the data/train/submit/benchmark subcommands."""
    parser = argparse.ArgumentParser(description="Qwen Trainer CLI - Unified interface for data gen and fine-tuning.")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")
    # Data Gen Subcommand
    data_parser = subparsers.add_parser("data", help="Generate synthetic agentic data")
    data_parser.add_argument("--task", type=str, required=True, help="Task description")
    data_parser.add_argument("--num", type=int, default=10, help="Number of records")
    data_parser.add_argument("--output", type=str, default="synthetic_data.jsonl", help="Output path")
    data_parser.add_argument("--reasoning", action="store_true", help="Generate reasoning format")
    data_parser.add_argument("--dpo", action="store_true", help="Generate DPO pairs")
    data_parser.add_argument("--max-tokens", type=int, default=4096, help="Max tokens for generation")
    # Train Subcommand
    train_parser = subparsers.add_parser("train", help="Run fine-tuning")
    train_parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-2B", help="Base model")
    train_parser.add_argument("--dataset", type=str, help="Dataset path/name")
    train_parser.add_argument("--method", choices=["sft", "dpo", "grpo"], default="sft", help="Method")
    train_parser.add_argument("--task", type=str, help="Auto-generate data for this task")
    train_parser.add_argument("--num_synthetic", type=int, default=50, help="Number of synthetic records if --task is set")
    train_parser.add_argument("--push", action="store_true", help="Push to Hub")
    train_parser.add_argument("--hub_id", type=str, help="HF Hub ID")
    # Submit Subcommand
    submit_parser = subparsers.add_parser("submit", help="Submit a job to HF or Kaggle")
    submit_parser.add_argument("--platform", choices=["hf", "kaggle"], required=True)
    submit_parser.add_argument("--flavor", type=str, default="a10g-small", help="HF Job flavor")
    submit_parser.add_argument("--image", type=str, default="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel")
    submit_parser.add_argument("--cmd", type=str, help="Full command to run in the job")
    # Benchmark Subcommand
    benchmark_parser = subparsers.add_parser("benchmark", help="Benchmark a model on a dataset")
    benchmark_parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID")
    benchmark_parser.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path")
    benchmark_parser.add_argument("--num", type=int, default=10, help="Number of samples")
    return parser


def _data_interpreter():
    """Return the Python interpreter to use for data generation.

    Prefers the dedicated data-designer venv; falls back to the current
    interpreter so the CLI still works on machines without that venv
    (the original hard-coded path raised FileNotFoundError elsewhere).
    """
    venv_python = os.path.expanduser('~/datadesigner-env-py312/bin/python3')
    return venv_python if os.path.exists(venv_python) else sys.executable


def _run_data(args):
    """Invoke the synthetic agentic data generation script."""
    cmd = [
        _data_interpreter(),
        "skills/qwen-trainer/scripts/agentic_data_gen.py",
        "--task", args.task,
        "--num", str(args.num),
        "--output", args.output,
        "--max-tokens", str(args.max_tokens)
    ]
    # Boolean flags are appended only when set, mirroring the generator's CLI.
    if args.reasoning: cmd.append("--reasoning")
    if args.dpo: cmd.append("--dpo")
    print(f"Running Data Generation: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def _run_train(args):
    """Invoke the fine-tuning script (SFT/DPO/GRPO)."""
    cmd = [
        "python3",
        "skills/qwen-trainer/scripts/train.py",
        "--model", args.model,
        "--method", args.method
    ]
    if args.dataset:
        cmd.extend(["--dataset", args.dataset])
    # --task triggers on-the-fly synthetic data generation inside train.py.
    if args.task:
        cmd.extend(["--use_agentic", "--task", args.task, "--num_synthetic", str(args.num_synthetic)])
    # Pushing requires both the flag and an explicit hub id.
    if args.push and args.hub_id:
        cmd.extend(["--push", "--hub_id", args.hub_id])
    print(f"Running Training: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def _run_submit(args):
    """Invoke the job-submission script for HF Jobs or Kaggle."""
    cmd = [
        "python3",
        "skills/qwen-trainer/scripts/submit.py",
        "--platform", args.platform,
        "--flavor", args.flavor,
        "--image", args.image
    ]
    if args.cmd:
        cmd.extend(["--command", args.cmd])
    print(f"Submitting Job: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def _run_benchmark(args):
    """Invoke the benchmarking script."""
    cmd = [
        "python3",
        "skills/qwen-trainer/scripts/benchmark.py",
        "--model", args.model,
        "--dataset", args.dataset,
        "--num", str(args.num)
    ]
    print(f"Running Benchmark: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def main():
    """Parse CLI arguments and dispatch to the matching subcommand runner.

    With no subcommand, prints the help text. Each runner shells out to a
    script under skills/qwen-trainer/scripts/ with check=True, so a failing
    child process raises subprocess.CalledProcessError.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if args.command == "data":
        _run_data(args)
    elif args.command == "train":
        _run_train(args)
    elif args.command == "submit":
        _run_submit(args)
    elif args.command == "benchmark":
        _run_benchmark(args)
    else:
        parser.print_help()
# Entry point: run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()