# py2cpp-training / app.py — HF Space snapshot, commit 9caf5bf ("spaces gpu added")
"""
cloud/huggingface/training_space/app.py
----------------------------------------
HuggingFace Space para fine-tuning de py2cpp con ZeroGPU.
ZeroGPU = GPU gratuita (A100 40GB) para Spaces con plan PRO (~$9/mes).
Si no tienes PRO, funciona en CPU (más lento) o puedes pagar T4 (~$0.60/hr).
Este Space hace TODO el pipeline:
1. Sube tu dataset de pares Python→C++ (o usa los ejemplos bootstrap)
2. Configura el fine-tuning (modelo, LoRA, épocas)
3. Entrena con progreso en tiempo real
4. Guarda el modelo directamente en tu HF Hub
Despliegue:
- Crea un Space en huggingface.co/new-space
- SDK: Gradio | Hardware: ZeroGPU (gratis con PRO) o T4 Small
- Sube todos los archivos de esta carpeta
- Configura los Secrets: HF_TOKEN (para escribir en el Hub)
"""
import spaces
import os
import sys
import json
import time
import threading
import tempfile
import logging
from pathlib import Path
from typing import Optional
import gradio as gr
import torch
# Module-level logger for this Space.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HF_TOKEN comes from the Space's Secrets; empty string when not configured.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Prefer CUDA when a GPU is attached to the Space, else fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Global training state (shared between the Gradio handlers and the worker
# thread; simple dict reads/writes are effectively atomic under the GIL).
_train_state = {
    "running": False,   # True while the worker thread is training
    "log": [],          # accumulated log lines shown in the UI
    "progress": 0.0,    # 0.0–1.0 overall progress for the progress bar
    "done": False,      # set once the worker finishes (success or error)
    "error": None,      # error message string, if training failed
    "model_url": None,  # HF Hub URL once the model has been pushed
}
# ---------------------------------------------------------------------------
# Embedded bootstrap data (the minimum needed to start without your own dataset)
# ---------------------------------------------------------------------------
# Builtin dataset: 100 verified pairs, all compiled with g++ -std=c++17.
# Loaded from builtin_dataset.json (shipped alongside this app.py).
_BUILTIN_PAIRS: list[dict] = []
def _get_builtin_pairs() -> list[dict]:
global _BUILTIN_PAIRS
if _BUILTIN_PAIRS:
return _BUILTIN_PAIRS
builtin_path = Path(__file__).parent / "builtin_dataset.json"
if builtin_path.exists():
with open(builtin_path) as f:
_BUILTIN_PAIRS = json.load(f)
else:
# Fallback si el archivo no está presente
_BUILTIN_PAIRS = [
{"python": "print('Hello, World!')",
"cpp": '#include <iostream>\nusing namespace std;\nint main(){cout<<"Hello, World!"<<endl;return 0;}'},
{"python": "def factorial(n):\n return 1 if n<=1 else n*factorial(n-1)\nprint(factorial(10))",
"cpp": '#include <iostream>\nusing namespace std;\nlong long f(int n){return n<=1?1:n*f(n-1);}\nint main(){cout<<f(10)<<endl;return 0;}'},
]
return _BUILTIN_PAIRS
def _log(msg: str):
    """Record *msg* with an HH:MM:SS timestamp in the shared UI log."""
    _train_state["log"].append(f"[{time.strftime('%H:%M:%S')}] {msg}")
    logger.info(msg)
def _prepare_dataset(jsonl_content: Optional[str], tmp_dir: str):
    """Build train/val JSONL files in *tmp_dir* and return their paths.

    Combines any user-uploaded pairs (one JSON object per line with
    ``"python"`` and ``"cpp"`` keys; malformed lines are skipped) with the
    builtin verified pairs, shuffles, and writes an 85/15 split.

    Returns:
        Tuple ``(train_path, val_path)`` as strings.
    """
    import random

    pairs = []
    if jsonl_content and jsonl_content.strip():
        for line in jsonl_content.strip().splitlines():
            try:
                obj = json.loads(line)
                if "python" in obj and "cpp" in obj:
                    pairs.append(obj)
            except json.JSONDecodeError:
                # Skip malformed lines rather than aborting the whole run.
                pass
        _log(f"Dataset cargado: {len(pairs)} pares del archivo subido")
    builtin = _get_builtin_pairs()
    _log(f"Usando {len(builtin)} pares verificados del dataset builtin")
    pairs.extend(builtin)
    random.shuffle(pairs)
    # 85% train / 15% val; max() guarantees at least one training sample.
    n_train = max(1, int(len(pairs) * 0.85))
    for split, data in [("train", pairs[:n_train]), ("val", pairs[n_train:])]:
        p = Path(tmp_dir) / f"{split}.jsonl"
        # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters,
        # which would break under a non-UTF-8 locale default encoding.
        with open(p, "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
    _log(f"Split: {n_train} train / {len(pairs)-n_train} val")
    return str(Path(tmp_dir) / "train.jsonl"), str(Path(tmp_dir) / "val.jsonl")
def _run_training(
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
    jsonl_content: Optional[str],
):
    """Training worker (runs in a background thread).

    Loads the base seq2seq model, attaches LoRA adapters, fine-tunes on the
    prepared Python→C++ dataset, and optionally pushes the result to the
    HF Hub. Progress, log lines, errors and the final model URL are
    published through the module-level ``_train_state`` dict, which the
    Gradio UI polls.
    """
    # Reset shared state so results from a previous run are not shown.
    _train_state.update({"running": True, "log": [], "progress": 0.0,
                         "done": False, "error": None, "model_url": None})
    try:
        # Heavy imports happen here so the UI starts fast and any
        # missing-dependency failure surfaces as a training error.
        import torch
        from torch.utils.data import Dataset
        from transformers import (
            AutoTokenizer, AutoModelForSeq2SeqLM,
            Seq2SeqTrainer, Seq2SeqTrainingArguments,
            DataCollatorForSeq2Seq, TrainerCallback,
        )
        from peft import LoraConfig, TaskType, get_peft_model
        _log(f"Device: {DEVICE.upper()}")
        if torch.cuda.is_available():
            _log(f"GPU: {torch.cuda.get_device_name(0)}")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # ── Dataset ──────────────────────────────────────────────────
            train_file, val_file = _prepare_dataset(jsonl_content, tmp_dir)
            # ── Tokenizer + model ────────────────────────────────────────
            _log(f"Cargando tokenizer: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            _log(f"Cargando modelo base...")
            # bf16 on GPU keeps memory low; CPU training stays in fp32.
            dtype = torch.bfloat16 if DEVICE == "cuda" else torch.float32
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto" if DEVICE == "cuda" else None,
                trust_remote_code=True,
            )
            # ── LoRA ─────────────────────────────────────────────────────
            # Target modules are the q/k/v/o attention projections
            # (presumably matching the T5-family naming — TODO confirm for
            # every model in the dropdown).
            lora_cfg = LoraConfig(
                r=lora_r, lora_alpha=lora_alpha, lora_dropout=0.05,
                bias="none",
                target_modules=["q", "v", "k", "o"],
                task_type=TaskType.SEQ_2_SEQ_LM,
            )
            model = get_peft_model(model, lora_cfg)
            n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            _log(f"Parámetros entrenables: {n_params:,} ({n_params/1e6:.2f}M)")
            _train_state["progress"] = 0.1
            # ── PyTorch dataset ──────────────────────────────────────────
            class SimpleDS(Dataset):
                # Loads a JSONL file of {"python", "cpp"} pairs and tokenizes
                # each pair as a seq2seq translation example.
                def __init__(self, path, tok, max_len=512):
                    self.tok = tok
                    self.max_len = max_len
                    self.samples = []
                    with open(path) as f:
                        for line in f:
                            if line.strip():
                                self.samples.append(json.loads(line))
                def __len__(self): return len(self.samples)
                def __getitem__(self, i):
                    s = self.samples[i]
                    inp = self.tok(
                        "Translate Python to C++: " + s["python"],
                        max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    lbl = self.tok(
                        s["cpp"], max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    ids = lbl["input_ids"].squeeze()
                    # -100 marks padding positions so the loss ignores them.
                    ids[ids == self.tok.pad_token_id] = -100
                    return {
                        "input_ids": inp["input_ids"].squeeze(),
                        "attention_mask": inp["attention_mask"].squeeze(),
                        "labels": ids,
                    }
            train_ds = SimpleDS(train_file, tokenizer)
            val_ds = SimpleDS(val_file, tokenizer)
            _log(f"Train: {len(train_ds)} | Val: {len(val_ds)}")
            _train_state["progress"] = 0.15
            # ── Progress callback ────────────────────────────────────────
            total_steps = [0]  # one-element list so the closures can mutate it
            class ProgressCallback(TrainerCallback):
                # Maps trainer steps onto the 0.15–0.85 range of the UI bar.
                def on_train_begin(self, args, state, control, **kw):
                    total_steps[0] = state.max_steps
                    _log(f"Iniciando: {state.max_steps} steps totales")
                def on_log(self, args, state, control, logs=None, **kw):
                    if logs and total_steps[0] > 0:
                        frac = state.global_step / total_steps[0]
                        _train_state["progress"] = 0.15 + frac * 0.70
                        loss = logs.get("loss", logs.get("train_loss", "?"))
                        _log(f"Step {state.global_step}/{total_steps[0]} — loss: {loss}")
                def on_epoch_end(self, args, state, control, **kw):
                    ep = int(state.epoch)
                    _log(f"✓ Época {ep}/{num_epochs} completada")
            # ── Training args ────────────────────────────────────────────
            ck_dir = str(Path(tmp_dir) / "checkpoints")
            bf16_ok = DEVICE == "cuda" and torch.cuda.is_bf16_supported()
            # NOTE(review): `evaluation_strategy` was renamed `eval_strategy`
            # in transformers >= 4.46 — pin the transformers version in
            # requirements or update this kwarg if construction fails.
            args = Seq2SeqTrainingArguments(
                output_dir=ck_dir,
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                # Keep the effective batch size near 8 regardless of the slider.
                gradient_accumulation_steps=max(1, 8 // batch_size),
                learning_rate=learning_rate,
                lr_scheduler_type="cosine",
                warmup_ratio=0.05,
                bf16=bf16_ok,
                fp16=(not bf16_ok and DEVICE == "cuda"),
                gradient_checkpointing=(DEVICE == "cuda"),
                save_strategy="epoch",
                evaluation_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                # Roughly five log lines per epoch.
                logging_steps=max(1, len(train_ds) // (batch_size * 5)),
                predict_with_generate=True,
                generation_max_length=256,
                report_to="none",
                push_to_hub=bool(hub_repo_id and HF_TOKEN),
                hub_model_id=hub_repo_id or None,
                hub_token=HF_TOKEN or None,
                hub_strategy="end",
            )
            trainer = Seq2SeqTrainer(
                model=model,
                args=args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(
                    tokenizer, model=model, label_pad_token_id=-100
                ),
                callbacks=[ProgressCallback()],
            )
            _log("🚀 Iniciando fine-tuning...")
            trainer.train()
            _train_state["progress"] = 0.85
            # ── Push to the Hub ──────────────────────────────────────────
            if hub_repo_id and HF_TOKEN:
                _log(f"Subiendo modelo a hub: {hub_repo_id}...")
                trainer.push_to_hub(commit_message="py2cpp fine-tuned via Training Space")
                _train_state["model_url"] = f"https://huggingface.co/{hub_repo_id}"
                _log(f"✓ Modelo disponible: {_train_state['model_url']}")
            else:
                _log("⚠ Sin HF_TOKEN: modelo no subido al Hub")
        _train_state["progress"] = 1.0
        _log("✅ Entrenamiento completado")
    except Exception as e:
        # Surface the error in the UI; the full traceback goes to the app log.
        _train_state["error"] = str(e)
        _log(f"❌ Error: {e}")
        logger.exception("Error en entrenamiento")
    finally:
        # Always flip the flags so the UI never shows a stuck "training" state.
        _train_state["running"] = False
        _train_state["done"] = True
# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
@spaces.GPU
def start_training(
    jsonl_file,
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
):
    """Gradio click handler: read the upload and spawn the training thread.

    Returns ``(status_message, initial_progress)`` for the UI outputs.

    NOTE(review): training runs in a daemon thread, but ``@spaces.GPU``
    only holds the ZeroGPU allocation while THIS function executes — on
    ZeroGPU hardware the GPU may be released before the thread finishes.
    Confirm on a ZeroGPU Space; on persistent GPUs (T4/A10G) it is fine.
    """
    if _train_state["running"]:
        return "⚠ Ya hay un entrenamiento en curso.", 0.0
    jsonl_content = None
    if jsonl_file is not None:
        # gr.File yields a str filepath on Gradio >= 4, or a tempfile
        # wrapper with a .name attribute on older versions — accept both.
        path = jsonl_file if isinstance(jsonl_file, str) else jsonl_file.name
        with open(path, encoding="utf-8") as f:
            jsonl_content = f.read()
    t = threading.Thread(
        target=_run_training,
        args=(model_name, lora_r, lora_alpha, num_epochs,
              batch_size, learning_rate, hub_repo_id, jsonl_content),
        daemon=True,
    )
    t.start()
    return "🚀 Entrenamiento iniciado...", 0.0
def get_status():
    """Polling handler: return (log_text, progress, status_message, model_url)."""
    log_text = "\n".join(_train_state["log"][-60:])
    progress = _train_state["progress"]
    url = _train_state.get("model_url") or ""
    error = _train_state.get("error") or ""
    if _train_state["running"]:
        status = "⏳ Entrenando..."
    elif _train_state["done"] and not error:
        status = "✅ Completado"
    elif error:
        status = f"❌ Error: {error}"
    else:
        status = "🔵 Listo para entrenar"
    return log_text, progress, status, url
# Custom CSS: centered title/subtitle plus a monospaced class for the log box.
CSS = """
#title { text-align:center; font-size:1.6rem; margin-bottom:0.2rem; }
#sub { text-align:center; color:#888; margin-bottom:1.5rem; }
.mono { font-family: 'JetBrains Mono', monospace; font-size:0.8rem; }
"""

# UI layout: configuration panel on the left, live progress on the right.
with gr.Blocks(
    title="py2cpp Training Space",
    theme=gr.themes.Soft(primary_hue="violet"),
    css=CSS,
) as demo:
    gr.Markdown("# ⚡ py2cpp — Training Space", elem_id="title")
    gr.Markdown(
        "Fine-tuning de **CodeT5+** con LoRA para traducción Python → C++ \n"
        f"Device: `{DEVICE.upper()}` {'🟢 GPU disponible' if DEVICE=='cuda' else '🟡 CPU (lento)'}",
        elem_id="sub",
    )
    with gr.Row():
        # ── Left panel: configuration ────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📂 Dataset")
            jsonl_file = gr.File(
                label="Sube tu dataset (.jsonl) — opcional",
                file_types=[".jsonl"],
            )
            gr.Markdown(
                "_Formato: una línea por par `{\"python\": \"...\", \"cpp\": \"...\"}` \n"
                "Si no subes nada, se usan los ejemplos bootstrap incluidos._"
            )
            gr.Markdown("### ⚙️ Configuración del modelo")
            model_name = gr.Dropdown(
                choices=[
                    "Salesforce/codet5p-220m",
                    "Salesforce/codet5p-770m",
                    "Salesforce/codet5-small",
                    "Salesforce/codet5-base",
                ],
                value="Salesforce/codet5p-220m",
                label="Modelo base",
            )
            with gr.Row():
                lora_r = gr.Slider(4, 64, value=16, step=4, label="LoRA r")
                lora_alpha = gr.Slider(8, 128, value=32, step=8, label="LoRA alpha")
            with gr.Row():
                num_epochs = gr.Slider(1, 20, value=5, step=1, label="Épocas")
                batch_size = gr.Slider(1, 16, value=4, step=1, label="Batch size")
            learning_rate = gr.Slider(1e-5, 1e-3, value=3e-4, step=1e-5, label="Learning rate")
            gr.Markdown("### 🤗 Publicar en HuggingFace Hub")
            hub_repo_id = gr.Textbox(
                label="Repo ID (ej: tu-usuario/py2cpp-model)",
                placeholder="tu-usuario/py2cpp-codet5p",
                info="Requiere HF_TOKEN configurado en los Secrets del Space",
            )
            train_btn = gr.Button("🚀 Iniciar entrenamiento", variant="primary", size="lg")
            status_label = gr.Textbox(label="Estado", interactive=False, max_lines=1)
        # ── Right panel: progress ────────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Progreso")
            progress_bar = gr.Slider(
                0, 1, value=0, label="Progreso total",
                interactive=False,
            )
            log_box = gr.Textbox(
                label="Log de entrenamiento",
                lines=24,
                max_lines=24,
                interactive=False,
                elem_classes=["mono"],
            )
            model_url_box = gr.Textbox(
                label="URL del modelo en Hub",
                interactive=False,
                placeholder="Aparecerá cuando termine el entrenamiento...",
            )
    train_btn.click(
        fn=start_training,
        inputs=[jsonl_file, model_name, lora_r, lora_alpha,
                num_epochs, batch_size, learning_rate, hub_repo_id],
        outputs=[status_label, progress_bar],
    )
    # Poll every 3 seconds while training so the UI reflects worker progress.
    timer = gr.Timer(value=3)  # creates a 3-second timer
    timer.tick(
        fn=get_status,
        outputs=[log_box, progress_bar, status_label, model_url_box]
    )
    gr.Markdown(
        "---\n"
        "**Notas:** \n"
        "• ZeroGPU (PRO) = A100 gratis por sesión \n"
        "• T4 Small = ~$0.60/hr, A10G Small = ~$3/hr \n"
        "• Configura `HF_TOKEN` en _Settings → Variables and secrets_ del Space"
    )

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind address/port for HF Spaces containers.
    demo.launch(server_name="0.0.0.0", server_port=7860)