Spaces:
Running on Zero
Running on Zero
| """ | |
| cloud/huggingface/training_space/app.py | |
| ---------------------------------------- | |
| HuggingFace Space para fine-tuning de py2cpp con ZeroGPU. | |
| ZeroGPU = GPU gratuita (A100 40GB) para Spaces con plan PRO (~$9/mes). | |
| Si no tienes PRO, funciona en CPU (más lento) o puedes pagar T4 (~$0.60/hr). | |
| Este Space hace TODO el pipeline: | |
| 1. Sube tu dataset de pares Python→C++ (o usa los ejemplos bootstrap) | |
| 2. Configura el fine-tuning (modelo, LoRA, épocas) | |
| 3. Entrena con progreso en tiempo real | |
| 4. Guarda el modelo directamente en tu HF Hub | |
| Despliegue: | |
| - Crea un Space en huggingface.co/new-space | |
| - SDK: Gradio | Hardware: ZeroGPU (gratis con PRO) o T4 Small | |
| - Sube todos los archivos de esta carpeta | |
| - Configura los Secrets: HF_TOKEN (para escribir en el Hub) | |
| """ | |
| import spaces | |
| import os | |
| import sys | |
| import json | |
| import time | |
| import threading | |
| import tempfile | |
| import logging | |
| from pathlib import Path | |
| from typing import Optional | |
| import gradio as gr | |
| import torch | |
# Module-level logging setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub write token; empty string when the Space has no HF_TOKEN secret configured.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global training state (shared between the Gradio thread and the training thread)
_train_state = {
    "running": False,   # True while a training thread is active
    "log": [],          # accumulated log lines shown in the UI
    "progress": 0.0,    # 0.0-1.0 fraction for the progress bar
    "done": False,      # set once a run finishes (success or error)
    "error": None,      # error message string, or None
    "model_url": None,  # Hub URL of the pushed model, if any
}
| # --------------------------------------------------------------------------- | |
| # Datos bootstrap embebidos (mínimo para arrancar sin dataset propio) | |
| # --------------------------------------------------------------------------- | |
| # Dataset builtin: 100 pares verificados, todos compilados con g++ -std=c++17 | |
| # Se cargan desde builtin_dataset.json (incluido junto a este app.py) | |
| _BUILTIN_PAIRS: list[dict] = [] | |
| def _get_builtin_pairs() -> list[dict]: | |
| global _BUILTIN_PAIRS | |
| if _BUILTIN_PAIRS: | |
| return _BUILTIN_PAIRS | |
| builtin_path = Path(__file__).parent / "builtin_dataset.json" | |
| if builtin_path.exists(): | |
| with open(builtin_path) as f: | |
| _BUILTIN_PAIRS = json.load(f) | |
| else: | |
| # Fallback si el archivo no está presente | |
| _BUILTIN_PAIRS = [ | |
| {"python": "print('Hello, World!')", | |
| "cpp": '#include <iostream>\nusing namespace std;\nint main(){cout<<"Hello, World!"<<endl;return 0;}'}, | |
| {"python": "def factorial(n):\n return 1 if n<=1 else n*factorial(n-1)\nprint(factorial(10))", | |
| "cpp": '#include <iostream>\nusing namespace std;\nlong long f(int n){return n<=1?1:n*f(n-1);}\nint main(){cout<<f(10)<<endl;return 0;}'}, | |
| ] | |
| return _BUILTIN_PAIRS | |
def _log(msg: str):
    """Append a timestamped line to the shared UI log and mirror it to the logger."""
    stamped = "[{}] {}".format(time.strftime("%H:%M:%S"), msg)
    _train_state["log"].append(stamped)
    logger.info(msg)
def _prepare_dataset(jsonl_content: Optional[str], tmp_dir: str):
    """Build train/val JSONL splits inside *tmp_dir* and return their paths.

    Combines the user-uploaded JSONL content (if any) with the built-in
    verified pairs, shuffles, and writes an 85/15 train/val split.

    Args:
        jsonl_content: raw text of an uploaded .jsonl file, or None.
        tmp_dir: directory where train.jsonl / val.jsonl are written.

    Returns:
        Tuple of (train_path, val_path) as strings.
    """
    pairs = []
    if jsonl_content and jsonl_content.strip():
        for line in jsonl_content.strip().splitlines():
            try:
                obj = json.loads(line)
                # Only keep well-formed dict pairs; the isinstance guard stops
                # JSON strings/lists from slipping through the `in` check.
                if isinstance(obj, dict) and "python" in obj and "cpp" in obj:
                    pairs.append(obj)
            except json.JSONDecodeError:
                pass  # tolerate malformed lines in user uploads (best-effort)
        _log(f"Dataset cargado: {len(pairs)} pares del archivo subido")
    builtin = _get_builtin_pairs()
    _log(f"Usando {len(builtin)} pares verificados del dataset builtin")
    pairs.extend(builtin)
    import random
    random.shuffle(pairs)
    # Guarantee at least one training sample even for tiny datasets.
    n_train = max(1, int(len(pairs) * 0.85))
    for split, data in [("train", pairs[:n_train]), ("val", pairs[n_train:])]:
        p = Path(tmp_dir) / f"{split}.jsonl"
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which would crash on platforms whose default encoding is not UTF-8.
        with open(p, "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
    _log(f"Split: {n_train} train / {len(pairs)-n_train} val")
    return str(Path(tmp_dir) / "train.jsonl"), str(Path(tmp_dir) / "val.jsonl")
def _run_training(
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
    jsonl_content: Optional[str],
):
    """Training-thread body: LoRA fine-tune *model_name* and report progress.

    Runs the full pipeline inside a temporary directory: dataset split,
    tokenizer/model load, LoRA wrapping, Seq2Seq training, and optional
    push to the HF Hub. All progress is communicated through the shared
    ``_train_state`` dict, which the Gradio polling callback reads.

    NOTE(review): ``spaces`` is imported at module level but ``@spaces.GPU``
    is never applied to any function; on ZeroGPU hardware this thread will
    presumably not be granted a GPU — confirm against the ZeroGPU docs.
    """
    # Reset the shared state for a fresh run.
    _train_state.update({"running": True, "log": [], "progress": 0.0,
                         "done": False, "error": None, "model_url": None})
    try:
        # Heavy ML imports are deferred so the UI process starts quickly.
        import torch
        from torch.utils.data import Dataset
        from transformers import (
            AutoTokenizer, AutoModelForSeq2SeqLM,
            Seq2SeqTrainer, Seq2SeqTrainingArguments,
            DataCollatorForSeq2Seq, TrainerCallback,
        )
        from peft import LoraConfig, TaskType, get_peft_model
        _log(f"Device: {DEVICE.upper()}")
        if torch.cuda.is_available():
            _log(f"GPU: {torch.cuda.get_device_name(0)}")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # ── Dataset ──────────────────────────────────────────────────
            train_file, val_file = _prepare_dataset(jsonl_content, tmp_dir)
            # ── Tokenizer + model ────────────────────────────────────────
            _log(f"Cargando tokenizer: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            if tokenizer.pad_token is None:
                # Some checkpoints ship without a pad token; reuse EOS.
                tokenizer.pad_token = tokenizer.eos_token
            _log(f"Cargando modelo base...")
            # bf16 on GPU keeps memory low; fp32 on CPU for compatibility.
            dtype = torch.bfloat16 if DEVICE == "cuda" else torch.float32
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto" if DEVICE == "cuda" else None,
                trust_remote_code=True,
            )
            # ── LoRA ─────────────────────────────────────────────────────
            # target_modules q/v/k/o: presumably the T5-style attention
            # projection names (CodeT5/CodeT5+ are T5-based) — TODO confirm.
            lora_cfg = LoraConfig(
                r=lora_r, lora_alpha=lora_alpha, lora_dropout=0.05,
                bias="none",
                target_modules=["q", "v", "k", "o"],
                task_type=TaskType.SEQ_2_SEQ_LM,
            )
            model = get_peft_model(model, lora_cfg)
            n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            _log(f"Parámetros entrenables: {n_params:,} ({n_params/1e6:.2f}M)")
            _train_state["progress"] = 0.1
            # ── PyTorch dataset ──────────────────────────────────────────
            class SimpleDS(Dataset):
                # Loads a JSONL file of {"python": ..., "cpp": ...} pairs
                # and tokenizes each pair on access.
                def __init__(self, path, tok, max_len=512):
                    self.tok = tok
                    self.max_len = max_len
                    self.samples = []
                    with open(path) as f:
                        for line in f:
                            if line.strip():
                                self.samples.append(json.loads(line))
                def __len__(self): return len(self.samples)
                def __getitem__(self, i):
                    s = self.samples[i]
                    inp = self.tok(
                        "Translate Python to C++: " + s["python"],
                        max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    lbl = self.tok(
                        s["cpp"], max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    ids = lbl["input_ids"].squeeze()
                    # -100 makes the loss ignore padding positions.
                    ids[ids == self.tok.pad_token_id] = -100
                    return {
                        "input_ids": inp["input_ids"].squeeze(),
                        "attention_mask": inp["attention_mask"].squeeze(),
                        "labels": ids,
                    }
            train_ds = SimpleDS(train_file, tokenizer)
            val_ds = SimpleDS(val_file, tokenizer)
            _log(f"Train: {len(train_ds)} | Val: {len(val_ds)}")
            _train_state["progress"] = 0.15
            # ── Progress callback ────────────────────────────────────────
            # Training maps onto the 0.15-0.85 range of the progress bar.
            total_steps = [0]
            class ProgressCallback(TrainerCallback):
                def on_train_begin(self, args, state, control, **kw):
                    total_steps[0] = state.max_steps
                    _log(f"Iniciando: {state.max_steps} steps totales")
                def on_log(self, args, state, control, logs=None, **kw):
                    if logs and total_steps[0] > 0:
                        frac = state.global_step / total_steps[0]
                        _train_state["progress"] = 0.15 + frac * 0.70
                        loss = logs.get("loss", logs.get("train_loss", "?"))
                        _log(f"Step {state.global_step}/{total_steps[0]} — loss: {loss}")
                def on_epoch_end(self, args, state, control, **kw):
                    ep = int(state.epoch)
                    _log(f"✓ Época {ep}/{num_epochs} completada")
            # ── Training args ────────────────────────────────────────────
            ck_dir = str(Path(tmp_dir) / "checkpoints")
            bf16_ok = DEVICE == "cuda" and torch.cuda.is_bf16_supported()
            # NOTE(review): `evaluation_strategy` was renamed `eval_strategy`
            # in recent transformers releases — confirm the pinned version.
            args = Seq2SeqTrainingArguments(
                output_dir=ck_dir,
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                # Keep the effective batch size near 8 regardless of batch_size.
                gradient_accumulation_steps=max(1, 8 // batch_size),
                learning_rate=learning_rate,
                lr_scheduler_type="cosine",
                warmup_ratio=0.05,
                bf16=bf16_ok,
                fp16=(not bf16_ok and DEVICE == "cuda"),
                gradient_checkpointing=(DEVICE == "cuda"),
                save_strategy="epoch",
                evaluation_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                # Roughly 5 log events per epoch.
                logging_steps=max(1, len(train_ds) // (batch_size * 5)),
                predict_with_generate=True,
                generation_max_length=256,
                report_to="none",
                # Push only when both a repo id and a token are available.
                push_to_hub=bool(hub_repo_id and HF_TOKEN),
                hub_model_id=hub_repo_id or None,
                hub_token=HF_TOKEN or None,
                hub_strategy="end",
            )
            trainer = Seq2SeqTrainer(
                model=model,
                args=args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(
                    tokenizer, model=model, label_pad_token_id=-100
                ),
                callbacks=[ProgressCallback()],
            )
            _log("🚀 Iniciando fine-tuning...")
            trainer.train()
            _train_state["progress"] = 0.85
            # ── Push to the Hub ──────────────────────────────────────────
            if hub_repo_id and HF_TOKEN:
                _log(f"Subiendo modelo a hub: {hub_repo_id}...")
                trainer.push_to_hub(commit_message="py2cpp fine-tuned via Training Space")
                _train_state["model_url"] = f"https://huggingface.co/{hub_repo_id}"
                _log(f"✓ Modelo disponible: {_train_state['model_url']}")
            else:
                _log("⚠ Sin HF_TOKEN: modelo no subido al Hub")
            _train_state["progress"] = 1.0
            _log("✅ Entrenamiento completado")
    except Exception as e:
        # Surface the error to the UI instead of killing the thread silently.
        _train_state["error"] = str(e)
        _log(f"❌ Error: {e}")
        logger.exception("Error en entrenamiento")
    finally:
        _train_state["running"] = False
        _train_state["done"] = True
# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
def start_training(
    jsonl_file,
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
):
    """Gradio handler: launch the background training thread if idle.

    Returns:
        (status_message, progress) for the status textbox and progress bar.
    """
    if _train_state["running"]:
        return "⚠ Ya hay un entrenamiento en curso.", 0.0
    jsonl_content = None
    if jsonl_file is not None:
        # gr.File yields a filepath string on Gradio 4+ (type="filepath"),
        # but a tempfile wrapper with a .name attribute on older versions;
        # handle both so the upload works either way.
        path = jsonl_file if isinstance(jsonl_file, str) else jsonl_file.name
        with open(path, encoding="utf-8") as f:
            jsonl_content = f.read()
    # Daemon thread: training must not block the Gradio event loop, and
    # should die with the process on Space restart.
    t = threading.Thread(
        target=_run_training,
        args=(model_name, lora_r, lora_alpha, num_epochs,
              batch_size, learning_rate, hub_repo_id, jsonl_content),
        daemon=True,
    )
    t.start()
    return "🚀 Entrenamiento iniciado...", 0.0
def get_status():
    """Poll the shared training state; returns (log, progress, status, url)."""
    recent_log = "\n".join(_train_state["log"][-60:])
    url = _train_state.get("model_url") or ""
    error = _train_state.get("error") or ""
    # Guard-clause chain instead of nested conditional expressions.
    if _train_state["running"]:
        status = "⏳ Entrenando..."
    elif error:
        status = f"❌ Error: {error}"
    elif _train_state["done"]:
        status = "✅ Completado"
    else:
        status = "🔵 Listo para entrenar"
    return recent_log, _train_state["progress"], status, url
| CSS = """ | |
| #title { text-align:center; font-size:1.6rem; margin-bottom:0.2rem; } | |
| #sub { text-align:center; color:#888; margin-bottom:1.5rem; } | |
| .mono { font-family: 'JetBrains Mono', monospace; font-size:0.8rem; } | |
| """ | |
# UI layout: left column = dataset + hyperparameter config, right column =
# live progress; a timer polls the shared training state every 3 seconds.
with gr.Blocks(
    title="py2cpp Training Space",
    theme=gr.themes.Soft(primary_hue="violet"),
    css=CSS,
) as demo:
    gr.Markdown("# ⚡ py2cpp — Training Space", elem_id="title")
    gr.Markdown(
        "Fine-tuning de **CodeT5+** con LoRA para traducción Python → C++ \n"
        f"Device: `{DEVICE.upper()}` {'🟢 GPU disponible' if DEVICE=='cuda' else '🟡 CPU (lento)'}",
        elem_id="sub",
    )
    with gr.Row():
        # ── Left panel: configuration ──────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📂 Dataset")
            jsonl_file = gr.File(
                label="Sube tu dataset (.jsonl) — opcional",
                file_types=[".jsonl"],
            )
            gr.Markdown(
                "_Formato: una línea por par `{\"python\": \"...\", \"cpp\": \"...\"}` \n"
                "Si no subes nada, se usan los ejemplos bootstrap incluidos._"
            )
            gr.Markdown("### ⚙️ Configuración del modelo")
            model_name = gr.Dropdown(
                choices=[
                    "Salesforce/codet5p-220m",
                    "Salesforce/codet5p-770m",
                    "Salesforce/codet5-small",
                    "Salesforce/codet5-base",
                ],
                value="Salesforce/codet5p-220m",
                label="Modelo base",
            )
            with gr.Row():
                lora_r = gr.Slider(4, 64, value=16, step=4, label="LoRA r")
                lora_alpha = gr.Slider(8, 128, value=32, step=8, label="LoRA alpha")
            with gr.Row():
                num_epochs = gr.Slider(1, 20, value=5, step=1, label="Épocas")
                batch_size = gr.Slider(1, 16, value=4, step=1, label="Batch size")
            learning_rate = gr.Slider(1e-5, 1e-3, value=3e-4, step=1e-5, label="Learning rate")
            gr.Markdown("### 🤗 Publicar en HuggingFace Hub")
            hub_repo_id = gr.Textbox(
                label="Repo ID (ej: tu-usuario/py2cpp-model)",
                placeholder="tu-usuario/py2cpp-codet5p",
                info="Requiere HF_TOKEN configurado en los Secrets del Space",
            )
            train_btn = gr.Button("🚀 Iniciar entrenamiento", variant="primary", size="lg")
            status_label = gr.Textbox(label="Estado", interactive=False, max_lines=1)
        # ── Right panel: progress ──────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Progreso")
            # Read-only slider doubles as a progress bar (0-1 fraction).
            progress_bar = gr.Slider(
                0, 1, value=0, label="Progreso total",
                interactive=False,
            )
            log_box = gr.Textbox(
                label="Log de entrenamiento",
                lines=24,
                max_lines=24,
                interactive=False,
                elem_classes=["mono"],
            )
            model_url_box = gr.Textbox(
                label="URL del modelo en Hub",
                interactive=False,
                placeholder="Aparecerá cuando termine el entrenamiento...",
            )
    train_btn.click(
        fn=start_training,
        inputs=[jsonl_file, model_name, lora_r, lora_alpha,
                num_epochs, batch_size, learning_rate, hub_repo_id],
        outputs=[status_label, progress_bar],
    )
    # Poll every 3 seconds to refresh the log/progress while training runs.
    timer = gr.Timer(value=3)  # 3-second tick interval
    timer.tick(
        fn=get_status,
        outputs=[log_box, progress_bar, status_label, model_url_box]
    )
    gr.Markdown(
        "---\n"
        "**Notas:** \n"
        "• ZeroGPU (PRO) = A100 gratis por sesión \n"
        "• T4 Small = ~$0.60/hr, A10G Small = ~$3/hr \n"
        "• Configura `HF_TOKEN` en _Settings → Variables and secrets_ del Space"
    )
| if __name__ == "__main__": | |
| demo.launch(server_name="0.0.0.0", server_port=7860) | |