File size: 17,976 Bytes
f3e9bca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9caf5bf
f3e9bca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9caf5bf
f3e9bca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d6b7c0
 
 
f3e9bca
8d6b7c0
f3e9bca
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
"""
cloud/huggingface/training_space/app.py
----------------------------------------
HuggingFace Space para fine-tuning de py2cpp con ZeroGPU.

ZeroGPU = GPU gratuita (A100 40GB) para Spaces con plan PRO (~$9/mes).
Si no tienes PRO, funciona en CPU (mΓ‘s lento) o puedes pagar T4 (~$0.60/hr).

Este Space hace TODO el pipeline:
  1. Sube tu dataset de pares Python→C++ (o usa los ejemplos bootstrap)
  2. Configura el fine-tuning (modelo, LoRA, Γ©pocas)
  3. Entrena con progreso en tiempo real
  4. Guarda el modelo directamente en tu HF Hub

Despliegue:
  - Crea un Space en huggingface.co/new-space
  - SDK: Gradio | Hardware: ZeroGPU (gratis con PRO) o T4 Small
  - Sube todos los archivos de esta carpeta
  - Configura los Secrets: HF_TOKEN (para escribir en el Hub)
"""
import spaces
import os
import sys
import json
import time
import threading
import tempfile
import logging
from pathlib import Path
from typing import Optional

import gradio as gr
import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub write token, injected via the Space's Secrets; empty string disables pushes.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

# Global training state (shared between the Gradio thread and the worker thread)
_train_state = {
    "running":   False,  # True while the worker thread is training
    "log":       [],     # accumulated timestamped log lines (UI tails this)
    "progress":  0.0,    # overall progress estimate in [0.0, 1.0]
    "done":      False,  # set once the worker finishes, success or not
    "error":     None,   # error message string when training failed
    "model_url": None,   # Hub URL once the model has been pushed
}


# ---------------------------------------------------------------------------
# Datos bootstrap embebidos (mΓ­nimo para arrancar sin dataset propio)
# ---------------------------------------------------------------------------
# Dataset builtin: 100 pares verificados, todos compilados con g++ -std=c++17
# Se cargan desde builtin_dataset.json (incluido junto a este app.py)
_BUILTIN_PAIRS: list[dict] = []

def _get_builtin_pairs() -> list[dict]:
    global _BUILTIN_PAIRS
    if _BUILTIN_PAIRS:
        return _BUILTIN_PAIRS
    builtin_path = Path(__file__).parent / "builtin_dataset.json"
    if builtin_path.exists():
        with open(builtin_path) as f:
            _BUILTIN_PAIRS = json.load(f)
    else:
        # Fallback si el archivo no estΓ‘ presente
        _BUILTIN_PAIRS = [
            {"python": "print('Hello, World!')",
             "cpp": '#include <iostream>\nusing namespace std;\nint main(){cout<<"Hello, World!"<<endl;return 0;}'},
            {"python": "def factorial(n):\n    return 1 if n<=1 else n*factorial(n-1)\nprint(factorial(10))",
             "cpp": '#include <iostream>\nusing namespace std;\nlong long f(int n){return n<=1?1:n*f(n-1);}\nint main(){cout<<f(10)<<endl;return 0;}'},
        ]
    return _BUILTIN_PAIRS


def _log(msg: str):
    """Append *msg*, timestamped, to the shared training log and mirror it to the logger."""
    stamp = time.strftime("%H:%M:%S")
    _train_state["log"].append(f"[{stamp}] {msg}")
    logger.info(msg)


def _prepare_dataset(jsonl_content: Optional[str], tmp_dir: str):
    """Build train/val JSONL files from uploaded content plus the builtin pairs.

    Args:
        jsonl_content: raw text of an uploaded .jsonl file (one JSON object
            per line with "python" and "cpp" keys), or None/empty to use only
            the builtin dataset.
        tmp_dir: directory where train.jsonl / val.jsonl are written.

    Returns:
        Tuple of (train_path, val_path) as strings.
    """
    pairs = []

    if jsonl_content and jsonl_content.strip():
        skipped = 0
        for line in jsonl_content.strip().splitlines():
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if "python" in obj and "cpp" in obj:
                pairs.append(obj)
            else:
                skipped += 1
        _log(f"Dataset cargado: {len(pairs)} pares del archivo subido")
        if skipped:
            # Surface bad lines instead of silently dropping them.
            _log(f"Skipped {skipped} invalid dataset line(s)")

    builtin = _get_builtin_pairs()
    _log(f"Usando {len(builtin)} pares verificados del dataset builtin")
    pairs.extend(builtin)

    import random
    random.shuffle(pairs)
    # 85/15 split; max(1, ...) guarantees a non-empty training set.
    n_train = max(1, int(len(pairs) * 0.85))

    for split, data in [("train", pairs[:n_train]), ("val", pairs[n_train:])]:
        p = Path(tmp_dir) / f"{split}.jsonl"
        # Explicit UTF-8: ensure_ascii=False emits non-ASCII characters, which
        # would crash on platforms with a non-UTF-8 default encoding.
        with open(p, "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

    _log(f"Split: {n_train} train / {len(pairs)-n_train} val")
    return str(Path(tmp_dir) / "train.jsonl"), str(Path(tmp_dir) / "val.jsonl")


def _run_training(
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
    jsonl_content: Optional[str],
):
    """Worker-thread entry point: run the whole fine-tuning pipeline.

    Publishes progress and log lines through the module-level ``_train_state``
    dict (polled by the Gradio UI). Any exception is caught, recorded in
    ``_train_state["error"]``, and the thread exits cleanly.

    Args:
        model_name: HF Hub id of the base seq2seq model (CodeT5 family).
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        num_epochs: number of training epochs.
        batch_size: per-device batch size.
        learning_rate: peak learning rate (cosine schedule).
        hub_repo_id: target Hub repo; push happens only if HF_TOKEN is set.
        jsonl_content: raw uploaded dataset text, or None for builtin pairs.
    """
    # Reset shared state so the UI reflects a fresh run.
    _train_state.update({"running": True, "log": [], "progress": 0.0,
                          "done": False, "error": None, "model_url": None})
    try:
        # Heavy ML imports are done inside the thread so app startup stays fast.
        import torch
        from torch.utils.data import Dataset
        from transformers import (
            AutoTokenizer, AutoModelForSeq2SeqLM,
            Seq2SeqTrainer, Seq2SeqTrainingArguments,
            DataCollatorForSeq2Seq, TrainerCallback,
        )
        from peft import LoraConfig, TaskType, get_peft_model

        _log(f"Device: {DEVICE.upper()}")
        if torch.cuda.is_available():
            _log(f"GPU: {torch.cuda.get_device_name(0)}")

        with tempfile.TemporaryDirectory() as tmp_dir:
            # ── Dataset ──────────────────────────────────────────────────
            train_file, val_file = _prepare_dataset(jsonl_content, tmp_dir)

            # ── Tokenizer + model ────────────────────────────────────────
            _log(f"Cargando tokenizer: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            _log(f"Cargando modelo base...")
            # bf16 on GPU keeps memory low; full fp32 on CPU for numeric safety.
            dtype = torch.bfloat16 if DEVICE == "cuda" else torch.float32
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto" if DEVICE == "cuda" else None,
                trust_remote_code=True,
            )

            # ── LoRA ─────────────────────────────────────────────────────
            # q/v/k/o match the attention projection module names of T5-style
            # models (CodeT5/CodeT5+); other architectures need other names.
            lora_cfg = LoraConfig(
                r=lora_r, lora_alpha=lora_alpha, lora_dropout=0.05,
                bias="none",
                target_modules=["q", "v", "k", "o"],
                task_type=TaskType.SEQ_2_SEQ_LM,
            )
            model = get_peft_model(model, lora_cfg)
            n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            _log(f"ParΓ‘metros entrenables: {n_params:,} ({n_params/1e6:.2f}M)")
            _train_state["progress"] = 0.1

            # ── PyTorch dataset ──────────────────────────────────────────
            class SimpleDS(Dataset):
                """JSONL-backed dataset yielding tokenized source/target pairs."""

                def __init__(self, path, tok, max_len=512):
                    self.tok = tok
                    self.max_len = max_len
                    self.samples = []
                    with open(path) as f:
                        for line in f:
                            if line.strip():
                                self.samples.append(json.loads(line))

                def __len__(self): return len(self.samples)

                def __getitem__(self, i):
                    s = self.samples[i]
                    # Task-prefix prompt, T5 style.
                    inp = self.tok(
                        "Translate Python to C++: " + s["python"],
                        max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    lbl = self.tok(
                        s["cpp"], max_length=self.max_len, padding="max_length",
                        truncation=True, return_tensors="pt"
                    )
                    ids = lbl["input_ids"].squeeze()
                    # -100 masks padding positions out of the cross-entropy loss.
                    ids[ids == self.tok.pad_token_id] = -100
                    return {
                        "input_ids": inp["input_ids"].squeeze(),
                        "attention_mask": inp["attention_mask"].squeeze(),
                        "labels": ids,
                    }

            train_ds = SimpleDS(train_file, tokenizer)
            val_ds   = SimpleDS(val_file,   tokenizer)
            _log(f"Train: {len(train_ds)} | Val: {len(val_ds)}")
            _train_state["progress"] = 0.15

            # ── Progress callback ────────────────────────────────────────
            # Training occupies the [0.15, 0.85] band of the progress bar.
            total_steps = [0]

            class ProgressCallback(TrainerCallback):
                def on_train_begin(self, args, state, control, **kw):
                    total_steps[0] = state.max_steps
                    _log(f"Iniciando: {state.max_steps} steps totales")

                def on_log(self, args, state, control, logs=None, **kw):
                    if logs and total_steps[0] > 0:
                        frac = state.global_step / total_steps[0]
                        _train_state["progress"] = 0.15 + frac * 0.70
                        loss = logs.get("loss", logs.get("train_loss", "?"))
                        _log(f"Step {state.global_step}/{total_steps[0]} β€” loss: {loss}")

                def on_epoch_end(self, args, state, control, **kw):
                    ep = int(state.epoch)
                    _log(f"βœ“ Γ‰poca {ep}/{num_epochs} completada")

            # ── Training args ────────────────────────────────────────────
            ck_dir = str(Path(tmp_dir) / "checkpoints")
            bf16_ok = DEVICE == "cuda" and torch.cuda.is_bf16_supported()
            args = Seq2SeqTrainingArguments(
                output_dir=ck_dir,
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                # Keep the effective batch size around 8 regardless of batch_size.
                gradient_accumulation_steps=max(1, 8 // batch_size),
                learning_rate=learning_rate,
                lr_scheduler_type="cosine",
                warmup_ratio=0.05,
                bf16=bf16_ok,
                fp16=(not bf16_ok and DEVICE == "cuda"),
                gradient_checkpointing=(DEVICE == "cuda"),
                save_strategy="epoch",
                # NOTE(review): transformers >= 4.46 renamed this kwarg to
                # eval_strategy — confirm the version pinned for this Space.
                evaluation_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                # Aim for roughly 5 log lines per epoch.
                logging_steps=max(1, len(train_ds) // (batch_size * 5)),
                predict_with_generate=True,
                generation_max_length=256,
                report_to="none",
                push_to_hub=bool(hub_repo_id and HF_TOKEN),
                hub_model_id=hub_repo_id or None,
                hub_token=HF_TOKEN or None,
                hub_strategy="end",
            )

            trainer = Seq2SeqTrainer(
                model=model,
                args=args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(
                    tokenizer, model=model, label_pad_token_id=-100
                ),
                callbacks=[ProgressCallback()],
            )

            _log("πŸš€ Iniciando fine-tuning...")
            trainer.train()
            _train_state["progress"] = 0.85

            # ── Push to Hub ──────────────────────────────────────────────
            if hub_repo_id and HF_TOKEN:
                _log(f"Subiendo modelo a hub: {hub_repo_id}...")
                trainer.push_to_hub(commit_message="py2cpp fine-tuned via Training Space")
                _train_state["model_url"] = f"https://huggingface.co/{hub_repo_id}"
                _log(f"βœ“ Modelo disponible: {_train_state['model_url']}")
            else:
                _log("⚠ Sin HF_TOKEN: modelo no subido al Hub")

            _train_state["progress"] = 1.0
            _log("βœ… Entrenamiento completado")

    except Exception as e:
        # Surface the failure to the UI instead of killing the thread silently.
        _train_state["error"] = str(e)
        _log(f"❌ Error: {e}")
        logger.exception("Error en entrenamiento")
    finally:
        # Always flip the flags so the UI stops showing "running".
        _train_state["running"] = False
        _train_state["done"] = True


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
@spaces.GPU
def start_training(
    jsonl_file,
    model_name: str,
    lora_r: int,
    lora_alpha: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    hub_repo_id: str,
):
    """Launch a background training thread and return (status_text, progress).

    Rejects the request if a run is already in progress. The uploaded file,
    when present, is read eagerly here so the worker thread never touches
    Gradio's temp-file object.

    NOTE(review): @spaces.GPU scopes the GPU grant to this call; since the
    actual training runs in a daemon thread that outlives it, confirm ZeroGPU
    keeps the GPU allocated for the thread's lifetime.
    """
    if _train_state["running"]:
        return "⚠ Ya hay un entrenamiento en curso.", 0.0

    uploaded_text = None
    if jsonl_file is not None:
        with open(jsonl_file.name) as fh:
            uploaded_text = fh.read()

    worker = threading.Thread(
        target=_run_training,
        args=(model_name, lora_r, lora_alpha, num_epochs,
              batch_size, learning_rate, hub_repo_id, uploaded_text),
        daemon=True,
    )
    worker.start()
    return "πŸš€ Entrenamiento iniciado...", 0.0


def get_status():
    """Snapshot the shared training state for the UI poller.

    Returns a 4-tuple: (last 60 log lines as text, progress fraction,
    human-readable status label, model Hub URL or empty string).
    """
    log_text = "\n".join(_train_state["log"][-60:])
    progress = _train_state["progress"]
    url = _train_state.get("model_url") or ""
    error = _train_state.get("error") or ""

    if _train_state["running"]:
        status = "⏳ Entrenando..."
    elif error:
        status = f"❌ Error: {error}"
    elif _train_state["done"]:
        status = "βœ… Completado"
    else:
        status = "πŸ”΅ Listo para entrenar"

    return log_text, progress, status, url


CSS = """
#title  { text-align:center; font-size:1.6rem; margin-bottom:0.2rem; }
#sub    { text-align:center; color:#888; margin-bottom:1.5rem; }
.mono   { font-family: 'JetBrains Mono', monospace; font-size:0.8rem; }
"""

with gr.Blocks(
    title="py2cpp Training Space",
    theme=gr.themes.Soft(primary_hue="violet"),
    css=CSS,
) as demo:

    gr.Markdown("# ⚑ py2cpp β€” Training Space", elem_id="title")
    gr.Markdown(
        "Fine-tuning de **CodeT5+** con LoRA para traducciΓ³n Python β†’ C++  \n"
        f"Device: `{DEVICE.upper()}` {'🟒 GPU disponible' if DEVICE=='cuda' else '🟑 CPU (lento)'}",
        elem_id="sub",
    )

    with gr.Row():
        # ── Left panel: configuration ───────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“‚ Dataset")
            jsonl_file = gr.File(
                label="Sube tu dataset (.jsonl) β€” opcional",
                file_types=[".jsonl"],
            )
            gr.Markdown(
                "_Formato: una lΓ­nea por par `{\"python\": \"...\", \"cpp\": \"...\"}`  \n"
                "Si no subes nada, se usan los ejemplos bootstrap incluidos._"
            )

            gr.Markdown("### βš™οΈ ConfiguraciΓ³n del modelo")
            model_name = gr.Dropdown(
                choices=[
                    "Salesforce/codet5p-220m",
                    "Salesforce/codet5p-770m",
                    "Salesforce/codet5-small",
                    "Salesforce/codet5-base",
                ],
                value="Salesforce/codet5p-220m",
                label="Modelo base",
            )
            with gr.Row():
                lora_r     = gr.Slider(4, 64, value=16, step=4,  label="LoRA r")
                lora_alpha = gr.Slider(8, 128, value=32, step=8, label="LoRA alpha")
            with gr.Row():
                num_epochs = gr.Slider(1, 20, value=5,   step=1,   label="Γ‰pocas")
                batch_size = gr.Slider(1, 16,  value=4,  step=1,   label="Batch size")
            learning_rate = gr.Slider(1e-5, 1e-3, value=3e-4, step=1e-5, label="Learning rate")

            gr.Markdown("### πŸ€— Publicar en HuggingFace Hub")
            hub_repo_id = gr.Textbox(
                label="Repo ID (ej: tu-usuario/py2cpp-model)",
                placeholder="tu-usuario/py2cpp-codet5p",
                info="Requiere HF_TOKEN configurado en los Secrets del Space",
            )

            train_btn = gr.Button("πŸš€ Iniciar entrenamiento", variant="primary", size="lg")
            status_label = gr.Textbox(label="Estado", interactive=False, max_lines=1)

        # ── Right panel: progress ────────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“Š Progreso")
            progress_bar = gr.Slider(
                0, 1, value=0, label="Progreso total",
                interactive=False,
            )
            log_box = gr.Textbox(
                label="Log de entrenamiento",
                lines=24,
                max_lines=24,
                interactive=False,
                elem_classes=["mono"],
            )
            model_url_box = gr.Textbox(
                label="URL del modelo en Hub",
                interactive=False,
                placeholder="AparecerΓ‘ cuando termine el entrenamiento...",
            )

    train_btn.click(
        fn=start_training,
        inputs=[jsonl_file, model_name, lora_r, lora_alpha,
                num_epochs, batch_size, learning_rate, hub_repo_id],
        outputs=[status_label, progress_bar],
    )

    # Poll the shared training state every 3 seconds while the UI is open.
    timer = gr.Timer(value=3)  # fires every 3 seconds

    timer.tick(
        fn=get_status,
        outputs=[log_box, progress_bar, status_label, model_url_box]
    )

    gr.Markdown(
        "---\n"
        "**Notas:**  \n"
        "β€’ ZeroGPU (PRO) = A100 gratis por sesiΓ³n  \n"
        "β€’ T4 Small = ~$0.60/hr, A10G Small = ~$3/hr  \n"
        "β€’ Configura `HF_TOKEN` en _Settings β†’ Variables and secrets_ del Space"
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)