#!/bin/bash
set -euo pipefail
set -x  # Print every command as it runs, which makes debugging easier

# =============================================================================
# BENGALI-CODE LLM - DEV PIPELINE SCRIPT
# =============================================================================
# This script is designed to run in the resource-constrained Hugging Face Space.

echo "🚀 Initializing Dev Pipeline..."

# --- Configuration ---
VOCAB_SIZE=16000  # Smaller vocab for faster dev run
PROJECT_DIR="$(pwd)"

# --- Create Directory Structure ---
mkdir -p {data/{raw,processed},tokenizer,models,checkpoints,results,logs,scripts,configs}

# --- 1. Data Collection (Sample Data) ---
echo "📚 Step 1: Creating a small sample dataset..."
cat > data/raw/sample_data.txt <<'EOF'
আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
The quick brown fox jumps over the lazy dog.
def factorial(n):
    # This function calculates the factorial of a number
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
import math
print(math.pi)
EOF
echo "✅ Sample dataset created."

# --- 2. Preprocessing & Tokenizer Training ---
echo "🧹 Step 2: Preprocessing data..."
cat data/raw/*.txt > data/processed/combined.txt
head -n 3 data/processed/combined.txt > data/processed/train.txt
tail -n +4 data/processed/combined.txt > data/processed/validation.txt
echo "✅ Data preprocessed."

echo "🔤 Step 3: Training tokenizer..."
python3 << EOF
import sentencepiece as spm
import os

os.makedirs('tokenizer', exist_ok=True)
spm.SentencePieceTrainer.train(
    input='data/processed/train.txt',
    model_prefix='tokenizer/bengali_code_dev',
    vocab_size=${VOCAB_SIZE},
    model_type='bpe',
    character_coverage=1.0,   # keep every Bengali character from the tiny sample
    hard_vocab_limit=False,   # the sample corpus is far too small to reach VOCAB_SIZE
    pad_id=0, unk_id=1, bos_id=2, eos_id=3
)
EOF
echo "✅ Tokenizer trained."

# --- 3. Model Training (Tiny Dev Model) ---
echo "🧠 Step 4: Configuring and Training Tiny Model..."
cat > scripts/train_dev.py << 'EOF'
import sentencepiece as spm
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer, default_data_collator
from datasets import load_dataset

BLOCK_SIZE = 64  # fixed sequence length for the dev run

class Tokenizer:
    """Minimal SentencePiece wrapper that emits fixed-length, pre-padded examples."""
    def __init__(self, path):
        self.sp = spm.SentencePieceProcessor(model_file=path)

    def __call__(self, text, **kwargs):
        ids = self.sp.encode(text, out_type=int)[:BLOCK_SIZE]
        pad_len = BLOCK_SIZE - len(ids)
        return {
            "input_ids": ids + [self.pad_token_id] * pad_len,
            "attention_mask": [1] * len(ids) + [0] * pad_len,
            # Causal-LM labels mirror the inputs; padded positions are ignored via -100.
            "labels": ids + [-100] * pad_len,
        }

    def decode(self, ids, **kwargs):
        return self.sp.decode(ids)

    @property
    def vocab_size(self):
        return self.sp.vocab_size()

    @property
    def pad_token_id(self):
        return self.sp.pad_id()

tokenizer = Tokenizer(path="tokenizer/bengali_code_dev.model")

dataset = load_dataset("text", data_files={"train": "data/processed/train.txt",
                                           "validation": "data/processed/validation.txt"})
dataset = dataset.filter(lambda e: len(e["text"].strip()) > 0)  # drop empty lines
tokenized_ds = dataset.map(lambda e: tokenizer(e["text"]), remove_columns=["text"])

config = AutoConfig.from_pretrained("gpt2", vocab_size=tokenizer.vocab_size, n_layer=2, n_head=2, n_embd=128)
model = AutoModelForCausalLM.from_config(config)
print(f"✅ Tiny model created with ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters.")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=1, report_to="none"),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    # Examples are already padded to BLOCK_SIZE, so the default collator is sufficient.
    data_collator=default_data_collator
)

print("🚀 Starting training...")
trainer.train()
trainer.save_model("models/bengali_code_dev")  # keep the dev checkpoint in models/
print("✅ Training complete.")
EOF

python3 scripts/train_dev.py

echo "🎉 PIPELINE COMPLETED SUCCESSFULLY!"
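
# --- Optional: Tokenizer Sanity Check ---
# A minimal post-run sketch, not part of the original pipeline steps: it reloads the
# tokenizer trained in Step 3 (assuming the model_prefix above produced
# tokenizer/bengali_code_dev.model) and round-trips one Bengali line and one code
# line from the sample data, so a broken tokenizer artifact is caught immediately.
echo "🔎 Optional: Verifying tokenizer round-trip..."
python3 <<'EOF'
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tokenizer/bengali_code_dev.model")
for sample in ["আমার সোনার বাংলা", "def factorial(n):"]:
    ids = sp.encode(sample, out_type=int)
    print(f"{sample!r} -> {ids} -> {sp.decode(ids)!r}")
EOF
echo "✅ Tokenizer sanity check finished."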