File size: 5,399 Bytes
463fc7e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | import argparse
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from scripts.core.training.model import CodeEmbedder
from scripts.core.training.trainer import CodeTrainer
import json
# Real Dataset class for Triplet Training
class RealCodeDataset(Dataset):
def __init__(self, jsonl_path, tokenizer, max_length=512):
self.tokenizer = tokenizer
self.max_length = max_length
self.data = []
print(f"Loading data from {jsonl_path}...")
with open(jsonl_path, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
self.data.append(json.loads(line))
print(f"Loaded {len(self.data)} triplets.")
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
item = self.data[idx]
# Helper to tokenize
def tokenize_text(text):
return self.tokenizer(
text,
return_tensors='pt',
padding='max_length',
truncation=True,
max_length=self.max_length
)
# Tokenize all three parts
anchor = tokenize_text(item['anchor'])
positive = tokenize_text(item['positive'])
negative = tokenize_text(item['negative'])
# Return a flat dict with prefixed keys
return {
'anchor_input_ids': anchor['input_ids'].squeeze(0),
'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
'positive_input_ids': positive['input_ids'].squeeze(0),
'positive_attention_mask': positive['attention_mask'].squeeze(0),
'negative_input_ids': negative['input_ids'].squeeze(0),
'negative_attention_mask': negative['attention_mask'].squeeze(0)
}
# Dummy Dataset class for MVP testing without the robust data pipeline availability
class DummyCodeDataset(Dataset):
def __init__(self, tokenizer, size=100):
self.tokenizer = tokenizer
self.size = size
# Generate dummy triplet structure
self.data = [{"anchor": "def hello(): return 'world'", "positive": "def hi(): return 'earth'", "negative": "class Foo: pass"}] * size
def __len__(self):
return self.size
def __getitem__(self, idx):
item = self.data[idx]
# Helper to tokenize
def tokenize_text(text):
return self.tokenizer(
text,
return_tensors='pt',
padding='max_length',
truncation=True,
max_length=128
)
anchor = tokenize_text(item['anchor'])
positive = tokenize_text(item['positive'])
negative = tokenize_text(item['negative'])
return {
'anchor_input_ids': anchor['input_ids'].squeeze(0),
'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
'positive_input_ids': positive['input_ids'].squeeze(0),
'positive_attention_mask': positive['attention_mask'].squeeze(0),
'negative_input_ids': negative['input_ids'].squeeze(0),
'negative_attention_mask': negative['attention_mask'].squeeze(0)
}
def main():
parser = argparse.ArgumentParser(description="Train CodeMode Embeddings")
parser.add_argument("--model_name", type=str, default="microsoft/codebert-base", help="Hub model name")
parser.add_argument("--data_path", type=str, required=False, help="Path to parsed chunks.jsonl")
parser.add_argument("--output_dir", type=str, default="./output", help="Where to save checkpoints")
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--accumulation_steps", type=int, default=4, help="Gradient Accumulation Steps")
parser.add_argument("--lr", type=float, default=2e-5)
parser.add_argument("--dry_run", action="store_true", help="Run with dummy data for 1 epoch")
args = parser.parse_args()
print(f"Initializing Training Pipeline...")
print(f" Model: {args.model_name}")
print(f" Output: {args.output_dir}")
print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
# 1. Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
# 2. Load Dataset (Real or Dummy)
if args.data_path and os.path.exists(args.data_path):
train_dataset = RealCodeDataset(args.data_path, tokenizer)
else:
print("No data path provided or file missing. Using DUMMY data for verification.")
train_dataset = DummyCodeDataset(tokenizer, size=100)
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
# 3. Initialize Model
model = CodeEmbedder(model_name_or_path=args.model_name)
# 4. Initialize Trainer
trainer = CodeTrainer(
model=model,
train_loader=train_loader,
epochs=args.epochs,
learning_rate=args.lr,
accumulation_steps=args.accumulation_steps,
mixed_precision=True, # Hardcoded True for the "Zero-Cost" philosophy
output_dir=args.output_dir
)
# 5. Connect and Train
trainer.train()
print("Training Complete.")
if __name__ == "__main__":
main()
|