Spaces:
Sleeping
Sleeping
File size: 2,072 Bytes
dbb535a 85f3c75 dbb535a 47c9da0 2b7f32a 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 5740a72 47c9da0 2b7f32a 5740a72 47c9da0 2b7f32a 47c9da0 2b7f32a 47c9da0 8e0eaae 47c9da0 8e0eaae 2b7f32a 47c9da0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | import tensorflow as tf
from model import VedaProgrammingLLM
from tokenizer import VedaTokenizer
from config import MODEL_DIR, VOCAB_SIZE, MAX_LENGTH
import os
import json
# Minimal built-in chat-style corpus used to bootstrap training.
# Format: alternating <USER> / <ASSISTANT> turns; code answers are wrapped
# in <CODE> ... <ENDCODE> markers.
# NOTE(review): the add() example closes with </CODE> instead of <ENDCODE>
# (inconsistent with the hello-world example) — confirm which closing tag
# the tokenizer/model actually expects.
TRAINING_DATA = """
<USER> Hi
<ASSISTANT> Hello! How can I help you today?
<USER> Who are you?
<ASSISTANT> I am Veda, an AI programming assistant.
<USER> What is Python?
<ASSISTANT> Python is a high-level programming language known for its simplicity and readability.
<USER> Write hello world in python
<ASSISTANT> <CODE>print("Hello, World!")<ENDCODE>
<USER> Write a function to add numbers
<ASSISTANT> <CODE>
def add(a, b):
return a + b
</CODE>
<USER> Explain recursion
<ASSISTANT> Recursion is when a function calls itself to solve a problem. It needs a base case to stop.
"""
class VedaTrainer:
    """Trains the Veda programming LLM on the built-in chat corpus.

    Uses module-level TRAINING_DATA plus the project-local VedaTokenizer
    and VedaProgrammingLLM; persists weights, tokenizer and model config
    under MODEL_DIR.
    """

    def __init__(self):
        self.tokenizer = VedaTokenizer(VOCAB_SIZE)
        self.model = None  # built lazily in train()

    def train(self, epochs=10, extra_data="", stride=50, batch_size=4):
        """Fit the tokenizer and model, then save all artifacts.

        Args:
            epochs: number of training epochs.
            extra_data: optional raw text appended to TRAINING_DATA.
            stride: step between token windows (generalized; defaults to
                the previously hard-coded 50).
            batch_size: dataset batch size (generalized; defaults to the
                previously hard-coded 4).
        """
        # Hoisted: the original imported numpy mid-function, after the
        # slicing loop. Kept function-local to preserve the file's style.
        import numpy as np

        data = TRAINING_DATA + "\n" + extra_data
        self.tokenizer.fit([data])
        tokens = self.tokenizer.encode(data)

        # Slice the token stream into overlapping windows of MAX_LENGTH+1
        # tokens; each window yields (input = w[:-1], target = w[1:]).
        seqs = [
            tokens[i:i + MAX_LENGTH + 1]
            for i in range(0, len(tokens) - MAX_LENGTH, stride)
        ]
        if not seqs:
            # Corpus shorter than one full window — train on what exists.
            seqs = [tokens[:MAX_LENGTH + 1]]
        arr = np.array(seqs)
        ds = tf.data.Dataset.from_tensor_slices(
            (arr[:, :-1], arr[:, 1:])
        ).batch(batch_size)

        self.model = VedaProgrammingLLM(self.tokenizer.vocabulary_size)
        self.model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        )
        # One forward pass so the model's variables are created before
        # fit() / save_weights().
        self.model(tf.zeros((1, MAX_LENGTH)))
        self.model.fit(ds, epochs=epochs)

        # Fix: creating MODEL_DIR if absent — the original crashed on a
        # fresh checkout when the directory did not yet exist.
        os.makedirs(MODEL_DIR, exist_ok=True)
        self.model.save_weights(os.path.join(MODEL_DIR, "weights.h5"))
        self.tokenizer.save(os.path.join(MODEL_DIR, "tokenizer.json"))
        with open(os.path.join(MODEL_DIR, "config.json"), 'w') as f:
            json.dump(self.model.get_config(), f)
# Script entry point: retrain the model from scratch for 20 epochs.
if __name__ == "__main__":
    trainer = VedaTrainer()
    trainer.train(epochs=20)