File size: 2,072 Bytes
dbb535a
85f3c75
dbb535a
47c9da0
 
 
2b7f32a
5740a72
47c9da0
 
5740a72
47c9da0
 
5740a72
47c9da0
 
5740a72
47c9da0
 
5740a72
47c9da0
 
5740a72
 
47c9da0
5740a72
47c9da0
 
5740a72
 
 
47c9da0
 
2b7f32a
5740a72
47c9da0
 
2b7f32a
47c9da0
2b7f32a
47c9da0
 
 
 
 
 
 
 
 
8e0eaae
47c9da0
 
 
 
 
 
 
 
 
 
 
 
8e0eaae
2b7f32a
47c9da0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import os

import numpy as np
import tensorflow as tf

from config import MODEL_DIR, VOCAB_SIZE, MAX_LENGTH
from model import VedaProgrammingLLM
from tokenizer import VedaTokenizer

# Built-in seed corpus: alternating <USER>/<ASSISTANT> dialogue turns, with
# code answers wrapped in <CODE> ... markers. The tokenizer is fitted on this
# text, so these tag strings become part of the learned vocabulary.
# NOTE(review): the closing tag is inconsistent — one example uses <ENDCODE>,
# another uses </CODE>. Confirm which token the model/decoder expects before
# changing either; the text is training data, so it is left byte-identical here.
TRAINING_DATA = """
<USER> Hi
<ASSISTANT> Hello! How can I help you today?

<USER> Who are you?
<ASSISTANT> I am Veda, an AI programming assistant.

<USER> What is Python?
<ASSISTANT> Python is a high-level programming language known for its simplicity and readability.

<USER> Write hello world in python
<ASSISTANT> <CODE>print("Hello, World!")<ENDCODE>

<USER> Write a function to add numbers
<ASSISTANT> <CODE>
def add(a, b):
    return a + b
</CODE>

<USER> Explain recursion
<ASSISTANT> Recursion is when a function calls itself to solve a problem. It needs a base case to stop.
"""

class VedaTrainer:
    """Trains the Veda programming-assistant language model.

    Fits a tokenizer on the dialogue corpus, builds overlapping
    next-token-prediction windows, trains the model, and persists the
    weights, tokenizer, and model config under MODEL_DIR.
    """

    def __init__(self):
        # Tokenizer is fitted and the model is built inside train().
        self.tokenizer = VedaTokenizer(VOCAB_SIZE)
        self.model = None

    def train(self, epochs=10, extra_data="", stride=50, batch_size=4):
        """Train the model on TRAINING_DATA (+ optional extra text) and save artifacts.

        Args:
            epochs: Number of passes over the dataset.
            extra_data: Additional training text appended to the built-in corpus.
            stride: Step between start offsets of successive training windows
                (smaller stride = more, overlapping sequences). Default matches
                the original hard-coded value.
            batch_size: Mini-batch size for the tf.data pipeline.
        """
        data = TRAINING_DATA + "\n" + extra_data
        self.tokenizer.fit([data])
        tokens = self.tokenizer.encode(data)

        # Overlapping windows of MAX_LENGTH+1 tokens; each yields an
        # (input, target) pair shifted by one position below.
        seqs = [
            tokens[i:i + MAX_LENGTH + 1]
            for i in range(0, len(tokens) - MAX_LENGTH, stride)
        ]
        # Corpus shorter than one full window: fall back to a single
        # (possibly short) sequence so training still proceeds.
        if not seqs:
            seqs = [tokens[:MAX_LENGTH + 1]]

        arr = np.array(seqs)
        # inputs = all but last token, targets = all but first (teacher forcing).
        ds = tf.data.Dataset.from_tensor_slices(
            (arr[:, :-1], arr[:, 1:])
        ).batch(batch_size)

        self.model = VedaProgrammingLLM(self.tokenizer.vocabulary_size)
        self.model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        )

        # One forward pass so all variable shapes are created before fit().
        self.model(tf.zeros((1, MAX_LENGTH)))
        self.model.fit(ds, epochs=epochs)

        # Persist everything needed for inference. Create the target directory
        # first — the original crashed with FileNotFoundError when MODEL_DIR
        # did not already exist.
        os.makedirs(MODEL_DIR, exist_ok=True)
        self.model.save_weights(os.path.join(MODEL_DIR, "weights.h5"))
        self.tokenizer.save(os.path.join(MODEL_DIR, "tokenizer.json"))
        with open(os.path.join(MODEL_DIR, "config.json"), 'w') as f:
            json.dump(self.model.get_config(), f)

if __name__ == "__main__":
    # Script entry point: run a full 20-epoch training session.
    trainer = VedaTrainer()
    trainer.train(epochs=20)