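Add Tamil character support to tokenizer - 62 Tamil vocabulary entries: vowels, consonants, Grantha characters (including the conjunct க்ஷ and the ligature ஸ்ரீ), vowel signs plus the pulli (்) and the AU length mark (ௗ), digits, and the number symbols ௰, ௱, ௲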

#47
Files changed (1) hide show
  1. grapheme_mtl_with_tamil_v2.json +158 -0
grapheme_mtl_with_tamil_v2.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "model": {
4
+ "type": "WordLevel",
5
+ "vocab": {
6
+ "[UNK]": 0,
7
+ "[START]": 1,
8
+ "[STOP]": 2,
9
+ "[SPACE]": 3,
10
+ "[PAD]": 4,
11
+ "[CLS]": 5,
12
+ "[SEP]": 6,
13
+ "a": 7,
14
+ "b": 8,
15
+ "c": 9,
16
+ "d": 10,
17
+ "e": 11,
18
+ "f": 12,
19
+ "g": 13,
20
+ "h": 14,
21
+ "i": 15,
22
+ "j": 16,
23
+ "k": 17,
24
+ "l": 18,
25
+ "m": 19,
26
+ "n": 20,
27
+ "o": 21,
28
+ "p": 22,
29
+ "q": 23,
30
+ "r": 24,
31
+ "s": 25,
32
+ "t": 26,
33
+ "u": 27,
34
+ "v": 28,
35
+ "w": 29,
36
+ "x": 30,
37
+ "y": 31,
38
+ "z": 32,
39
+ "A": 33,
40
+ "B": 34,
41
+ "C": 35,
42
+ "D": 36,
43
+ "E": 37,
44
+ "F": 38,
45
+ "G": 39,
46
+ "H": 40,
47
+ "I": 41,
48
+ "J": 42,
49
+ "K": 43,
50
+ "L": 44,
51
+ "M": 45,
52
+ "N": 46,
53
+ "O": 47,
54
+ "P": 48,
55
+ "Q": 49,
56
+ "R": 50,
57
+ "S": 51,
58
+ "T": 52,
59
+ "U": 53,
60
+ "V": 54,
61
+ "W": 55,
62
+ "X": 56,
63
+ "Y": 57,
64
+ "Z": 58,
65
+ "0": 59,
66
+ "1": 60,
67
+ "2": 61,
68
+ "3": 62,
69
+ "4": 63,
70
+ "5": 64,
71
+ "6": 65,
72
+ "7": 66,
73
+ "8": 67,
74
+ "9": 68,
75
+ ".": 69,
76
+ ",": 70,
77
+ "!": 71,
78
+ "?": 72,
79
+ ";": 73,
80
+ ":": 74,
81
+ "'": 75,
82
+ "\"": 76,
83
+ "(": 77,
84
+ ")": 78,
85
+ "-": 79,
86
+ " ": 80,
87
+ "அ": 81,
88
+ "ஆ": 82,
89
+ "இ": 83,
90
+ "ஈ": 84,
91
+ "உ": 85,
92
+ "ஊ": 86,
93
+ "எ": 87,
94
+ "ஏ": 88,
95
+ "ஐ": 89,
96
+ "ஒ": 90,
97
+ "ஓ": 91,
98
+ "ஔ": 92,
99
+ "க": 93,
100
+ "ங": 94,
101
+ "ச": 95,
102
+ "ஞ": 96,
103
+ "ட": 97,
104
+ "ண": 98,
105
+ "த": 99,
106
+ "ந": 100,
107
+ "ப": 101,
108
+ "ம": 102,
109
+ "ய": 103,
110
+ "ர": 104,
111
+ "ல": 105,
112
+ "வ": 106,
113
+ "ழ": 107,
114
+ "ள": 108,
115
+ "ற": 109,
116
+ "ன": 110,
117
+ "ஜ": 111,
118
+ "ஷ": 112,
119
+ "ஸ": 113,
120
+ "ஹ": 114,
121
+ "க்ஷ": 115,
122
+ "ஸ்ரீ": 116,
123
+ "ா": 117,
124
+ "ி": 118,
125
+ "ீ": 119,
126
+ "ு": 120,
127
+ "ூ": 121,
128
+ "ெ": 122,
129
+ "ே": 123,
130
+ "ை": 124,
131
+ "ொ": 125,
132
+ "ோ": 126,
133
+ "ௌ": 127,
134
+ "்": 128,
135
+ "௦": 129,
136
+ "௧": 130,
137
+ "௨": 131,
138
+ "௩": 132,
139
+ "௪": 133,
140
+ "௫": 134,
141
+ "௬": 135,
142
+ "௭": 136,
143
+ "௮": 137,
144
+ "௯": 138,
145
+ "ௗ": 139,
146
+ "௰": 140,
147
+ "௱": 141,
148
+ "௲": 142
149
+ },
150
+ "unk_token": "[UNK]"
151
+ },
152
+ "pre_tokenizer": {
153
+ "type": "Whitespace"
154
+ },
155
+ "normalizer": {
156
+ "type": "NFKC"
157
+ }
158
+ }