Add complete Tamil character support to the tokenizer: 286 Tamil tokens (graphemes) covering all vowels, consonants, uyirmei combinations, Grantha characters, digits, and symbols

#48
grapheme_mtl_complete_tamil_final.json ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "model": {
4
+ "type": "WordLevel",
5
+ "vocab": {
6
+ "[UNK]": 0,
7
+ "[START]": 1,
8
+ "[STOP]": 2,
9
+ "[SPACE]": 3,
10
+ "[PAD]": 4,
11
+ "[CLS]": 5,
12
+ "[SEP]": 6,
13
+ "a": 7,
14
+ "b": 8,
15
+ "c": 9,
16
+ "d": 10,
17
+ "e": 11,
18
+ "f": 12,
19
+ "g": 13,
20
+ "h": 14,
21
+ "i": 15,
22
+ "j": 16,
23
+ "k": 17,
24
+ "l": 18,
25
+ "m": 19,
26
+ "n": 20,
27
+ "o": 21,
28
+ "p": 22,
29
+ "q": 23,
30
+ "r": 24,
31
+ "s": 25,
32
+ "t": 26,
33
+ "u": 27,
34
+ "v": 28,
35
+ "w": 29,
36
+ "x": 30,
37
+ "y": 31,
38
+ "z": 32,
39
+ "A": 33,
40
+ "B": 34,
41
+ "C": 35,
42
+ "D": 36,
43
+ "E": 37,
44
+ "F": 38,
45
+ "G": 39,
46
+ "H": 40,
47
+ "I": 41,
48
+ "J": 42,
49
+ "K": 43,
50
+ "L": 44,
51
+ "M": 45,
52
+ "N": 46,
53
+ "O": 47,
54
+ "P": 48,
55
+ "Q": 49,
56
+ "R": 50,
57
+ "S": 51,
58
+ "T": 52,
59
+ "U": 53,
60
+ "V": 54,
61
+ "W": 55,
62
+ "X": 56,
63
+ "Y": 57,
64
+ "Z": 58,
65
+ "0": 59,
66
+ "1": 60,
67
+ "2": 61,
68
+ "3": 62,
69
+ "4": 63,
70
+ "5": 64,
71
+ "6": 65,
72
+ "7": 66,
73
+ "8": 67,
74
+ "9": 68,
75
+ ".": 69,
76
+ ",": 70,
77
+ "!": 71,
78
+ "?": 72,
79
+ ";": 73,
80
+ ":": 74,
81
+ "'": 75,
82
+ "\"": 76,
83
+ "(": 77,
84
+ ")": 78,
85
+ "-": 79,
86
+ " ": 80,
87
+ "அ": 81,
88
+ "ஆ": 82,
89
+ "இ": 83,
90
+ "ஈ": 84,
91
+ "உ": 85,
92
+ "ஊ": 86,
93
+ "எ": 87,
94
+ "ஏ": 88,
95
+ "ஐ": 89,
96
+ "ஒ": 90,
97
+ "ஓ": 91,
98
+ "ஔ": 92,
99
+ "க": 93,
100
+ "கா": 94,
101
+ "கி": 95,
102
+ "கீ": 96,
103
+ "கு": 97,
104
+ "கூ": 98,
105
+ "கெ": 99,
106
+ "கே": 100,
107
+ "கை": 101,
108
+ "கொ": 102,
109
+ "கோ": 103,
110
+ "கௌ": 104,
111
+ "க்": 105,
112
+ "க்ஷ": 106,
113
+ "ங": 107,
114
+ "ஙா": 108,
115
+ "ஙி": 109,
116
+ "ஙீ": 110,
117
+ "ஙு": 111,
118
+ "ஙூ": 112,
119
+ "ஙெ": 113,
120
+ "ஙே": 114,
121
+ "ஙை": 115,
122
+ "ஙொ": 116,
123
+ "ஙோ": 117,
124
+ "ஙௌ": 118,
125
+ "ங்": 119,
126
+ "ச": 120,
127
+ "சா": 121,
128
+ "சி": 122,
129
+ "சீ": 123,
130
+ "சு": 124,
131
+ "சூ": 125,
132
+ "செ": 126,
133
+ "சே": 127,
134
+ "சை": 128,
135
+ "சொ": 129,
136
+ "சோ": 130,
137
+ "சௌ": 131,
138
+ "ச்": 132,
139
+ "ஜ": 133,
140
+ "ஞ": 134,
141
+ "ஞா": 135,
142
+ "ஞி": 136,
143
+ "ஞீ": 137,
144
+ "ஞு": 138,
145
+ "ஞூ": 139,
146
+ "ஞெ": 140,
147
+ "ஞே": 141,
148
+ "ஞை": 142,
149
+ "ஞொ": 143,
150
+ "ஞோ": 144,
151
+ "ஞௌ": 145,
152
+ "ஞ்": 146,
153
+ "ட": 147,
154
+ "டா": 148,
155
+ "டி": 149,
156
+ "டீ": 150,
157
+ "டு": 151,
158
+ "டூ": 152,
159
+ "டெ": 153,
160
+ "டே": 154,
161
+ "டை": 155,
162
+ "டொ": 156,
163
+ "டோ": 157,
164
+ "டௌ": 158,
165
+ "ட்": 159,
166
+ "ண": 160,
167
+ "ணா": 161,
168
+ "ணி": 162,
169
+ "ணீ": 163,
170
+ "ணு": 164,
171
+ "ணூ": 165,
172
+ "ணெ": 166,
173
+ "ணே": 167,
174
+ "ணை": 168,
175
+ "ணொ": 169,
176
+ "ணோ": 170,
177
+ "ணௌ": 171,
178
+ "ண்": 172,
179
+ "த": 173,
180
+ "தா": 174,
181
+ "தி": 175,
182
+ "தீ": 176,
183
+ "து": 177,
184
+ "தூ": 178,
185
+ "தெ": 179,
186
+ "தே": 180,
187
+ "தை": 181,
188
+ "தொ": 182,
189
+ "தோ": 183,
190
+ "தௌ": 184,
191
+ "த்": 185,
192
+ "ந": 186,
193
+ "நா": 187,
194
+ "நி": 188,
195
+ "நீ": 189,
196
+ "நு": 190,
197
+ "நூ": 191,
198
+ "நெ": 192,
199
+ "நே": 193,
200
+ "நை": 194,
201
+ "நொ": 195,
202
+ "நோ": 196,
203
+ "நௌ": 197,
204
+ "ந்": 198,
205
+ "ன": 199,
206
+ "னா": 200,
207
+ "னி": 201,
208
+ "னீ": 202,
209
+ "னு": 203,
210
+ "னூ": 204,
211
+ "னெ": 205,
212
+ "னே": 206,
213
+ "னை": 207,
214
+ "னொ": 208,
215
+ "னோ": 209,
216
+ "னௌ": 210,
217
+ "ன்": 211,
218
+ "ப": 212,
219
+ "பா": 213,
220
+ "பி": 214,
221
+ "பீ": 215,
222
+ "பு": 216,
223
+ "பூ": 217,
224
+ "பெ": 218,
225
+ "பே": 219,
226
+ "பை": 220,
227
+ "பொ": 221,
228
+ "போ": 222,
229
+ "பௌ": 223,
230
+ "ப்": 224,
231
+ "ம": 225,
232
+ "மா": 226,
233
+ "மி": 227,
234
+ "மீ": 228,
235
+ "மு": 229,
236
+ "மூ": 230,
237
+ "மெ": 231,
238
+ "மே": 232,
239
+ "மை": 233,
240
+ "மொ": 234,
241
+ "மோ": 235,
242
+ "மௌ": 236,
243
+ "ம்": 237,
244
+ "ய": 238,
245
+ "யா": 239,
246
+ "யி": 240,
247
+ "யீ": 241,
248
+ "யு": 242,
249
+ "யூ": 243,
250
+ "யெ": 244,
251
+ "யே": 245,
252
+ "யை": 246,
253
+ "யொ": 247,
254
+ "யோ": 248,
255
+ "யௌ": 249,
256
+ "ய்": 250,
257
+ "ர": 251,
258
+ "ரா": 252,
259
+ "ரி": 253,
260
+ "ரீ": 254,
261
+ "ரு": 255,
262
+ "ரூ": 256,
263
+ "ரெ": 257,
264
+ "ரே": 258,
265
+ "ரை": 259,
266
+ "ரொ": 260,
267
+ "ரோ": 261,
268
+ "ரௌ": 262,
269
+ "ர்": 263,
270
+ "ற": 264,
271
+ "றா": 265,
272
+ "றி": 266,
273
+ "றீ": 267,
274
+ "று": 268,
275
+ "றூ": 269,
276
+ "றெ": 270,
277
+ "றே": 271,
278
+ "றை": 272,
279
+ "றொ": 273,
280
+ "றோ": 274,
281
+ "றௌ": 275,
282
+ "ற்": 276,
283
+ "ல": 277,
284
+ "லா": 278,
285
+ "லி": 279,
286
+ "லீ": 280,
287
+ "லு": 281,
288
+ "லூ": 282,
289
+ "லெ": 283,
290
+ "லே": 284,
291
+ "லை": 285,
292
+ "லொ": 286,
293
+ "லோ": 287,
294
+ "லௌ": 288,
295
+ "ல்": 289,
296
+ "ள": 290,
297
+ "ளா": 291,
298
+ "ளி": 292,
299
+ "ளீ": 293,
300
+ "ளு": 294,
301
+ "ளூ": 295,
302
+ "ளெ": 296,
303
+ "ளே": 297,
304
+ "ளை": 298,
305
+ "ளொ": 299,
306
+ "ளோ": 300,
307
+ "ளௌ": 301,
308
+ "ள்": 302,
309
+ "ழ": 303,
310
+ "ழா": 304,
311
+ "ழி": 305,
312
+ "ழீ": 306,
313
+ "ழு": 307,
314
+ "ழூ": 308,
315
+ "ழெ": 309,
316
+ "ழே": 310,
317
+ "ழை": 311,
318
+ "ழொ": 312,
319
+ "ழோ": 313,
320
+ "ழௌ": 314,
321
+ "ழ்": 315,
322
+ "வ": 316,
323
+ "வா": 317,
324
+ "வி": 318,
325
+ "வீ": 319,
326
+ "வு": 320,
327
+ "வூ": 321,
328
+ "வெ": 322,
329
+ "வே": 323,
330
+ "வை": 324,
331
+ "வொ": 325,
332
+ "வோ": 326,
333
+ "வௌ": 327,
334
+ "வ்": 328,
335
+ "ஷ": 329,
336
+ "ஸ": 330,
337
+ "ஸ்ரீ": 331,
338
+ "ஹ": 332,
339
+ "ா": 333,
340
+ "ி": 334,
341
+ "ீ": 335,
342
+ "ு": 336,
343
+ "ூ": 337,
344
+ "ெ": 338,
345
+ "ே": 339,
346
+ "ை": 340,
347
+ "ொ": 341,
348
+ "ோ": 342,
349
+ "ௌ": 343,
350
+ "்": 344,
351
+ "ௗ": 345,
352
+ "௦": 346,
353
+ "௧": 347,
354
+ "௨": 348,
355
+ "௩": 349,
356
+ "௪": 350,
357
+ "௫": 351,
358
+ "௬": 352,
359
+ "௭": 353,
360
+ "௮": 354,
361
+ "௯": 355,
362
+ "௰": 356,
363
+ "௱": 357,
364
+ "௲": 358,
365
+ "௳": 359,
366
+ "௴": 360,
367
+ "௵": 361,
368
+ "௶": 362,
369
+ "௷": 363,
370
+ "௸": 364,
371
+ "௹": 365,
372
+ "௺": 366
373
+ },
374
+ "unk_token": "[UNK]"
375
+ },
376
+ "pre_tokenizer": {
377
+ "type": "Whitespace"
378
+ },
379
+ "normalizer": {
380
+ "type": "NFKC"
381
+ }
382
+ }