| { |
| "num_threads": 224, |
| "split_by_whitespace": true, |
| "model_type": "unigram", |
| "vocab_size": 250680, |
| "character_coverage": 0.9999, |
| "byte_fallback": true, |
| "split_by_number": true, |
| "split_digits": true, |
| "normalization_rule_name": "nfkc", |
| "max_sentence_length": 4096, |
| "shuffle_input_sentence": true, |
| "input_sentence_size": 0, |
| "train_extremely_large_corpus": true, |
| "allow_whitespace_only_pieces": true, |
| "required_chars": "", |
| "remove_extra_whitespaces": false, |
| "user_defined_symbols": [ |
| "<s>", |
| "</s>", |
| "<pad>", |
| "<eod>", |
| "<placeholder_tok_0>", |
| "<placeholder_tok_1>", |
| "<placeholder_tok_2>", |
| "<placeholder_tok_3>", |
| "<placeholder_tok_4>", |
| "<placeholder_tok_5>", |
| "<placeholder_tok_6>", |
| "<placeholder_tok_7>", |
| "<placeholder_tok_8>", |
| "<placeholder_tok_9>", |
| "<placeholder_tok_10>", |
| "<placeholder_tok_11>", |
| "<placeholder_tok_12>", |
| "<placeholder_tok_13>", |
| "<placeholder_tok_14>", |
| "<placeholder_tok_15>", |
| "<placeholder_tok_16>", |
| "<placeholder_tok_17>", |
| "<placeholder_tok_18>", |
| "<placeholder_tok_19>", |
| "<placeholder_tok_20>", |
| "<placeholder_tok_21>", |
| "<placeholder_tok_22>", |
| "<placeholder_tok_23>", |
| "<placeholder_tok_24>", |
| "<placeholder_tok_25>", |
| "<placeholder_tok_26>", |
| "<placeholder_tok_27>", |
| "<placeholder_tok_28>", |
| "<placeholder_tok_29>", |
| "<placeholder_tok_30>", |
| "<placeholder_tok_31>", |
| "<placeholder_tok_32>", |
| "<placeholder_tok_33>", |
| "<placeholder_tok_34>", |
| "<placeholder_tok_35>", |
| "<placeholder_tok_36>", |
| "<placeholder_tok_37>", |
| "<placeholder_tok_38>", |
| "<placeholder_tok_39>", |
| "<placeholder_tok_40>", |
| "<placeholder_tok_41>", |
| "<placeholder_tok_42>", |
| "<placeholder_tok_43>", |
| "<placeholder_tok_44>", |
| "<placeholder_tok_45>", |
| "<placeholder_tok_46>", |
| "<placeholder_tok_47>", |
| "<placeholder_tok_48>", |
| "<placeholder_tok_49>", |
| "<placeholder_tok_50>", |
| "<placeholder_tok_51>", |
| "<placeholder_tok_52>", |
| "<placeholder_tok_53>", |
| "<placeholder_tok_54>", |
| "<placeholder_tok_55>", |
| "<placeholder_tok_56>", |
| "<placeholder_tok_57>", |
| "<placeholder_tok_58>", |
| "<placeholder_tok_59>", |
| "<placeholder_tok_60>", |
| "<placeholder_tok_61>", |
| "<placeholder_tok_62>", |
| "<placeholder_tok_63>", |
| "<placeholder_tok_64>", |
| "<placeholder_tok_65>", |
| "<placeholder_tok_66>", |
| "<placeholder_tok_67>", |
| "<placeholder_tok_68>", |
| "<placeholder_tok_69>", |
| "<placeholder_tok_70>", |
| "<placeholder_tok_71>", |
| "<placeholder_tok_72>", |
| "<placeholder_tok_73>", |
| "<placeholder_tok_74>", |
| "<placeholder_tok_75>", |
| "<placeholder_tok_76>", |
| "<placeholder_tok_77>", |
| "<placeholder_tok_78>", |
| "<placeholder_tok_79>", |
| "<placeholder_tok_80>", |
| "<placeholder_tok_81>", |
| "<placeholder_tok_82>", |
| "<placeholder_tok_83>", |
| "<placeholder_tok_84>", |
| "<placeholder_tok_85>", |
| "<placeholder_tok_86>", |
| "<placeholder_tok_87>", |
| "<placeholder_tok_88>", |
| "<placeholder_tok_89>", |
| "<placeholder_tok_90>", |
| "<placeholder_tok_91>", |
| "<placeholder_tok_92>", |
| "<placeholder_tok_93>", |
| "<placeholder_tok_94>", |
| "<placeholder_tok_95>", |
| "<placeholder_tok_96>", |
| "<placeholder_tok_97>", |
| "<placeholder_tok_98>", |
| "<placeholder_tok_99>", |
| "<placeholder_tok_100>", |
| "<placeholder_tok_101>", |
| "<placeholder_tok_102>", |
| "<placeholder_tok_103>", |
| "<placeholder_tok_104>", |
| "<placeholder_tok_105>", |
| "<placeholder_tok_106>", |
| "<placeholder_tok_107>", |
| "<placeholder_tok_108>", |
| "<placeholder_tok_109>", |
| "<placeholder_tok_110>", |
| "<placeholder_tok_111>", |
| "<placeholder_tok_112>", |
| "<placeholder_tok_113>", |
| "<placeholder_tok_114>", |
| "<placeholder_tok_115>", |
| "<placeholder_tok_116>", |
| "<placeholder_tok_117>", |
| "<placeholder_tok_118>", |
| "<placeholder_tok_119>", |
| "<placeholder_tok_120>", |
| "<placeholder_tok_121>", |
| "<placeholder_tok_122>", |
| "<placeholder_tok_123>", |
| "<placeholder_tok_124>", |
| "<placeholder_tok_125>", |
| "<placeholder_tok_126>", |
| "<placeholder_tok_127>", |
| "<placeholder_tok_128>", |
| "<placeholder_tok_129>", |
| "<placeholder_tok_130>", |
| "<placeholder_tok_131>", |
| "<placeholder_tok_132>", |
| "<placeholder_tok_133>", |
| "<placeholder_tok_134>", |
| "<placeholder_tok_135>", |
| "<placeholder_tok_136>", |
| "<placeholder_tok_137>", |
| "<placeholder_tok_138>", |
| "<placeholder_tok_139>", |
| "<placeholder_tok_140>", |
| "<placeholder_tok_141>", |
| "<placeholder_tok_142>", |
| "<placeholder_tok_143>", |
| "<placeholder_tok_144>", |
| "<placeholder_tok_145>", |
| "<placeholder_tok_146>", |
| "<placeholder_tok_147>", |
| "<placeholder_tok_148>", |
| "<placeholder_tok_149>", |
| "<placeholder_tok_150>", |
| "<placeholder_tok_151>", |
| "<placeholder_tok_152>", |
| "<placeholder_tok_153>", |
| "<placeholder_tok_154>", |
| "<placeholder_tok_155>", |
| "<placeholder_tok_156>", |
| "<placeholder_tok_157>", |
| "<placeholder_tok_158>", |
| "<placeholder_tok_159>", |
| "<placeholder_tok_160>", |
| "<placeholder_tok_161>", |
| "<placeholder_tok_162>", |
| "<placeholder_tok_163>", |
| "<placeholder_tok_164>", |
| "<placeholder_tok_165>", |
| "<placeholder_tok_166>", |
| "<placeholder_tok_167>", |
| "<placeholder_tok_168>", |
| "<placeholder_tok_169>", |
| "<placeholder_tok_170>", |
| "<placeholder_tok_171>", |
| "<placeholder_tok_172>", |
| "<placeholder_tok_173>", |
| "<placeholder_tok_174>", |
| "<placeholder_tok_175>", |
| "<placeholder_tok_176>", |
| "<placeholder_tok_177>", |
| "<placeholder_tok_178>", |
| "<placeholder_tok_179>", |
| "<placeholder_tok_180>", |
| "<placeholder_tok_181>", |
| "<placeholder_tok_182>", |
| "<placeholder_tok_183>", |
| "<placeholder_tok_184>", |
| "<placeholder_tok_185>", |
| "<placeholder_tok_186>", |
| "<placeholder_tok_187>", |
| "<placeholder_tok_188>", |
| "<placeholder_tok_189>", |
| "<placeholder_tok_190>", |
| "<placeholder_tok_191>", |
| "<placeholder_tok_192>", |
| "<placeholder_tok_193>", |
| "<placeholder_tok_194>", |
| "<placeholder_tok_195>", |
| "<placeholder_tok_196>", |
| "<placeholder_tok_197>", |
| "<placeholder_tok_198>", |
| "<placeholder_tok_199>", |
| "<placeholder_tok_200>", |
| "<placeholder_tok_201>", |
| "<placeholder_tok_202>", |
| "<placeholder_tok_203>", |
| "<placeholder_tok_204>", |
| "<placeholder_tok_205>", |
| "<placeholder_tok_206>", |
| "<placeholder_tok_207>", |
| "<placeholder_tok_208>", |
| "<placeholder_tok_209>", |
| "<placeholder_tok_210>", |
| "<placeholder_tok_211>", |
| "<placeholder_tok_212>", |
| "<placeholder_tok_213>", |
| "<placeholder_tok_214>", |
| "<placeholder_tok_215>", |
| "<placeholder_tok_216>", |
| "<placeholder_tok_217>", |
| "<placeholder_tok_218>", |
| "<placeholder_tok_219>", |
| "<placeholder_tok_220>", |
| "<placeholder_tok_221>", |
| "<placeholder_tok_222>", |
| "<placeholder_tok_223>", |
| "<placeholder_tok_224>", |
| "<placeholder_tok_225>", |
| "<placeholder_tok_226>", |
| "<placeholder_tok_227>", |
| "<placeholder_tok_228>", |
| "<placeholder_tok_229>", |
| "<placeholder_tok_230>", |
| "<placeholder_tok_231>", |
| "<placeholder_tok_232>", |
| "<placeholder_tok_233>", |
| "<placeholder_tok_234>", |
| "<placeholder_tok_235>", |
| "<placeholder_tok_236>", |
| "<placeholder_tok_237>", |
| "<placeholder_tok_238>", |
| "<placeholder_tok_239>", |
| "<placeholder_tok_240>", |
| "<placeholder_tok_241>", |
| "<placeholder_tok_242>", |
| "<placeholder_tok_243>", |
| "<placeholder_tok_244>", |
| "<placeholder_tok_245>", |
| "<placeholder_tok_246>", |
| "<placeholder_tok_247>", |
| "<placeholder_tok_248>", |
| "<placeholder_tok_249>", |
| "<placeholder_tok_250>", |
| "<placeholder_tok_251>", |
| "<placeholder_tok_252>", |
| "<placeholder_tok_253>", |
| "<placeholder_tok_254>", |
| "<placeholder_tok_255>" |
| ], |
| "datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/", |
| "save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24", |
| "text_key": "text", |
| "cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache", |
| "library": "sentencepiece", |
| "auto_map": { |
| "AutoTokenizer": [ |
| "gptx_tokenizer.SPTokenizer", |
| null |
| ] |
| }, |
| "tokenizer_class": "SPTokenizer" |
| } |