import os
import json
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    """
    Simple character-level tokenizer: every character in the input text is
    mapped to its own token id, plus a small set of special tokens.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs,
    ):
        if vocab is None:
            vocab = {}

        special_tokens = [
            unk_token,
            pad_token,
            bos_token,
            eos_token,
            sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        # The vocab must be in place before super().__init__, which may call
        # get_vocab()/convert_tokens_to_ids() while registering special tokens.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        # Character-level tokenization: split the text into individual characters.
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)

    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        # Encode without special tokens; BOS/EOS are added explicitly below.
        encoded_texts = [self.encode(text, add_special_tokens=False) for text in texts]

        if truncation and max_length is not None:
            encoded_texts = [ids[:max_length] for ids in encoded_texts]
        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            # Pad to the longest sequence in the batch, or to max_length when given.
            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]
        return encoded_texts

    def train(self, texts):
        # Rebuild the vocabulary from scratch, keeping the special tokens first
        # so their ids stay stable across retraining.
        vocab = {}
        special_tokens = [
            self.unk_token,
            self.pad_token,
            self.bos_token,
            self.eos_token,
            self.sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        for text in texts:
            # Remove literal occurrences of special tokens so they are not
            # split into individual characters.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")

            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # Try to resolve vocab.json from the Hub (or its local cache) first.
        load_error = None
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False,
            )
        except Exception as e:
            vocab_file = None
            load_error = e

        if vocab_file:
            return cls.from_json(vocab_file, *inputs, **kwargs)

        # Fall back to a vocab.json sitting directly in a local directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, *inputs, **kwargs)

        # As a last resort, rebuild the vocabulary from a fast-tokenizer file.
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json"),
        ]

        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                vocab = dict(fast_tokenizer.get_vocab())
                return cls.from_vocab(vocab, *inputs, **kwargs)

        raise ValueError(
            f"Could not find vocab.json in {pretrained_model_name_or_path}. "
            f"Error: {load_error}"
        )
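

# --- Usage sketch (illustrative only, not part of the class above) ---
# A minimal, assumed example of how this tokenizer might be exercised:
# build a vocabulary from a couple of sample strings, encode a small batch,
# then save and reload the vocabulary. The sample texts and the
# "char_tokenizer" output directory are placeholders, not values taken from
# the original code.
if __name__ == "__main__":
    tokenizer = CharacterTokenizer()
    tokenizer.train(["hello world", "character level tokenizers are simple"])

    batch = tokenizer.batch_encode(
        ["hello", "tokenizer"], add_special_tokens=True, padding=True
    )
    print(batch)

    # Round-trip: persist the vocabulary, then reload it and decode.
    tokenizer.save_vocabulary("char_tokenizer")
    reloaded = CharacterTokenizer.from_json("char_tokenizer/vocab.json")
    print(reloaded.decode(batch[0], skip_special_tokens=True))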