import os
import json
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    """
    Simple character-level tokenizer: every character in the input text is
    mapped to its own token id, plus a small set of special tokens.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs,
    ):
        if vocab is None:
            vocab = {}

        special_tokens = [
            unk_token,
            pad_token,
            bos_token,
            eos_token,
            sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        # The vocab must be in place before super().__init__, which may call
        # get_vocab()/convert_tokens_to_ids() while registering special tokens.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        # Character-level tokenization: split the text into individual characters.
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)

    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        # Encode without special tokens; BOS/EOS are added explicitly below.
        encoded_texts = [self.encode(text, add_special_tokens=False) for text in texts]

        if truncation and max_length is not None:
            encoded_texts = [ids[:max_length] for ids in encoded_texts]
        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            # Pad to the longest sequence in the batch, or to max_length when given.
            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]
        return encoded_texts

    def train(self, texts):
        # Rebuild the vocabulary from scratch, keeping the special tokens first
        # so their ids stay stable across retraining.
        vocab = {}
        special_tokens = [
            self.unk_token,
            self.pad_token,
            self.bos_token,
            self.eos_token,
            self.sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        for text in texts:
            # Remove literal occurrences of special tokens so they are not
            # split into individual characters.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")

            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # Try to resolve vocab.json from the Hub (or its local cache) first.
        load_error = None
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False,
            )
        except Exception as e:
            vocab_file = None
            load_error = e

        if vocab_file:
            return cls.from_json(vocab_file, *inputs, **kwargs)

        # Fall back to a vocab.json sitting directly in a local directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, *inputs, **kwargs)

        # As a last resort, rebuild the vocabulary from a fast-tokenizer file.
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json"),
        ]

        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                vocab = dict(fast_tokenizer.get_vocab())
                return cls.from_vocab(vocab, *inputs, **kwargs)

        raise ValueError(
            f"Could not find vocab.json in {pretrained_model_name_or_path}. "
            f"Error: {load_error}"
        )
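

# --- Usage sketch (illustrative only, not part of the class above) ---
# A minimal, assumed example of how this tokenizer might be exercised:
# build a vocabulary from a couple of sample strings, encode a small batch,
# then save and reload the vocabulary. The sample texts and the
# "char_tokenizer" output directory are placeholders, not values taken from
# the original code.
if __name__ == "__main__":
    tokenizer = CharacterTokenizer()
    tokenizer.train(["hello world", "character level tokenizers are simple"])

    batch = tokenizer.batch_encode(
        ["hello", "tokenizer"], add_special_tokens=True, padding=True
    )
    print(batch)

    # Round-trip: persist the vocabulary, then reload it and decode.
    tokenizer.save_vocabulary("char_tokenizer")
    reloaded = CharacterTokenizer.from_json("char_tokenizer/vocab.json")
    print(reloaded.decode(batch[0], skip_special_tokens=True))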