class CustomTokenizer:
    """Wrap a pretrained tokenizer and normalise text before tokenizing.

    Before the text is handed to the wrapped tokenizer, every URL, e-mail
    address and number in it is replaced by the placeholder tokens
    ``<url>``, ``<mail>`` and ``<num>``.  Those three tokens are added to
    the wrapped tokenizer's vocabulary at construction time.

    Any attribute not defined on this wrapper is looked up on the wrapped
    tokenizer, so the wrapper can be used where the tokenizer is expected.

    NOTE(review): adding tokens grows the vocabulary; a model paired with
    this tokenizer presumably needs its embedding matrix resized to match
    -- confirm at the call site.
    """

    # The patterns never change, so they are compiled once for the class
    # instead of once per instance.  They stay reachable as
    # ``self.number_pattern`` etc.
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    url_pattern = re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    )
    mail_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')

    def __init__(self, pretrained_tokenizer_path, cache_dir):
        """Load the pretrained tokenizer and register the placeholder tokens.

        Args:
            pretrained_tokenizer_path: Identifier or path passed to
                ``AutoTokenizer.from_pretrained``.
            cache_dir: Cache directory passed to
                ``AutoTokenizer.from_pretrained``.
        """
        # SECURITY: trust_remote_code=True lets the loaded repository run
        # its own Python code on this machine; only use trusted sources.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_tokenizer_path,
            cache_dir=cache_dir,
            trust_remote_code=True,
        )

        # The order of this list is kept as-is so the ids assigned to the
        # new tokens do not change.
        new_tokens = ['<num>', '<url>', '<mail>']
        self.tokenizer.add_tokens(new_tokens)

        self.replacement_symbols = {
            "url": '<url>',
            "num": '<num>',
            "mail": '<mail>',
        }

    def preprocess(self, text):
        """Replace URLs, e-mail addresses and numbers with placeholder tokens.

        Args:
            text: A string, or a list/tuple (possibly nested) of strings.

        Returns:
            The text with substitutions applied.  A list or tuple input
            yields a list or tuple of the same length, processed item by
            item.
        """
        if isinstance(text, (list, tuple)):
            processed = [self.preprocess(item) for item in text]
            return tuple(processed) if isinstance(text, tuple) else processed

        # URLs and e-mails go first: replacing numbers first would rewrite
        # the digits inside them (e.g. "123@mail.com" -> "<num>@mail.com")
        # and the URL / mail patterns would then no longer match.
        text = self.url_pattern.sub(self.replacement_symbols["url"], text)
        text = self.mail_pattern.sub(self.replacement_symbols["mail"], text)
        text = self.number_pattern.sub(self.replacement_symbols["num"], text)
        return text

    def __getattr__(self, attr):
        """Delegate lookups of unknown attributes to the wrapped tokenizer.

        Raises:
            AttributeError: If the wrapped tokenizer is not set yet, or if
                it does not have the attribute either.
        """
        # __getattr__ only runs when normal lookup failed.  If "tokenizer"
        # itself is what is missing (instance made via __new__, or copy /
        # pickle probing the object before its state is restored), reading
        # self.tokenizer below would re-enter this method without end.
        if attr == "tokenizer":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )
        return getattr(self.tokenizer, attr)

    def __call__(self, text, **kwargs):
        """Preprocess ``text`` and tokenize it with the wrapped tokenizer.

        Args:
            text: A string, or a list/tuple of strings for batch input.
            **kwargs: Forwarded unchanged to the wrapped tokenizer.

        Returns:
            Whatever the wrapped tokenizer returns for the preprocessed text.
        """
        preprocessed_text = self.preprocess(text)
        return self.tokenizer(preprocessed_text, **kwargs)