| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
import unittest


from transformers.models.cpm.tokenization_cpm import CpmTokenizer
from transformers.testing_utils import custom_tokenizers
|
|
|
|
@custom_tokenizers
class CpmTokenizationTest(unittest.TestCase):
    """Unit tests for the CPM tokenizer (SentencePiece-based, Chinese/English mixed text)."""

    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        # Unconditionally skip pipeline tests for this tokenizer
        # (presumably no pipeline integration exists for CPM — confirm against the test harness).
        return True

    def test_pre_tokenization(self):
        """Round-trip check: tokenize -> convert_tokens_to_ids -> decode on a mixed zh/en sentence."""
        tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")
        text = "Hugging Face大法好,谁用谁知道。"
        # Expected decode output: the original text plus the <unk> token we append below (id 0).
        normalized_text = "Hugging Face大法好,谁用谁知道。<unk>"
        # Expected subword pieces ("▁" marks a word boundary in SentencePiece output).
        bpe_tokens = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split()

        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        # Append the unknown token so the id sequence ends with the <unk> id.
        input_tokens = tokens + [tokenizer.unk_token]

        input_bpe_tokens = [13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8, 9, 440, 108, 440, 121, 90, 8, 12, 0]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

        # Decoding the ids must reproduce the normalized text, including the trailing <unk>.
        reconstructed_text = tokenizer.decode(input_bpe_tokens)
        self.assertEqual(reconstructed_text, normalized_text)
|
|