| from sentence_transformers import SentenceTransformer |
| from tokenizers import Encoding, Tokenizer |
| from torch.nn import EmbeddingBag |
| import torch |
|
|
|
|
def test_tokenizer():
    """Print the tokens produced for a set of multilingual sample sentences.

    Loads the tokenizer definition from ``js/tokenizer.json`` (resolved
    relative to the current working directory), encodes every sample, and
    prints the sample text followed by its token list and an empty line.

    Nothing is asserted here: the output is meant to be inspected by eye.
    """
    tokenizer: Tokenizer = Tokenizer.from_file("js/tokenizer.json")

    # Samples span many scripts (Latin with diacritics, Cyrillic, Greek,
    # Arabic, CJK, Hangul, Indic scripts, Thai, Ethiopic, emoji, and mixes).
    samples = [
        "This is an example of encoding",
        "The quick brown fox jumps over the lazy dog.",
        "Curaçao, naïve fiancé, jalapeño, déjà vu.",
        "Привет, как дела?",
        "Бързата кафява лисица прескача мързеливото куче.",
        "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
        "اللغة العربية جميلة وغنية بالتاريخ.",
        "مرحبا بالعالم!",
        "Simplified: 快速的棕色狐狸跳过懒狗。",
        "Traditional: 快速的棕色狐狸跳過懶狗。",
        "素早い茶色の狐が怠け者の犬を飛び越える。",
        "コンピュータープログラミング",
        "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
        "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
        "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
        "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
        "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
        "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
        "Hello 世界 مرحبا 🌍",
        "123, αβγ, абв, العربية, 中文, हिन्दी.",
    ]

    for text in samples:
        encoded: Encoding = tokenizer.encode(text)
        # One call emits: the text, its tokens, then a blank separator line.
        print(text, encoded.tokens, "", sep="\n")
|
|
|
|
| |
# Load the sentence-transformers model on the CPU.
model = SentenceTransformer(
    "sentence-transformers/static-similarity-mrl-multilingual-v1",
    device="cpu",
)

# The first module of the model exposes its embedding table as `.embedding`.
embedding_bag: EmbeddingBag = model[0].embedding
embeddings = torch.Tensor(embedding_bag.weight)

# Sanity-check the table dimensions before reporting sizes.
print(embeddings.shape)
assert embeddings.shape == torch.Size([105879, 1024])

# Report the size of a (rows x dim) table for each column count, at
# 4 bytes per value (float32) and 2 bytes per value (float16).
for precision, bytes_per_value in (("float32", 4), ("float16", 2)):
    print(precision)
    for dim in (1024, 512, 256):
        size_mib = embeddings.shape[0] * dim * bytes_per_value / 1024 / 1024
        print(f" {dim} dim - {size_mib:,.1f} MiB")
|
|