"""
Extend the given vocabulary using dataset-specific words.

1. First create a vocabulary for the specific dataset.
2. Find all words that are in the dataset vocabulary but not in our vocabulary.
3. Take the top X (default=1000) of these words and add them to the vocabulary.
4. The combined vocabulary and embedding matrix can then be saved and used.
   (This example only extends the vocabulary in memory and prints the result.)
"""
|
|
| from __future__ import print_function, unicode_literals |
| import example_helper |
| import json |
| from torchmoji.create_vocab import extend_vocab, VocabBuilder |
| from torchmoji.word_generator import WordGenerator |
|
|
# Toy "dataset": 'newword' is listed twice and '#zzzzaaazzz' once.
new_words = ['#zzzzaaazzz', 'newword', 'newword']
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

# Load the base vocabulary that is going to be extended.
# NOTE(review): the path is relative to the current working directory, so the
# script presumably has to be launched from its own folder -- confirm.
with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

# Show the vocabulary size and the dataset word counts before extending.
print(len(vocab))
print(vb.word_counts)
extend_vocab(vocab, vb, max_tokens=1)

# This lookup fails unless extend_vocab put 'newword' into `vocab`.
# Presumably max_tokens=1 admits only the most frequent new word -- confirm
# against extend_vocab.
print(vocab['newword'])
print(len(vocab))
|
|