'''
Split a given dataset into three different datasets: training, validation and
testing.

This is achieved by splitting the given list of sentences into three separate
lists, according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or an
explicit enumeration of indices. The sentences are also tokenised using the
given vocabulary.

Also splits a given list of dictionaries containing information about
each sentence.

An additional parameter, 'extend_with', can be set to extend the given
vocabulary with up to 'extend_with' new tokens, taken from the training
dataset.
'''
from __future__ import print_function, unicode_literals

import json
from os.path import abspath, dirname, join

import example_helper
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
|
# Toy corpus of ten sentences. The last one ends with 'newword', a token
# absent from the pretrained vocabulary, which exercises the 'extend_with'
# option demonstrated at the bottom of this script.
DATASET = ['I am sentence %d' % i for i in range(9)]
DATASET.append('I am sentence 9 newword')
|
|
# Per-sentence metadata, aligned index-for-index with DATASET. The split
# routine partitions these dicts alongside the sentences themselves.
INFO_DICTS = [{'label': 'sentence %d' % i} for i in range(10)]
|
|
# Resolve the vocabulary path relative to this script, so the example works
# no matter which directory it is launched from (the original relative path
# '../model/vocabulary.json' broke unless CWD was the examples folder).
ROOT_PATH = dirname(dirname(abspath(__file__)))
VOCAB_PATH = join(ROOT_PATH, 'model/vocabulary.json')

with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)

# Tokenizer that maps sentences to fixed-length (30-token) index sequences.
st = SentenceTokenizer(vocab, 30)

# 1) Default split: the dataset is divided using the default ratio
#    (train/val/test) with no explicit index assignment.
print(st.split_train_val_test(DATASET, INFO_DICTS))

# 2) Explicit split: hand-picked index lists for train, val and test, plus
#    'extend_with=1' so up to one unseen token from the training sentences
#    ('newword' in sentence 9) is added to the vocabulary.
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
|
|