| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | import argparse |
| |
|
| | import numpy as np |
| |
|
def read_sentences(path):
    """Read a CoNLL-U file and return its sentences as raw text blocks.

    A sentence is every line up to and including the empty line that
    terminates it, so concatenating the returned blocks reproduces the
    file content as read in text mode.

    Args:
        path: Path of the UTF-8 encoded CoNLL-U file to read.

    Returns:
        A list of strings, one per sentence, each ending with its
        terminating empty line.

    Raises:
        ValueError: If the last sentence is not followed by an empty line.
    """
    sentences = []
    pending = []
    with open(path, "r", encoding="utf-8") as source:
        for line in source:
            pending.append(line)
            # A line holding nothing but its line terminator ends the sentence.
            if not line.rstrip("\r\n"):
                sentences.append("".join(pending))
                pending = []
    # Raise explicitly instead of using `assert`: assertions are removed
    # under `python -O`, which would silently drop the trailing sentence.
    if pending:
        raise ValueError("Missing empty line after the last sentence")
    return sentences


def choose_dev_indices(n_sentences, dev_size, seed=42):
    """Pick the indices of the sentences that go to the development set.

    Args:
        n_sentences: Total number of sentences available.
        dev_size: Fraction of the sentences to select, within [0, 1].
        seed: Seed of the random generator; the fixed default keeps the
            split reproducible between runs.

    Returns:
        A set of ``int(n_sentences * dev_size)`` distinct indices drawn
        without replacement from ``range(n_sentences)``.

    Raises:
        ValueError: If ``dev_size`` lies outside [0, 1] (NaN included).
    """
    if not 0.0 <= dev_size <= 1.0:
        raise ValueError("dev_size must be between 0 and 1, got {}".format(dev_size))
    n_dev = int(n_sentences * dev_size)
    # Nothing to draw (empty input or a tiny fraction): skip the sampler,
    # which is not guaranteed to accept an empty population.
    if n_dev == 0:
        return set()
    chosen = np.random.RandomState(seed).choice(n_sentences, n_dev, replace=False)
    return set(chosen.tolist())


def write_split(sentences, dev_indices, train_path, dev_path):
    """Write each sentence to the development or the training file.

    Args:
        sentences: Sentence text blocks, in their original order.
        dev_indices: Indices into ``sentences`` that belong to the dev set.
        train_path: Output path for the training sentences.
        dev_path: Output path for the development sentences.
    """
    with open(train_path, "w", encoding="utf-8") as train, \
            open(dev_path, "w", encoding="utf-8") as dev:
        for i, sentence in enumerate(sentences):
            (dev if i in dev_indices else train).write(sentence)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("source", type=str, help="CoNLL-U file to split")
    parser.add_argument("train", type=str, help="CoNLL-U file to write training data to")
    parser.add_argument("dev", type=str, help="CoNLL-U file to write development data to")
    parser.add_argument("--dev_size", type=float, default=0.1, help="Size of the development data")
    args = parser.parse_args()

    # Reject impossible fractions up front with a usage message rather than
    # letting the sampler fail later with an unrelated error.
    if not 0.0 <= args.dev_size <= 1.0:
        parser.error("--dev_size must be between 0 and 1")

    # The source is read completely before any output file is opened, so a
    # malformed input never truncates existing train/dev files.
    sentences = read_sentences(args.source)
    dev_indices = choose_dev_indices(len(sentences), args.dev_size)
    write_split(sentences, dev_indices, args.train, args.dev)
| |
|