| | #include <stdio.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <fcntl.h> |
| | #include <sys/stat.h> |
| | #include <unistd.h> |
| | #include <errno.h> |
| |
|
| |
|
| | #define MAX_VOCAB_SIZE 4096 |
| | #define MAX_WORD_LEN 12 |
| |
|
| |
|
| | #include "tokenizer.h" |
| | #include "utils.h" |
| |
|
| |
|
| |
|
| |
|
/* Return 1 if NUL-terminated strings a and b are byte-for-byte equal,
 * 0 otherwise (equivalent to strcmp(a, b) == 0).
 *
 * Fix: the previous version stopped comparing at the first NUL in EITHER
 * string and returned 1, so a prefix matched a longer option name
 * (e.g. "--vocab" compared equal to "--vocab_path"). We now also require
 * the terminators to line up. */
static int strequ(const char *a, const char *b) {
    int i = 0;
    while (a[i] && a[i] == b[i])
        ++i;
    /* equal only if both strings ended at the same position */
    return a[i] == b[i];
}
| |
|
/* Print a usage summary for all command-line flags to stdout, then exit(0).
 * argv[0] supplies the program name; argc is unused. */
void help(int argc, char *argv[]) {
    (void)argc;  /* unused */
    printf("Usage: %s [arg1] [arg2] ... [argN]\n"
           "Arguments:\n"
           " --help show this help\n"
           " --dataset_path <str> path to dataset\n"
           " --dataset_output_path <str> path to dataset output\n"
           " --vocab_path <str> path to vocabulary\n"
           " --vocab_output_path <str> path to vocabulary output\n"
           " --tokenizer_path <str> path to tokenizer\n"
           " --tokenizer_output_path <str> path to tokenizer output\n"
           "\n",
           argv[0]);
    exit(0);
}
| |
|
| |
|
| |
|
| |
|
| | void tokenizer_print_vocab(tokenizer_t *t) { |
| | printf("vocab size: %d\n", t->vocab_size); |
| |
|
| | for (int i = 0; i < t->vocab_size; ++i) { |
| | char *str = t->vocab[i]; |
| | int len = strlen(str); |
| | printf("id: %d, len: %d, str: %s\n", i, len, str); |
| | } |
| | } |
| |
|
| | void tokenize_whole_file(tokenizer_t *t, char *fname[2]) { |
| | int infd = open(fname[0], O_RDONLY), |
| | outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644); |
| |
|
| | if (infd < 0) ERROR("failed to open %s\n", fname[0]); |
| | if (outfd < 0) ERROR("failed to open %s\n", fname[1]); |
| |
|
| | for (int i = 0; ; ++i) { |
| | if (rdchar(infd) == EOF) break; |
| | else seekback(infd, 1); |
| |
|
| | int id = t->encode_file(t, infd); |
| | if (id == EOF) { (void)rdchar(infd); continue; } |
| | |
| | write(outfd, &id, sizeof(id)); |
| | |
| | if (i % 100000 == 0) { |
| | int ncurrent = lseek(infd, 0, SEEK_CUR), |
| | ntotal = 0; |
| |
|
| | ntotal = lseek(infd, 0, SEEK_END); |
| | (void) lseek(infd, ncurrent, SEEK_SET); |
| |
|
| | printf("iteration %d, encoded %d (%s), proccessed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100); |
| | |
| | if (ntotal - ncurrent < 100) break; |
| | } |
| | } |
| | } |
| | #define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__}) |
| |
|
| |
|
| |
|
| |
|
| | int main(int argc, char *argv[]) { |
| | tokenizer_t *tokenizer = Tokenizer(NULL); |
| |
|
| |
|
| | char *dataset_path = "data/dataset.txt", |
| | *dataset_output_path = "bin/dataset.bin", |
| | *vocab_path = "data/vocab.txt", |
| | *vocab_output_path = NULL, |
| | *tokenizer_path = NULL, |
| | *tokenizer_output_path = "bin/tokenizer.bin"; |
| |
|
| |
|
| | for (int i = 1; i < argc; i += 2) { |
| | if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path); |
| | else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path); |
| | else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path); |
| | else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path); |
| | else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path); |
| | else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path); |
| | else if (strequ(argv[i], "--help")) help(argc, argv); |
| | else ERROR("unknown option \"%s\"\n", argv[i]); |
| | } |
| |
|
| |
|
| | CALLBACK_ON_FILE(vocab_path, |
| | INFO("loading vocabulary from \"%s\"\n", vocab_path); |
| | tokenizer->load_vocab(tokenizer, vocab_path); |
| | tokenizer_print_vocab(tokenizer); |
| | ); |
| |
|
| | CALLBACK_ON_FILE(tokenizer_path, |
| | INFO("loading tokenizer from \"%s\"\n", tokenizer_path); |
| | tokenizer->load_tokenizer(tokenizer, tokenizer_path); |
| | ); |
| |
|
| | CALLBACK_ON_FILE(vocab_output_path, |
| | INFO("exporting vocabulary to \"%s\"\n", vocab_output_path); |
| | tokenizer->save_vocab(tokenizer, vocab_output_path) |
| | ); |
| |
|
| | CALLBACK_ON_FILE(tokenizer_output_path, |
| | INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path); |
| | tokenizer->save_tokenizer(tokenizer, tokenizer_output_path) |
| | ); |
| |
|
| | CALLBACK_ON_FILE(dataset_path, |
| | CALLBACK_ON_FILE(dataset_output_path, |
| | INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path); |
| | tokenize_whole_file(tokenizer, dataset_path, dataset_output_path); |
| | ); |
| | ); |
| |
|
| | return EXIT_SUCCESS; |
| | } |
| |
|