| | #include <stdio.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <fcntl.h> |
| | #include <sys/stat.h> |
| | #include <unistd.h> |
| | #include <errno.h> |
| |
|
| |
|
| | #define MAX_VOCAB_SIZE 4096 |
| | #define MAX_WORD_LEN 12 |
| |
|
| |
|
| | #include "tokenizer.h" |
| | #include "utils.h" |
| |
|
| |
|
| |
|
| |
|
/* Return 1 if NUL-terminated strings a and b are byte-for-byte equal,
 * 0 otherwise (equivalent to strcmp(a, b) == 0).
 *
 * Fix: the previous version stopped comparing at the first NUL in EITHER
 * string and returned 1, so a prefix matched a longer option name
 * (e.g. "--vocab" compared equal to "--vocab_path"). We now also require
 * the terminators to line up. */
static int strequ(const char *a, const char *b) {
    int i = 0;
    while (a[i] && a[i] == b[i])
        ++i;
    /* equal only if both strings ended at the same position */
    return a[i] == b[i];
}
| |
|
/* Print a usage summary for all command-line flags to stdout, then exit(0).
 * argv[0] supplies the program name; argc is unused. */
void help(int argc, char *argv[]) {
    (void)argc;  /* unused */
    printf("Usage: %s [arg1] [arg2] ... [argN]\n"
           "Arguments:\n"
           " --help show this help\n"
           " --dataset_path <str> path to dataset\n"
           " --dataset_output_path <str> path to dataset output\n"
           " --vocab_path <str> path to vocabulary\n"
           " --vocab_output_path <str> path to vocabulary output\n"
           " --tokenizer_path <str> path to tokenizer\n"
           " --tokenizer_output_path <str> path to tokenizer output\n"
           "\n",
           argv[0]);
    exit(0);
}
| |
|
| |
|
| |
|
| |
|
| | void tokenizer_print_vocab(tokenizer_t *t) { |
| | printf("vocab size: %d\n", t->vocab_size); |
| |
|
| | for (int i = 0; i < t->vocab_size; ++i) { |
| | char *str = t->vocab[i]; |
| | int len = strlen(str); |
| | printf("id: %d, len: %d, str: %s\n", i, len, str); |
| | } |
| | } |
| |
|
| | void tokenize_whole_file(tokenizer_t *t, char *fname[2]) { |
| | int infd = open(fname[0], O_RDONLY), |
| | outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644); |
| |
|
| | if (infd < 0) ERROR("failed to open %s\n", fname[0]); |
| | if (outfd < 0) ERROR("failed to open %s\n", fname[1]); |
| |
|
| | for (int i = 0; ; ++i) { |
| | if (rdchar(infd) == EOF) break; |
| | else seekback(infd, 1); |
| |
|
| | int id = t->encode_file(t, infd); |
| | if (id == EOF) { (void)rdchar(infd); continue; } |
| | |
| | write(outfd, &id, sizeof(id)); |
| | |
| | if (i % 100000 == 0) { |
| | int ncurrent = lseek(infd, 0, SEEK_CUR), |
| | ntotal = 0; |
| |
|
| | ntotal = lseek(infd, 0, SEEK_END); |
| | (void) lseek(infd, ncurrent, SEEK_SET); |
| |
|
| | printf("iteration %d, encoded %d (%s), proccessed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100); |
| | |
| | if (ntotal - ncurrent < 100) break; |
| | } |
| | } |
| | } |
| | #define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__}) |
| |
|
| |
|
| |
|
| |
|
| | int main(int argc, char *argv[]) { |
| | tokenizer_t *tokenizer = Tokenizer(NULL); |
| |
|
| |
|
| | char *dataset_path = "data/dataset.txt", |
| | *dataset_output_path = "bin/dataset.bin", |
| | *vocab_path = "data/vocab.txt", |
| | *vocab_output_path = NULL, |
| | *tokenizer_path = NULL, |
| | *tokenizer_output_path = "bin/tokenizer.bin"; |
| |
|
| |
|
| | for (int i = 1; i < argc; i += 2) { |
| | if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path); |
| | else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path); |
| | else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path); |
| | else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path); |
| | else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path); |
| | else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path); |
| | else if (strequ(argv[i], "--help")) help(argc, argv); |
| | else ERROR("unknown option \"%s\"\n", argv[i]); |
| | } |
| |
|
| |
|
| | CALLBACK_ON_FILE(vocab_path, |
| | INFO("loading vocabulary from \"%s\"\n", vocab_path); |
| | tokenizer->load_vocab(tokenizer, vocab_path); |
| | tokenizer_print_vocab(tokenizer); |
| | ); |
| |
|
| | CALLBACK_ON_FILE(tokenizer_path, |
| | INFO("loading tokenizer from \"%s\"\n", tokenizer_path); |
| | tokenizer->load_tokenizer(tokenizer, tokenizer_path); |
| | ); |
| |
|
| | CALLBACK_ON_FILE(vocab_output_path, |
| | INFO("exporting vocabulary to \"%s\"\n", vocab_output_path); |
| | tokenizer->save_vocab(tokenizer, vocab_output_path) |
| | ); |
| |
|
| | CALLBACK_ON_FILE(tokenizer_output_path, |
| | INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path); |
| | tokenizer->save_tokenizer(tokenizer, tokenizer_output_path) |
| | ); |
| |
|
| | CALLBACK_ON_FILE(dataset_path, |
| | CALLBACK_ON_FILE(dataset_output_path, |
| | INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path); |
| | tokenize_whole_file(tokenizer, dataset_path, dataset_output_path); |
| | ); |
| | ); |
| |
|
| | return EXIT_SUCCESS; |
| | } |
| |
|