iAmKavin
/

Bigram-Language-Model

Model card Files Files and versions

Bigram-Language-Model / ml-dev /train.py

Kavin Arasu

First Model Version

68694b3 about 3 years ago

history blame contribute delete

1.05 kB

	import torch
	import pickle

	# Character which marks both the start and the end of a word
	TOKEN = '.'

	# Read the names into a list, one entry per line of the file. The context
	# manager closes the file handle as soon as its contents have been read.
	with open('data/names.txt', 'r') as names_file:
	    words = names_file.read().splitlines()

	# Build the vocabulary: every distinct character found in the names plus
	# the boundary token, in sorted order so the indices are reproducible
	vocab = sorted(set(''.join(words)) | {TOKEN})

	# Bigram table holding one count per (first character, second character) pair
	n = len(vocab)
	N = torch.zeros((n, n), dtype=torch.int32)

	# Mappings from characters to integer indices and back again
	char_to_int = {char: i for i, char in enumerate(vocab)}
	int_to_char = {i: char for char, i in char_to_int.items()}

	# Populate the bigram table (N) with counts
	for word in words:
	    # Surround the word with the boundary token so that the transitions
	    # "start -> first character" and "last character -> end" are counted too
	    chars = [TOKEN] + list(word) + [TOKEN]
	    # Pair every character with its successor
	    for ch1, ch2 in zip(chars, chars[1:]):
	        ix1 = char_to_int[ch1]
	        ix2 = char_to_int[ch2]
	        # Row = first character of the bigram, column = second character
	        N[ix1, ix2] += 1

	# Convert the integer counts to floats, then divide every row in place by
	# its own total so that the entries of a row represent probabilities
	P = N.float()
	row_totals = P.sum(dim=1, keepdim=True)
	P.div_(row_totals)

	# Serialise the probability table and both lookup dictionaries as a single
	# list with pickle.dump(); the file is closed when the with-block exits.
	# open() does not create directories, so 'model/' must already exist.
	with open('model/bigrams.pkl', 'wb') as file:
	    pickle.dump([P, char_to_int, int_to_char], file)