Upload 97 files

2c8dc05 verified about 2 years ago

8.46 kB

	# import sys, os
	# SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	# sys.path.append(SCRIPT_DIR)

	# combined lexical analyzer and parser

	from Unified_parser.ply.lex import Lexer
	from Unified_parser.ply.yacc import yacc
	from Unified_parser.globals import *
	from Unified_parser.helpers import *
	import sys
	from sys import exit
	from Unified_parser.get_phone_mapped_python import *

	# Token names referenced by the grammar rules in the p_* docstrings below.
	tokens = (
	    'kaki_c',
	    'conjsyll2_c',
	    'fullvowel_b',
	    'kaki_a',
	    'kaki_b',
	    'conjsyll2_b',
	    'conjsyll2_a',
	    'conjsyll1',
	    'nukchan_b',
	    'nukchan_a',
	    'yarule',
	    'fullvowel_a',
	    'vowel',
	)

	# parser part

	def p_sentence(p):
	    '''
	    sentence : words
	    '''
	    # The docstring above is the grammar rule for this action; it is not prose.
	    state = p.parser.g
	    parsed = p[1]

	    if state.flags.parseLevel != 0:
	        # Any pass after the first stores its result as the phonified word.
	        state.words.phonifiedWord = parsed
	        return

	    # First pass: the reduced text becomes the syllabified word, with any
	    # doubled '&&' separators collapsed to a single '&'.
	    state.words.syllabifiedWordOut = parsed
	    if state.words.syllabifiedWordOut.find('&&') >= 0:
	        state.words.syllabifiedWordOut = rec_replace(state.words.syllabifiedWordOut, '&&', '&')

	    state.flags.parseLevel += 1

	def p_words_syltoken(p):
	    '''
	    words : syltoken
	    '''
	    # A lone syllable token forms the word; its text is passed up unchanged.
	    debug_on = p.parser.g.flags.DEBUG
	    syllable = p[1]
	    if debug_on:
	        print(f"Syll:\t{syllable}")
	    p[0] = syllable

	def p_words_wordsandsyltoken(p):
	    '''
	    words : words syltoken
	    '''
	    # Append the newly matched syllable to the word accumulated so far.
	    debug_on = p.parser.g.flags.DEBUG
	    if debug_on:
	        print(f"Syll:\t{p[2]}")
	    so_far, syllable = p[1], p[2]
	    p[0] = so_far + syllable

	def p_syltoken(p):
	    '''
	    syltoken : fullvowel_b
	             | fullvowel_a
	             | conjsyll2_c
	             | conjsyll2_b
	             | conjsyll2_a
	             | conjsyll1
	             | nukchan_b
	             | nukchan_a
	             | yarule
	             | vowel
	    '''
	    # The docstring is the grammar rule. Alternatives must be separated by a
	    # bare '|'; a backslash-escaped pipe is not an alternative separator.
	    # Every alternative is a single token, so its text passes through.
	    p[0] = p[1]

	def p_syltoken1(p):
	    '''
	    syltoken :
	             | kaki_c
	             | kaki_a
	             | kaki_b
	    '''
	    # The docstring is the grammar rule. Alternatives must be separated by a
	    # bare '|'; a backslash-escaped pipe is not an alternative separator.
	    # NOTE(review): the first alternative is empty. If it is ever reduced
	    # there is no p[1] to read below - confirm whether it is intended.
	    if p.parser.g.flags.DEBUG:
	        print(f'kaki : {p[1]}')
	    p[0] = p[1]

	def p_error(p):
	    # Presumably the parser's syntax-error hook; the argument is not inspected.
	    # Reports the failure on stdout and terminates with exit status 1.
	    print('parse error')
	    sys.exit(1)

	# print the help of syntax
	def printHelp():
	    """Print the command-line usage instructions to stdout."""
	    print("UnifiedParser - Usage Instructions")
	    # The entry point requires five arguments, so the usage line names all
	    # five, including the trailing language argument described below.
	    print("Run python3 parser.py wd lsflag wfflag clearflag language")
	    print("wd - word to parse in unicode.")
	    print("lsflag - always 0. we are not using this.")
	    print("wfflag - 0 for Monophone parsing, 1 for syllable parsing, 2 for Akshara Parsing")
	    print("clearflag -0. for normal syntx, 1 for removing the lisp like format of output and to just produce space separated output. 2 for cls labled output")
	    print("language - name of language in quationes")

	def wordparse(wd: str, lsflag: int, wfflag: int, clearflag: int, language_main: str):
	    """Parse a single word and return its parsed representation.

	    Args:
	        wd: Word to parse; leading and trailing spaces are removed first.
	        lsflag: Stored as the language-specific correction flag; must be 0 or 1.
	        wfflag: Stored as the write format; must be 0, 1 or 2.
	        clearflag: Selects the shape of the result.
	            0 - return ``g.answer`` as it stands after ``WritetoFiles``.
	            1 - return the double-quoted fields of ``g.words.outputText``
	                joined by single spaces.
	            2 - walk the pieces of ``g.words.outputText`` split on '"' and
	                concatenate those whose first character has a code point in
	                [65, 128); pieces longer than one character are first passed
	                through ``TextReplacer.apply_replacements_by_phonems``.
	        language_main: Language name. It is lower-cased but not otherwise
	            used by this function.

	    Returns:
	        The resulting string for clearflag 0, 1 or 2 (surrounding whitespace
	        stripped for 1 and 2); 0 when ``SetlanguageFeat`` returns 0 or
	        ``CheckDictionary`` returns non-zero; ``None`` for any other clearflag.

	    Side effects:
	        Prints "Invalid input" and calls ``exit(1)`` when lsflag or wfflag is
	        out of range.
	    """
	    g = GLOBALS()
	    lexer = Lexer()
	    parser = yacc()
	    parser.g = g  # grammar actions reach the shared state through p.parser.g
	    g.flags.DEBUG = False
	    wd = wd.strip(' ')

	    if lsflag not in (0, 1) or wfflag not in (0, 1, 2):
	        print("Invalid input")
	        exit(1)

	    g.flags.LangSpecificCorrectionFlag = lsflag

	    g.flags.writeFormat = wfflag
	    # NOTE(review): unreachable as written - the validation above rejects
	    # wfflag == 4. Confirm whether 4 should be an accepted value.
	    if wfflag == 4:
	        g.flags.writeFormat = 1
	        g.flags.syllTagFlag = 1

	    word = wd
	    if g.flags.DEBUG:
	        print(f'Word : {word}')

	    word = RemoveUnwanted(word)
	    if g.flags.DEBUG:
	        print(f'Cleared Word : {word}')

	    if SetlanguageFeat(g, word) == 0:
	        return 0

	    if CheckDictionary(g, word) != 0:
	        return 0

	    if g.flags.DEBUG:
	        print(f'langId : {g.langId}')

	    word = ConvertToSymbols(g, word)

	    if g.flags.DEBUG:
	        print(f"Symbols code : {g.words.unicodeWord}")
	        print(f"Symbols syllables : {g.words.syllabifiedWord}")

	    # Normalised here, but nothing below reads it.
	    language_main = language_main.lower()

	    parser.parse(g.words.syllabifiedWord, lexer=lexer)
	    if g.flags.DEBUG:
	        print(f"Syllabified Word : {g.words.syllabifiedWordOut}")
	    g.words.syllabifiedWordOut = rec_replace(g.words.syllabifiedWordOut, '&#&', '&') + '&'
	    if g.flags.DEBUG:
	        print(f"Syllabified Word out : {g.words.syllabifiedWordOut}")
	    g.words.syllabifiedWordOut = LangSpecificCorrection(g, g.words.syllabifiedWordOut, g.flags.LangSpecificCorrectionFlag)
	    if g.flags.DEBUG:
	        print(f"Syllabified Word langCorr : {g.words.syllabifiedWordOut}")
	    g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
	    if g.flags.DEBUG:
	        print(f"Syllabified Word memCorr : {g.words.syllabifiedWordOut}")

	    if not g.isSouth:
	        if g.flags.DEBUG:
	            print('NOT SOUTH')

	        # Number of '&' separators in the word.
	        count = g.words.syllabifiedWordOut.count('&')

	        # Choose after how many separators the word is split.
	        splitPosition = 2
	        if GetPhoneType(g, g.words.syllabifiedWordOut, 1) == 1:
	            if count > 2:
	                tpe = GetPhoneType(g, g.words.syllabifiedWordOut, 2)
	                if tpe == 2:
	                    splitPosition = 1
	                elif tpe == 3:
	                    splitPosition = 3
	            else:
	                # NOTE(review): nesting restored from a flattened listing;
	                # confirm this else pairs with `count > 2`.
	                splitPosition = 1

	        # Find the character index at which the separator count first
	        # exceeds splitPosition. If that never happens, `count` keeps the
	        # total number of separators and is still used as the slice index.
	        count = 0
	        for i, ch in enumerate(g.words.syllabifiedWordOut):
	            if ch == '&':
	                count += 1
	            if count > splitPosition:
	                count = i
	                break

	        start = g.words.syllabifiedWordOut[:count]
	        end = g.words.syllabifiedWordOut[count:]
	        if g.flags.DEBUG:
	            print(f"posi {count} {start} {end}")

	        # Only the tail after the split point gets the schwa correction.
	        end = SchwaSpecificCorrection(g, end)
	        if g.flags.DEBUG:
	            print(f"prefinal : {g.words.syllabifiedWordOut}")
	        g.words.syllabifiedWordOut = start + end
	        if g.flags.DEBUG:
	            print(f"prefinal1 : {g.words.syllabifiedWordOut}")
	        g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
	        if g.flags.DEBUG:
	            print(f"final : {g.words.syllabifiedWordOut}")
	        g.words.syllabifiedWordOut = SchwaDoubleConsonent(g.words.syllabifiedWordOut)
	        if g.flags.DEBUG:
	            print(f"final0 : {g.words.syllabifiedWordOut}")

	    g.words.syllabifiedWordOut = GeminateCorrection(g.words.syllabifiedWordOut, 0)
	    g.words.syllabifiedWordOut = MiddleVowel(g, g.words.syllabifiedWordOut)
	    g.words.syllabifiedWordOut = Syllabilfy(g.words.syllabifiedWordOut)

	    SplitSyllables(g, g.words.syllabifiedWordOut)
	    temp_list = g.syllableList
	    if g.flags.DEBUG:
	        print("temp_list_1", temp_list)

	    WritetoFiles(g)
	    if clearflag == 0:
	        return g.answer

	    if clearflag == 1:
	        # Splitting on '"' leaves the quoted fields at the odd indices.
	        fields = g.words.outputText.split('"')
	        # str.strip() returns a new string, so the result must be assigned;
	        # a bare call would leave the surrounding whitespace in place.
	        g.answer = ' '.join(fields[1::2]).strip()
	        return g.answer

	    if clearflag == 2:
	        fields = g.words.outputText.split('"')
	        text_replacer = TextReplacer()
	        pieces = []
	        for field in fields[1:]:
	            # Keep only pieces whose first character is in [65, 128).
	            if not field or not 65 <= ord(field[0]) < 128:
	                continue
	            if len(field) > 1:
	                pieces.append(text_replacer.apply_replacements_by_phonems(field))
	            else:
	                pieces.append(field)
	        g.answer = ''.join(pieces).strip()
	        return g.answer


	def safe_word_parse(wd: str, lsflag: int, wfflag: int, clearflag: int, language: str):
	    """Run ``wordparse`` on one word without letting a failure propagate.

	    Args:
	        wd, lsflag, wfflag, clearflag, language: Forwarded unchanged to
	            ``wordparse``.

	    Returns:
	        A ``(wd, ans)`` tuple, where ``ans`` is the value returned by
	        ``wordparse`` or the string ``'FAILED'`` if the parse raised.
	    """
	    try:
	        ans = wordparse(wd, lsflag, wfflag, clearflag, language)
	    except (Exception, SystemExit):
	        # SystemExit is caught on purpose: the parsing code calls exit(1) on
	        # invalid input and on parse errors. KeyboardInterrupt is deliberately
	        # not caught, so a long batch run can still be interrupted.
	        print(f'failed on {wd}')
	        ans = 'FAILED'
	    return wd, ans


	if __name__ == '__main__':
	    # Exactly five command-line arguments are required (argv[0] is the script).
	    if len(sys.argv) != 6:
	        printHelp()
	        exit(-1)

	    word_arg, ls_arg, wf_arg, clear_arg, lang_arg = sys.argv[1:]
	    ans = wordparse(word_arg, int(ls_arg), int(wf_arg), int(clear_arg), str(lang_arg))
	    print(ans)