from romtoslp import rom_slp
from json import *
import pprint
from utilities import *


class word_new:
    """One word analysis: the surface name plus its candidate lemmas and forms."""

    def __init__(self, names):
        self.lemmas = []
        self.names = names
        self.urls = []
        self.forms = []


class chunks:
    """One chunk of a sentence; chunk_words maps position -> list of word_new."""

    def __init__(self, chunk_name):
        self.chunk_name = chunk_name
        self.chunk_words = {}


class sentences:
    """A sentence: its id, its raw text, and its list of chunk objects."""

    def __init__(self, sent_id, sentence):
        self.sent_id = sent_id
        self.sentence = sentence
        self.chunk = []
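
# A minimal sketch of how these containers nest (values are illustrative
# only; real objects are produced by the analyzer that feeds this module):
#
#     s = sentences(1, 'rAmo vanaM gacCati')
#     ch = chunks('rAmo')
#     w = word_new('rAmo')
#     w.lemmas = ['rAma']
#     ch.chunk_words[0] = [w]
#     s.chunk.append(ch)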


class SentenceError(Exception):
    """Raised when a sentence object contains a malformed entry."""

    def __init__(self, message):
        super(SentenceError, self).__init__(message)
|
|
def SeeSentence(sentenceObj):
    """Pretty-print every word analysis attached to a sentence, chunk by chunk."""
    print('SKT ANALYZE')
    print('-' * 15)
    print(sentenceObj.sentence)
    for chunk in sentenceObj.chunk:
        print("Analyzing ", rom_slp(chunk.chunk_name))
        for pos in chunk.chunk_words.keys():
            for word_sense in chunk.chunk_words[pos]:
                word_sense = fix_w_new(word_sense)
                print(pos, ": ", rom_slp(word_sense.names), word_sense.lemmas, word_sense.forms)
    print()
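
# Calling SeeSentence(sentenceObj) prints the raw sentence followed by one
# line per analysis, roughly:  <pos> :  <name> [lemmas] [forms]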
|
|
def getWord(sentenceObj, cid, pos, kii):
    """Return the kii-th analysis at position pos of chunk cid as a dict."""
    ch = sentenceObj.chunk[cid]
    word = ch.chunk_words[pos][kii]
    return {'lemmas': word.lemmas, 'forms': word.forms, 'names': word.names}
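
# Usage sketch (assumes a populated `sentenceObj`): pull the first analysis
# of the word at position 0 of chunk 2.
#
#     entry = getWord(sentenceObj, cid=2, pos=0, kii=0)
#     print(entry['names'], entry['lemmas'], entry['forms'])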
|
|
from wordTypeCheckFunction import *
import pickle

import numpy as np  # used by the forced-query fallback in SentencePreprocess
|
|
| """ |
| SentencePreprocess: |
| ------------------- |
| Read a sentence obj and create + return the following objects |
| |
| -> chunkDict: chunk_id -> position -> index in lemmaList (nested dictionary) |
| -> lemmaList: list of possible words as a result of word segmentation |
| -> revMap2Chunk: Map word in wordList to (cid, position) in chunkDict |
| -> qu: Possible query nodes |
| """ |
| v2t = pickle.load(open('verbs_vs_cngs_matrix_countonly.p', 'rb'), encoding=u'utf8') |
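
# Shape sketch of the main outputs (all indices and values illustrative):
#
#     chunkDict    = {0: {0: [0], 2: [1, 2]}}         # cid -> pos -> tuplesMain ids
#     tuplesMain   = [[(0, 'rAmo', 'rAma', 31)], ...]  # groups of (tid, name, lemma, cng)
#     revMap2Chunk = [(0, 0, 0), ...]                  # tid -> (cid, pos, tuplesMain id)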


def wtc_recursive(form, c):
    """Resolve a (possibly nested) configuration to a CNG via wordTypeCheck.

    When c is a list only its first entry is inspected, mirroring the
    original loop that returned on its first iteration.
    """
    if isinstance(c, list):
        if len(c) == 0:
            return None
        return wtc_recursive(form, c[0])
    return wordTypeCheck(form, c)
|
|
def CanBeQuery(chunk):
    """A chunk can act as a query iff it carries exactly one non-empty lemma."""
    allLemmas = []
    for pos, words in chunk.chunk_words.items():
        for word in words:
            for lemma in word.lemmas:
                if lemma != '':
                    allLemmas.append(lemma)
    return len(allLemmas) == 1
|
|
| def Get_QCs(tuplesMain, chunkDict): |
| |
| qc_pairs = {} |
| nodeList = [t for ts in tuplesMain for t in ts] |
| |
| for ni in range(len(nodeList)): |
| qc_pairs[ni] = set(range(len(nodeList))) - set([ni]) |
|
|
| for cid in chunkDict.keys(): |
| |
| for pos1 in chunkDict[cid].keys(): |
| for pos2 in chunkDict[cid].keys(): |
| if pos1 <= pos2: |
| nList1 = [] |
| for ti1 in chunkDict[cid][pos1]: |
| for tup1 in tuplesMain[ti1]: |
| nList1.append(tup1[0]) |
| nList2 = [] |
| for ti2 in chunkDict[cid][pos2]: |
| for tup2 in tuplesMain[ti2]: |
| nList2.append(tup2[0]) |
| nList1 = set(nList1) |
| nList2 = set(nList2) |
| for n1 in nList1: |
| qc_pairs[n1] = qc_pairs[n1] - nList1 |
|
|
| for n2 in nList2: |
| qc_pairs[n2] = qc_pairs[n2] - nList2 |
|
|
| if pos1 < pos2: |
| for n1 in nList1: |
| for n2 in nList2: |
| if not CanCoExist_sandhi(pos1, pos2, nodeList[n1][1], nodeList[n2][1]): |
| qc_pairs[n1] = qc_pairs[n1] - set([n2]) |
| qc_pairs[n2] = qc_pairs[n2] - set([n1]) |
| |
| return qc_pairs |
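
# Shape sketch (hypothetical four-node sentence, all sandhi checks passing):
# nodes 0 and 1 compete for one position, nodes 2 and 3 for another, so each
# node keeps only the far side of the chunk:
#
#     qc_pairs = Get_QCs(tuplesMain, chunkDict)
#     # {0: {2, 3}, 1: {2, 3}, 2: {0, 1}, 3: {0, 1}}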
|
|
'''
===================
SentencePreprocess
===================
forceQuery: when True, the longest available word is made a query if no
other query candidate is found.
'''


def SentencePreprocess(sentenceObj, forceQuery=False):
    """
    Considering word names only.
    ***{Word forms or cngs can also be used.}
    """
    def getCNGs(formsDict):
        # formsDict is either a bare CNG code (int/str) or a mapping of
        # form -> list of configurations.
        if isinstance(formsDict, (int, str)):
            return [int(formsDict)]
        l = []
        for form, configs in formsDict.items():
            if form == 'verbform':
                continue
            for c in configs:
                l.append(wtc_recursive(form, c))
        return list(set(l))

    chunkDict = {}
    lemmaList = []
    wordList = []
    cngList = []
    revMap2Chunk = []
    qu = []
    tuplesMain = []

    cid = -1
    tidExclusive = 0
|
|
    # Pass 1: collect a (tid, name, lemma, cng) tuple for every distinct
    # (lemma, cng) reading of every word sense.
    for chunk in sentenceObj.chunk:
        cid = cid + 1
        chunkDict[cid] = {}
        for pos in chunk.chunk_words.keys():
            tupleSet = {}
            chunkDict[cid][pos] = []
            for word_sense in chunk.chunk_words[pos]:
                nama = rom_slp(word_sense.names)
                if nama == '':
                    raise SentenceError('Empty Name Detected')
                if len(word_sense.lemmas) > 0 and len(word_sense.forms) > 0:
                    tuples = []
                    for lemmaI in range(len(word_sense.lemmas)):
                        lemma = word_sense.lemmas[lemmaI]
                        if lemma == '':
                            continue
                        tempCNGs = getCNGs(word_sense.forms[lemmaI])
                        for cng in tempCNGs:
                            newT_Key = (lemma, cng)
                            newT = (tidExclusive, nama, lemma, cng)
                            if newT_Key not in tupleSet:
                                tupleSet[newT_Key] = 1
                                tuples.append(newT)
                                lemmaList.append(lemma)
                                wordList.append(nama)
                                cngList.append(cng)
                                revMap2Chunk.append((cid, pos, len(tuplesMain)))
                                tidExclusive += 1
                    if len(tuples) > 0:
                        k = len(tuplesMain)
                        chunkDict[cid][pos].append(k)
                        tuplesMain.append(tuples)
|
|
    # Pass 2: a node is a query candidate if its position holds no rival
    # analysis and it is sandhi-compatible with every other node of its chunk.
    for cid in chunkDict.keys():
        tuples = []
        for pos in chunkDict[cid].keys():
            for tupId in chunkDict[cid][pos]:
                for tup in tuplesMain[tupId]:
                    tuples.append((pos, tup[0], tup[1]))
        for u in range(len(tuples)):
            tup1 = tuples[u]
            quFlag = True
            for v in range(len(tuples)):
                if u == v:
                    continue
                tup2 = tuples[v]
                if tup1[0] < tup2[0]:
                    if not CanCoExist_sandhi(tup1[0], tup2[0], tup1[2], tup2[2]):
                        quFlag = False
                        break
                elif tup1[0] > tup2[0]:
                    if not CanCoExist_sandhi(tup2[0], tup1[0], tup2[2], tup1[2]):
                        quFlag = False
                        break
                else:
                    # A second analysis at the same position is a conflict.
                    quFlag = False
                    break
            if quFlag:
                qu.append(tup1[1])
|
|
    # Indices of lemmas that appear in the verb matrix.
    verbs = []
    i = -1
    for w in lemmaList:
        i += 1
        if w in v2t:
            verbs.append(i)

    qc_pairs = Get_QCs(tuplesMain, chunkDict)

    if forceQuery and len(qu) == 0 and len(lemmaList) > 0:
        # Forced fallback: pick the longest word name, breaking ties in
        # favour of the node with the fewest compatible partners.
        lens = np.array([len(t[1]) for ts in tuplesMain for t in ts])
        round1 = np.where(lens == np.max(lens))[0]
        hits = np.array([len(qc_pairs[r]) for r in round1])
        finalist = round1[np.where(hits == np.min(hits))[0][0]]
        qu.append(finalist)

    return (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain, qc_pairs)
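
# End-to-end usage sketch (hedged: assumes `sentenceObj` was deserialized
# from the analyzer's output, e.g. via pickle):
#
#     (chunkDict, lemmaList, wordList, revMap2Chunk,
#      qu, cngList, verbs, tuplesMain, qc_pairs) = SentencePreprocess(
#         sentenceObj, forceQuery=True)
#     SeeSentence(sentenceObj)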