import re
import sys
from collections import defaultdict

# Locations of the Indic NLP Library checkout and of its resources directory.
# Both are relative paths; the sys.path entry below is therefore resolved
# against the current working directory of the process.
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"

# The library path has to be registered BEFORE the first `indicnlp` import,
# otherwise a vendored (not pip-installed) checkout cannot be found.
sys.path.append(INDIC_NLP_LIB_HOME)

from indicnlp import transliterate
from indicnlp import common

# Point the library at its resources and run its loader before the indicnlp
# submodules further down are imported (same order as before).
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()

from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer

import indicnlp
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

from flores_codes_map_indic import flores_codes
import sentencepiece as spm

# Shared English detokenizer, built once at import time and reused by
# postprocess() for every English output sentence.
en_detok = MosesDetokenizer(lang="en")
|
|
|
|
def _read_hypotheses(infname, input_size):
    """Return the hypothesis text of every ``H-`` line of *infname*, by sentence id.

    The result has exactly ``input_size`` entries; ids that never appear in the
    file keep the empty string.
    """
    hypotheses = [""] * input_size
    with open(infname, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.startswith("H-"):
                continue
            # Expected layout (as this parser assumes): H-<id> TAB <score> TAB <text>
            fields = line.strip().split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "Malformed hypothesis line in {}: {!r}".format(infname, line)
                )
            sid = int(fields[0].split("-")[1])
            # The score is not used, but parsing it rejects malformed lines.
            float(fields[1])
            # An empty hypothesis loses its trailing tab to strip(), leaving
            # only two fields; treat that as an empty sentence, not a crash.
            hyp = fields[2] if len(fields) > 2 else ""
            if sid >= input_size:
                raise IndexError(
                    "Hypothesis id {} in {} is out of range for input_size={}".format(
                        sid, infname, input_size
                    )
                )
            hypotheses[sid] = hyp
    return hypotheses


def postprocess(
    infname: str,
    outfname: str,
    input_size: int,
    lang: str,
    transliterate: bool = False,
    spm_model_path: str = None,
):
    """
    Postprocess the output of a machine translation model in the following order:
        - parse fairseq interactive output (the lines starting with ``H-``)
        - decode the sentence piece tokens of each hypothesis
        - optionally convert the script of Indic output (see ``transliterate``)
        - detokenize and write one sentence per line

    Args:
        infname (str): path to the input file containing the machine translation output.
        outfname (str): path to the output file where the postprocessed output will be written.
        input_size (int): number of sentences in the input file. Sentence ids without
            an ``H-`` line produce an empty output line.
        lang (str): language code of the output language; must be a key of ``flores_codes``.
        transliterate (bool, optional): for non-English output, whether to run each sentence
            through ``UnicodeIndicTransliterator.transliterate(sent, "hi", iso_lang)``, i.e.
            from the ``"hi"`` script code to the output language, before detokenizing
            (default: False). Ignored for English output.
        spm_model_path (str): path of the sentence piece model.

    Raises:
        ValueError: if ``spm_model_path`` is None or a hypothesis line is malformed.
        IndexError: if a hypothesis id is not smaller than ``input_size``.
        KeyError: if ``lang`` is not present in ``flores_codes``.
    """
    if spm_model_path is None:
        raise ValueError("Please provide sentence piece model path for decoding")

    sp = spm.SentencePieceProcessor(model_file=spm_model_path)

    iso_lang = flores_codes[lang]

    hypotheses = _read_hypotheses(infname, input_size)
    sentences = [sp.decode(hyp.split(" ")) for hyp in hypotheses]

    # Select the per-sentence detokenization once, then write through a single
    # output handle instead of duplicating the open() in each branch.
    if iso_lang == "en":

        def detokenize(sent):
            return en_detok.detokenize(sent.split(" "))

    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()

        def detokenize(sent):
            if transliterate:
                sent = xliterator.transliterate(sent, "hi", iso_lang)
            return indic_detokenize.trivial_detokenize(sent, iso_lang)

    with open(outfname, "w", encoding="utf-8") as outfile:
        for sent in sentences:
            outfile.write(detokenize(sent) + "\n")
|
|
|
|
if __name__ == "__main__":
    # Positional CLI arguments:
    #   infname outfname input_size lang transliterate spm_model_path
    if len(sys.argv) < 7:
        sys.exit(
            "Usage: python {} <infname> <outfname> <input_size> <lang> "
            "<transliterate> <spm_model_path>".format(sys.argv[0])
        )

    infname = sys.argv[1]
    outfname = sys.argv[2]
    input_size = int(sys.argv[3])
    lang = sys.argv[4]
    # argv values are strings, and every non-empty string is truthy, so passing
    # the raw value made "false"/"0" enable transliteration. Only explicit
    # negatives disable it; any other value keeps it enabled as before.
    # A separate name is used so the imported `transliterate` module is not rebound.
    transliterate_flag = sys.argv[5].strip().lower() not in {
        "",
        "0",
        "false",
        "f",
        "no",
        "n",
        "off",
        "none",
    }
    spm_model_path = sys.argv[6]

    postprocess(
        infname, outfname, input_size, lang, transliterate_flag, spm_model_path
    )
|
|