| ''' |
| TTS Preprocessing |
| Developed by Arun Kumar A(CS20S013) - November 2022 |
| Code Changes by Utkarsh - 2023 |
| ''' |
| import os |
| import re |
| import json |
| import pandas as pd |
| import string |
| from collections import defaultdict |
| import time |
| import subprocess |
| import shutil |
| from multiprocessing import Process |
| import traceback |
|
|
| |
| from num_to_words import num_to_word |
| from g2p_en import G2p |
|
|
def add_to_dictionary(dict_to_add, dict_file):
    """Append word/pronunciation pairs to a space-delimited dictionary file.

    Each entry is written as ``"<word> <pronunciation>\\n"``.

    If ``dict_file`` does not exist it is created directly. Otherwise the
    entries are appended to a hidden temporary copy (``.<name>.temp`` in the
    same directory); the copy replaces the original only when pandas can
    parse both files and the copy has more rows than the original.

    Args:
        dict_to_add: Mapping of word -> pronunciation to append.
        dict_file: Path of the dictionary file to create or extend.
    """
    append_string = "".join(
        f"{key} {value}\n" for key, value in dict_to_add.items()
    )

    if not os.path.isfile(dict_file):
        # utf-8 is what pd.read_csv assumes by default when the file is read back.
        with open(dict_file, "a", encoding="utf-8") as f:
            f.write(append_string)
        print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
        return

    temp_dict_file = os.path.join(
        os.path.dirname(dict_file),
        "." + os.path.basename(dict_file) + ".temp",
    )
    try:
        shutil.copy(dict_file, temp_dict_file)

        # A dictionary whose last line lacks a newline would otherwise get its
        # final entry fused with the first appended one.
        with open(temp_dict_file, "rb") as f:
            content = f.read()
        needs_newline = bool(content) and not content.endswith(b"\n")

        with open(temp_dict_file, "a", encoding="utf-8") as f:
            if needs_newline:
                f.write("\n")
            f.write(append_string)

        try:
            df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
            df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
            if len(df_temp) > len(df_orig):
                # os.replace overwrites an existing target on every platform.
                os.replace(temp_dict_file, dict_file)
                print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
        except Exception:
            # Best effort: a dictionary that cannot be validated is left untouched.
            print(traceback.format_exc())
    finally:
        # Never leave the temporary copy behind (it is gone already after a
        # successful replace).
        if os.path.exists(temp_dict_file):
            os.remove(temp_dict_file)
|
|
|
|
class TextCleaner:
    """Regex-based text cleaner that marks phrase breaks with ``#``.

    ``cleaning_rules`` maps a regular expression to its replacement; the
    rules are applied in insertion order, so earlier rules shape the input
    of later ones.
    """

    def __init__(self):
        self.cleaning_rules = {
            " +": " ",                # collapse runs of spaces
            "^ +": "",                # drop leading spaces
            " +$": "",                # drop trailing spaces
            "#": "",                  # remove pre-existing phrase markers
            "[.,;।!](\r\n)*": "# ",   # sentence punctuation -> phrase marker
            "[.,;।!](\n)*": "# ",
            "(\r\n)+": "# ",          # line breaks -> phrase marker
            "(\n)+": "# ",
            "(\r)+": "# ",
            # Raw string: the original non-raw literal contained the invalid
            # escape "\." (a SyntaxWarning on Python 3.12+). The pattern text
            # itself is unchanged.
            r"""[?;:)(!|&’‘,।\."]""": "",
            "[/']": "",
            "[-–]": " ",
        }

    def clean(self, text):
        """Return ``text`` after applying every cleaning rule in order."""
        for pattern, replacement in self.cleaning_rules.items():
            text = re.sub(pattern, replacement, text)
        return text

    def clean_list(self, text):
        """Strip and clean every string in ``text``; return a new list."""
        return [self.clean(line.strip()) for line in text]
|
|
|
|
class Phonifier:
    """Map phrases to phone strings using per-language lookup dictionaries.

    ``phone_dictionary`` arguments are dicts of the form
    ``{language: {word: phones}}``. Words missing from the dictionary are
    passed to a language-specific grapheme-to-phoneme step (an external Tamil
    parser script, ``g2p_en`` for English, the unified Indic parser
    otherwise); the results are merged into ``phone_dictionary`` and
    appended to the on-disk dictionary in a background process.
    """

    # CMU vowel phones -> common label set. Every vowel also occurs with the
    # stress suffixes 0/1/2, which map to the same label.
    _CMU_VOWEL_MAP = {
        "AA": "aa", "AE": "axx", "AH": "a", "AO": "ax", "AW": "ou",
        "AY": "ei", "EH": "ee", "ER": "a r", "EY": "ee", "IH": "i",
        "IY": "ii", "OW": "o", "OY": "ei", "UH": "u", "UW": "uu",
    }

    # Full CMU -> common label set table (same entries as the original
    # hand-written literal: 15 vowels x 4 stress forms + "AX" + 24 consonants).
    _CMU_2_CLS_MAP = {
        "AX": "a",
        "B": "b", "CH": "c", "D": "dx", "DH": "d", "F": "f", "G": "g",
        "HH": "h", "JH": "j", "K": "k", "L": "l", "M": "m", "N": "n",
        "NG": "ng", "P": "p", "R": "r", "S": "s", "SH": "sh", "T": "tx",
        "TH": "t", "V": "w", "W": "w", "Y": "y", "Z": "z", "ZH": "sh",
        **{
            base + stress: label
            for base, label in _CMU_VOWEL_MAP.items()
            for stress in ("", "0", "1", "2")
        },
    }

    # Multi-letter common-label-set labels -> single output characters.
    _CLS_2_CHR_MAP = {
        "aa": "A", "ii": "I", "uu": "U", "ee": "E", "oo": "O", "nn": "N",
        "ae": "ऍ", "ag": "ऽ", "au": "औ", "axx": "अ", "ax": "ऑ",
        "bh": "B", "ch": "C", "dh": "ध", "dx": "ड", "dxh": "ढ",
        "dxhq": "T", "dxq": "D", "ei": "ऐ", "ai": "ऐ", "eu": "உ",
        "gh": "घ", "gq": "G", "hq": "H", "jh": "J", "kh": "ख",
        "khq": "K", "kq": "क", "ln": "ൾ", "lw": "ൽ", "lx": "ള",
        "mq": "M", "nd": "न", "ng": "ङ", "nj": "ञ", "nk": "Y",
        "nw": "ൺ", "nx": "ण", "ou": "औ", "ph": "P", "rq": "R",
        "rqw": "ॠ", "rw": "ർ", "rx": "र", "sh": "श", "sx": "ष",
        "th": "थ", "tx": "ट", "txh": "ठ", "wv": "W", "zh": "Z",
    }

    def __init__(self, dict_location=None):
        """Load the English G2P model, the phone maps and the OOV char map.

        Args:
            dict_location: Directory holding one dictionary file per language
                (file name == language name). Defaults to ``"phone_dict"``.

        Reads ``multilingualcharmap.json`` from the current working directory.
        """
        if dict_location is None:
            dict_location = "phone_dict"
        self.dict_location = dict_location

        self.g2p = G2p()
        print('Loading G2P model... Done!')

        # Per-instance copies so callers may customise them without affecting
        # other instances.
        self.cmu_2_cls_map = dict(self._CMU_2_CLS_MAP)
        self.cls_2_chr_map = dict(self._CLS_2_CHR_MAP)

        oov_map_json_file = 'multilingualcharmap.json'
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(oov_map_json_file, 'r', encoding='utf-8') as oov_file:
            self.oov_map = json.load(oov_file)

    def load_lang_dict(self, language, phone_dictionary):
        """Load the ``language`` and ``english`` dictionaries from disk.

        Each file is read independently, so a missing or unreadable language
        file no longer prevents the English dictionary from loading. Failures
        are printed, not raised.

        Returns:
            ``phone_dictionary`` (updated in place).
        """
        print("language", language)
        # dict.fromkeys de-duplicates while keeping order (language first).
        for dict_name in dict.fromkeys((language, 'english')):
            dict_file_path = os.path.join(self.dict_location, dict_name)
            if dict_name == language:
                print("dict_file_path", dict_file_path)
            try:
                df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
                phone_dictionary[dict_name] = df.set_index(0).to_dict('dict')[1]
            except Exception:
                print(traceback.format_exc())
        return phone_dictionary

    def __is_float(self, word):
        """Return True if ``word`` is ``<digits>.<digits>``."""
        parts = word.split('.')
        if len(parts) != 2:
            return False
        return parts[0].isdecimal() and parts[1].isdecimal()

    def en_g2p(self, word):
        """Return the phone string for an English ``word``.

        Each phone produced by ``self.g2p`` is mapped through
        ``cmu_2_cls_map`` and then, when possible, ``cls_2_chr_map``; tokens
        unknown to the first map are kept unchanged. All spaces are removed
        from the joined result.
        """
        mapped = []
        for phn in self.g2p(word):
            label = self.cmu_2_cls_map.get(phn)
            if label is None:
                mapped.append(phn)
            else:
                mapped.append(self.cls_2_chr_map.get(label, label))
        return "".join(mapped).strip().replace(" ", "")

    def __post_phonify(self, text, language, gender):
        """Apply the per-voice character replacements from ``oov_map``."""
        language_gender_id = language + '_' + gender
        if language_gender_id not in self.oov_map:
            return text
        char_map = self.oov_map[language_gender_id]
        return "".join(char_map.get(char, char) for char in text)

    def __is_english_word(self, word):
        """Return True for a non-empty, pure-ASCII ``word``.

        The empty string is treated as "not English" (the original
        ``max(word)`` raised ValueError on it).
        """
        return bool(word) and word.isascii()

    def __find_non_dict_words(self, words, language, phone_dictionary):
        """Return the sorted list of ``words`` that need a G2P pass.

        Empty strings are ignored. When the language has a dictionary, ASCII
        words are skipped unless the language itself is English (they are
        handled by the English path at lookup time). When the language has no
        dictionary at all, every word is returned.
        """
        candidates = sorted(word for word in words if word)
        if language not in phone_dictionary:
            return candidates
        known = phone_dictionary[language]
        return [
            word for word in candidates
            if word not in known
            and (language == "english" or not self.__is_english_word(word))
        ]

    def __extend_dictionary(self, non_dict_words, language, phone_dictionary):
        """Run G2P for ``non_dict_words`` and merge the result.

        Intermediate files are written under ``tmp/`` (relative to the
        working directory) with a timestamp suffix. The new entries are merged
        into ``phone_dictionary[language]`` and handed to
        ``add_to_dictionary`` in a separate process.

        Returns:
            The dict of new entries, or ``None`` if the G2P output could not
            be loaded.
        """
        os.makedirs("tmp", exist_ok=True)
        timestamp = str(time.time())
        non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
        out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
        with open(non_dict_words_file, "w", encoding="utf-8") as f:
            f.write("\n".join(non_dict_words))

        if language == 'tamil':
            current_directory = os.getcwd()
            tamil_parser_cmd = f"{current_directory}/ssn_parser_new/tamil_parser.py"
            subprocess.run(["python", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser_new"])
        elif language == 'english':
            data_str = "\n".join(
                f"{word}\t{self.en_g2p(word)}" for word in non_dict_words
            )
            print(f"data_str: {data_str}")
            with open(out_dict_file, "w", encoding="utf-8") as f:
                f.write(data_str)
        else:
            # Imported lazily: only needed for languages that use this parser.
            from get_phone_mapped_python import TextReplacer
            from indic_unified_parser.uparser import wordparse

            text_replacer = TextReplacer()
            parsed_words = [wordparse(word, 0, 0, 1) for word in non_dict_words]
            replaced_words = [
                text_replacer.apply_replacements(parsed) for parsed in parsed_words
            ]
            with open(out_dict_file, 'w', encoding='utf-8') as file:
                for original_word, formatted_word in zip(non_dict_words, replaced_words):
                    line = f"{original_word}\t{formatted_word}\n"
                    file.write(line)
                    print(line, end='')

        try:
            df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
            new_dict = df.dropna().set_index(0).to_dict('dict')[1]
            phone_dictionary.setdefault(language, {}).update(new_dict)
            # Persist in the background so the caller is not blocked on disk I/O.
            p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
            p.start()
        except Exception:
            print(f"Error: While loading {out_dict_file}")
            traceback.print_exc()
            return None
        return new_dict

    def __phonify_phrase(self, phrase, language, phone_dictionary):
        """Return the space-joined phone strings for one phrase.

        ASCII words use the English dictionary, falling back to ``en_g2p``.
        Other words use the language dictionary and are dropped when missing.
        A dictionary that is absent is treated as empty instead of raising
        KeyError.
        """
        english_dict = phone_dictionary.get("english", {})
        language_dict = phone_dictionary.get(language, {})
        phones = []
        for word in phrase.split(" "):
            if self.__is_english_word(word):
                if word in english_dict:
                    phones.append(str(english_dict[word]))
                else:
                    phones.append(str(self.en_g2p(word)))
            elif word in language_dict:
                phones.append(str(language_dict[word]))
        return " ".join(phones)

    def __phonify(self, text, language, gender, phone_dictionary):
        """Phonify a flat list of phrases; returns a list of phone strings."""
        words = set(" ".join(text).split(" "))
        non_dict_words = self.__find_non_dict_words(words, language, phone_dictionary)
        print(f"word not in dict: {non_dict_words}")
        if non_dict_words:
            self.__extend_dictionary(non_dict_words, language, phone_dictionary)
        return [
            self.__phonify_phrase(phrase, language, phone_dictionary)
            for phrase in text
        ]

    def __merge_lists(self, lists):
        """Join every word of every inner list with single spaces."""
        return " ".join(word for inner in lists for word in inner).strip()

    def __phonify_list(self, text, language, gender, phone_dictionary):
        """Phonify a list of phrase lists, preserving the nesting."""
        words = set(self.__merge_lists(text).split(" "))
        non_dict_words = self.__find_non_dict_words(words, language, phone_dictionary)
        if non_dict_words:
            print(len(non_dict_words))
            print(non_dict_words)
            new_dict = self.__extend_dictionary(non_dict_words, language, phone_dictionary)
            if new_dict is not None:
                print(new_dict)
        return [
            [self.__phonify_phrase(phrase, language, phone_dictionary) for phrase in line]
            for line in text
        ]

    def __resolve_dictionary(self, language, phone_dictionary):
        """Return ``phone_dictionary``, loading it from disk when ``None``."""
        if phone_dictionary is None:
            phone_dictionary = self.load_lang_dict(language, {})
        return phone_dictionary

    def phonify(self, text, language, gender, phone_dictionary=None):
        """Phonify a phrase or a list of phrases.

        Args:
            text: A single phrase (``str``) or a list of phrases.
            language: Language name; also the dictionary key / file name.
            gender: Accepted for interface compatibility; not used here.
            phone_dictionary: ``{language: {word: phones}}``; updated in
                place. When omitted, it is loaded via ``load_lang_dict``.

        Returns:
            A phone string for ``str`` input, else a list of phone strings.
        """
        phone_dictionary = self.__resolve_dictionary(language, phone_dictionary)
        if not isinstance(text, list):
            # The original forgot to forward phone_dictionary on this path.
            return self.__phonify([text], language, gender, phone_dictionary)[0]
        return self.__phonify(text, language, gender, phone_dictionary)

    def phonify_list(self, text, language, gender, phone_dictionary=None):
        """Phonify a list of phrase lists.

        Prints an error and returns ``None`` when ``text`` is not a list.
        ``phone_dictionary`` is loaded via ``load_lang_dict`` when omitted.
        """
        if not isinstance(text, list):
            print("Error!! Expected to have a list as input.")
            return None
        phone_dictionary = self.__resolve_dictionary(language, phone_dictionary)
        return self.__phonify_list(text, language, gender, phone_dictionary)
|
|
|
|
class TextNormalizer:
    """Number verbalisation and whitespace/phrase-marker tidy-up for TTS text."""

    def __init__(self, char_map_location=None, phonifier=None):
        """Set up cleaning rules, language codes and the English G2P model.

        Args:
            char_map_location: Accepted for backward compatibility; the
                visible code never reads it.
            phonifier: Optional ``Phonifier``. When omitted, one is created
                lazily on first access of ``self.phonifier`` instead of at
                class-definition time (the original default argument
                instantiated a ``Phonifier`` while the module was imported).
        """
        self._phonifier = phonifier

        # Applied in order by the post-cleaning helpers.
        self.cleaning_rules = {
            " +": " ",
            "^ +": "",
            " +$": "",
            "#$": "",
            "# +$": "",
        }

        # Language name -> language code passed to num_to_word.
        self.keydict = {
            "english": "en",
            "hindi": "hi",
            "gujarati": "gu",
            "marathi": "mr",
            "bengali": "bn",
            "telugu": "te",
            "tamil": "ta",
            "kannada": "kn",
            "odia": "or",
            "punjabi": "pa",
        }

        self.g2p = G2p()
        print('Loading G2P model... Done!')

    @property
    def phonifier(self):
        """The associated ``Phonifier``; created on first access if not given."""
        if self._phonifier is None:
            self._phonifier = Phonifier()
        return self._phonifier

    @phonifier.setter
    def phonifier(self, value):
        self._phonifier = value

    def __post_cleaning(self, text):
        """Apply ``cleaning_rules`` to one string, in order."""
        for pattern, replacement in self.cleaning_rules.items():
            text = re.sub(pattern, replacement, text)
        return text

    def __post_cleaning_list(self, text):
        """Apply ``cleaning_rules`` to each string; return a new list."""
        return [self.__post_cleaning(line) for line in text]

    def __check_char_type(self, str_c):
        """Classify a character.

        Returns one of ``"number"``, ``"punctuation"``, ``"whitespace"``,
        ``"ascii"`` (ASCII letter) or ``"non-ascii"`` (everything else).
        """
        if str_c.isnumeric():
            return "number"
        if str_c in string.punctuation:
            return "punctuation"
        if str_c in string.whitespace:
            return "whitespace"
        if str_c.isalpha() and str_c.isascii():
            return "ascii"
        return "non-ascii"

    def insert_space(self, text):
        '''
        Insert a space wherever the character class changes (e.g. between
        digits and letters, or ASCII and non-ASCII text), except in front of
        punctuation and whitespace. Runs of spaces are collapsed afterwards.
        '''
        # NOTE: the original also tested the *next* character with
        # `!= "punctuation" or != "whitespace"`, which is always true, so the
        # look-ahead never affected the result. It is dropped here and the
        # behaviour is unchanged.
        pieces = []
        prev_char_type = None
        for c in text:
            char_type = self.__check_char_type(c)
            if (prev_char_type is not None
                    and char_type != prev_char_type
                    and char_type not in ("punctuation", "whitespace")):
                pieces.append(" ")
            pieces.append(c)
            prev_char_type = char_type
        return re.sub(r' +', ' ', "".join(pieces))

    def insert_space_list(self, text):
        '''
        Apply ``insert_space`` to every string of a list; return a new list.
        '''
        return [self.insert_space(line) for line in text]

    def _number_words(self, number, language):
        """Return ``num_to_word`` output for ``number`` in ``language``."""
        return num_to_word(number, self.keydict[language])

    def __spell_integer(self, token, language, strip_commas=False):
        """Verbalise an all-digit ``token``.

        Leading zeros are spoken one by one (``"007"`` becomes zero zero
        seven) instead of being left behind as raw digits.
        """
        value = int(token)
        leading_zeros = len(token) - len(str(value))
        words = [self._number_words(0, language) for _ in range(leading_zeros)]
        words.append(self._number_words(value, language))
        spoken = " ".join(words)
        return spoken.replace(",", "") if strip_commas else spoken

    def __replace_integers(self, text, language):
        """Replace every digit run in ``text`` with its spoken form."""
        # Single pass with a callback: each number is replaced exactly where
        # it was matched, so one number can never clobber part of another.
        return re.sub(
            r'\d+',
            lambda match: ' ' + self.__spell_integer(match.group(0), language) + ' ',
            text,
        )

    def num2text(self, text, language):
        """Verbalise the integers in ``text``, then post-clean it.

        For languages missing from ``keydict`` a notice is printed and the
        text is only post-cleaned.
        """
        if language in self.keydict:
            return self.__post_cleaning(self.__replace_integers(text, language))
        print(f"No num-to-char for the given language {language}.")
        return self.__post_cleaning(text)

    def num2text_list(self, text, language):
        """List variant of ``num2text``; returns a new list of strings."""
        if language in self.keydict:
            converted = [self.__replace_integers(line, language) for line in text]
            return self.__post_cleaning_list(converted)
        print(f"No num-to-char for the given language {language}.")
        return self.__post_cleaning_list(text)

    def numberToTextConverter(self, text, language):
        """Verbalise integers and decimals in ``text``, then post-clean it.

        For languages in ``keydict``: integers are spoken as a whole (commas
        removed from the spoken form); decimals are spoken as
        ``<integer part> <separator> <digit> <digit> ...`` where the
        separator is language-specific for Hindi and Tamil and ``point``
        otherwise.

        For any other language only the decimal point and the digits that
        follow it are spelled out in English.
        """
        if language in self.keydict:
            if language == 'hindi':
                separator = 'दशमलव'
            elif language == 'tamil':
                separator = 'புள்ளி'
            else:
                separator = 'point'

            def spell(match):
                token = match.group(0)
                if '.' not in token:
                    return ' ' + self.__spell_integer(token, language, strip_commas=True) + ' '
                integer_part, decimal_part = token.split('.')
                data1 = self._number_words(int(integer_part), language).replace(",", "")
                # Decimal digits are passed as single-character strings, as
                # in the original implementation.
                data2 = ' '.join(self._number_words(d, language) for d in decimal_part)
                return f' {data1} {separator} {data2} '

            # One pass over the text. The original substituted number by
            # number with unescaped patterns, so e.g. "5.31 and 31" was
            # corrupted (the integer 31 was replaced inside 5.31 first) and
            # "." acted as a regex wildcard.
            text = re.sub(r'\d+\.\d+|\d+', spell, text)
            return self.__post_cleaning(text)

        words = {
            '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
            '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
        }

        # Speak a dot that sits between two digits as "point" ...
        text = re.sub(r'(?<=\d)\.(?=\d)', ' point ', text)

        # ... and spell the digits that follow it one by one.
        for match in re.findall(r'point (\d+)', text):
            replacement = ' '.join(words[digit] for digit in match)
            text = text.replace(f'point {match}', f'point {replacement}', 1)

        return self.__post_cleaning(text)

    def normalize(self, text, language):
        """Post-clean ``text``; ``language`` is currently unused."""
        return self.__post_cleaning(text)

    def normalize_list(self, text, language):
        """Post-clean each string of ``text``; ``language`` is unused."""
        return self.__post_cleaning_list(text)
|
|
|
|
class TextPhrasifier:
    """Split ``#``-delimited text into phrases."""

    @classmethod
    def phrasify(cls, text):
        """Return the non-empty, stripped segments of ``text`` split on ``#``."""
        trimmed = (segment.strip() for segment in text.split("#"))
        return [segment for segment in trimmed if segment != ""]
|
|
class TextPhrasifier_List:
    """List variant of the phrasifier: one phrase list per input line."""

    @classmethod
    def phrasify(cls, text):
        """Split every line of ``text`` on ``#``.

        Returns a list with one entry per input line; each entry is the list
        of that line's non-empty, stripped segments.
        """
        def split_line(line):
            stripped = map(str.strip, line.split("#"))
            return [segment for segment in stripped if segment != ""]

        return [split_line(line) for line in text]
|
|
class DurAlignTextProcessor:
    """Final formatting of phonified phrases.

    The default rules remove all spaces, prefix each phrase with ``$`` and
    append ``.``; the English rules do the same without the ``$`` prefix.
    """

    def __init__(self):
        self.cleaning_rules = {
            " +": "",
            "^": "$",
            "$": ".",
        }
        self.cleaning_rules_English = {
            " +": "",
            "$": ".",
        }

    @staticmethod
    def _rewrite_in_place(rules, phrases):
        """Apply ``rules`` in order to every entry of ``phrases``, in place."""
        for position, phrase in enumerate(phrases):
            for pattern, replacement in rules.items():
                phrase = re.sub(pattern, replacement, phrase)
            phrases[position] = phrase
        return phrases

    def textProcesor(self, text):
        """Format a list of phrases with the default rules.

        The list is modified in place and also returned.
        """
        return self._rewrite_in_place(self.cleaning_rules, text)

    def textProcesorForEnglish(self, text):
        """Format a list of phrases with the English rules (in place)."""
        return self._rewrite_in_place(self.cleaning_rules_English, text)

    def textProcesor_list(self, text):
        """Format a list of phrase lists with the default rules.

        Each inner list is modified in place; a new outer list holding the
        same inner list objects is returned.
        """
        return [self._rewrite_in_place(self.cleaning_rules, line) for line in text]
|
|
|
|
class TTSDurAlignPreprocessor:
    """Pipeline: numbers -> clean -> normalize -> phrasify -> phonify -> format."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature: default
        argument values are evaluated once when the class is defined, which
        constructed every stage (including the G2P models) at import time.
        Construct the preprocessor once and reuse it.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary):
        """Run the pipeline on one string.

        Args:
            text: Raw input text.
            language: Language name used by every stage.
            gender: Forwarded to the phonifier.
            phone_dictionary: ``{language: {word: phones}}``; the language's
                dictionary is loaded via the phonifier when missing.

        Returns:
            ``(phonified_text, phrasified_text)``: the formatted phone
            strings and the phrases they were produced from.
        """
        print(text)
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        print("cleaned text", text)

        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)

        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)
        print(phone_dictionary.keys())

        phonified_text = self.phonifier.phonify(phrasified_text, language, gender, phone_dictionary)
        print("phonetext", phonified_text)
        phonified_text = self.post_processor.textProcesor(phonified_text)
        print(phonified_text)
        return phonified_text, phrasified_text
|
|
class TTSDurAlignPreprocessor_VTT:
    """List-input variant of the duration-aligned preprocessing pipeline."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature so that nothing
        heavy is constructed while the module is being imported.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary=None):
        """Run the pipeline on a list of strings.

        Args:
            text: List of raw input lines.
            language: Language name used by every stage.
            gender: Forwarded to the phonifier.
            phone_dictionary: Optional ``{language: {word: phones}}``. The
                original call omitted this argument although
                ``Phonifier.phonify_list`` requires it; it is now created
                and loaded on demand.

        Returns:
            ``(phonified_text, phrasified_text)``, both lists of lists.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)

        if phone_dictionary is None:
            phone_dictionary = {}
        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender, phone_dictionary)
        phonified_text = self.post_processor.textProcesor_list(phonified_text)
        return phonified_text, phrasified_text
|
|
|
|
class CharTextPreprocessor:
    """Character-level pipeline: numbers -> clean -> normalize -> phrasify.

    No phonification is performed; the phrases themselves are returned in
    place of phone strings.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature so that nothing
        is constructed while the module is being imported.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer

    def preprocess(self, text, language, gender=None, phone_dictionary=None):
        """Run the pipeline on one string.

        ``gender`` and ``phone_dictionary`` are accepted for signature
        parity with the phone-based preprocessors and are not used.

        Returns:
            ``(phrases, phrases)``: the same phrase list twice.
        """
        text = text.strip()
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)
        phonified_text = phrasified_text
        return phonified_text, phrasified_text
|
|
class CharTextPreprocessor_VTT:
    """List-input variant of the character-level preprocessing pipeline."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature so that nothing
        is constructed while the module is being imported.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer

    def preprocess(self, text, language, gender=None):
        """Run the pipeline on a list of strings.

        ``gender`` is accepted for signature parity and is not used.

        Returns:
            ``(phrases, phrases)``: the same list of phrase lists twice.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)
        phonified_text = phrasified_text
        return phonified_text, phrasified_text
|
|
|
|
class TTSPreprocessor:
    """Pipeline: numbers -> clean -> normalize -> phrasify -> phonify -> format.

    Uses the post-processor's English formatting rules
    (``textProcesorForEnglish``) for the final step.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 text_phrasefier=None,
                 post_processor=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature: default
        argument values are evaluated once when the class is defined, which
        constructed every stage (including the G2P models) at import time.
        Construct the preprocessor once and reuse it.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.text_phrasefier = TextPhrasifier() if text_phrasefier is None else text_phrasefier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary):
        """Run the pipeline on one string.

        Args:
            text: Raw input text (stripped before processing).
            language: Language name used by every stage.
            gender: Forwarded to the phonifier.
            phone_dictionary: ``{language: {word: phones}}``; the language's
                dictionary is loaded via the phonifier when missing.

        Returns:
            ``(phonified_text, phrasified_text)``.
        """
        text = text.strip()
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)

        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phonified_text = self.phonifier.phonify(phrasified_text, language, gender, phone_dictionary)
        print(phonified_text)
        phonified_text = self.post_processor.textProcesorForEnglish(phonified_text)
        print(phonified_text)
        return phonified_text, phrasified_text
|
|
class TTSPreprocessor_VTT:
    """List-input pipeline: clean -> numbers -> normalize -> phrasify -> phonify."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 text_phrasefier=None):
        """Store the pipeline stages, creating defaults for omitted ones.

        Defaults are built here rather than in the signature so that nothing
        heavy is constructed while the module is being imported.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.text_phrasefier = TextPhrasifier_List() if text_phrasefier is None else text_phrasefier

    def preprocess(self, text, language, gender, phone_dictionary=None):
        """Run the pipeline on a list of strings.

        Args:
            text: List of raw input lines.
            language: Language name used by every stage.
            gender: Forwarded to the phonifier.
            phone_dictionary: Optional ``{language: {word: phones}}``. The
                original call omitted this argument although
                ``Phonifier.phonify_list`` requires it; it is now created
                and loaded on demand.

        Returns:
            ``(phonified_text, phrasified_text)``, both lists of lists.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)

        if phone_dictionary is None:
            phone_dictionary = {}
        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender, phone_dictionary)
        return phonified_text, phrasified_text
|
|