| import re |
| from typing import Iterable, List, Tuple |
| import cn2an |
| from english_utils.abbreviations import expand_abbreviations |
| from english_utils.time_norm import expand_time_english |
| from english_utils.number_norm import normalize_numbers as replace_numbers_en |
|
|
|
|
| def merge_short_sentences_zh(sens): |
| |
| """Avoid short sentences by merging them with the following sentence. |
| |
| Args: |
| List[str]: list of input sentences. |
| |
| Returns: |
| List[str]: list of output sentences. |
| """ |
| sens_out = [] |
| for s in sens: |
| |
| |
| if len(sens_out) > 0 and len(sens_out[-1]) <= 2: |
| sens_out[-1] = sens_out[-1] + " " + s |
| else: |
| sens_out.append(s) |
| try: |
| if len(sens_out[-1]) <= 2: |
| sens_out[-2] = sens_out[-2] + " " + sens_out[-1] |
| sens_out.pop(-1) |
| except: |
| pass |
| return sens_out |
|
|
|
|
| def split_sentences_zh(text, min_len=10): |
| text = re.sub('[。!?;]', '.', text) |
| text = re.sub('[,]', ',', text) |
| |
| text = re.sub('[\n\t ]+', ' ', text) |
| |
| text = re.sub('([,.!?;])', r'\1 $#!', text) |
| |
| |
| sentences = [s.strip() for s in text.split('$#!')] |
| if len(sentences[-1]) == 0: del sentences[-1] |
|
|
| new_sentences = [] |
| new_sent = [] |
| count_len = 0 |
| for ind, sent in enumerate(sentences): |
| new_sent.append(sent) |
| count_len += len(sent) |
| if count_len > min_len or ind == len(sentences) - 1: |
| count_len = 0 |
| new_sentences.append(' '.join(new_sent)) |
| new_sent = [] |
| return merge_short_sentences_zh(new_sentences) |
|
|
|
|
| def intersperse(lst, item): |
| result = [item] * (len(lst) * 2 + 1) |
| result[1::2] = lst |
| return result |
|
|
|
|
| def replace_numbers_zh(text): |
| numbers = re.findall(r"\d+(?:\.?\d+)?", text) |
| for number in numbers: |
| text = text.replace(number, cn2an.an2cn(number), 1) |
| return text |
|
|
|
|
| def replace_punctuation(text): |
| rep_map = { |
| ":": ",", |
| ";": ",", |
| ",": ",", |
| "。": ".", |
| "!": "!", |
| "?": "?", |
| "\n": ".", |
| "·": ",", |
| "、": ",", |
| "...": "…", |
| "$": ".", |
| "“": "'", |
| "”": "'", |
| "‘": "'", |
| "’": "'", |
| "(": "'", |
| ")": "'", |
| "(": "'", |
| ")": "'", |
| "《": "'", |
| "》": "'", |
| "【": "'", |
| "】": "'", |
| "[": "'", |
| "]": "'", |
| "—": "-", |
| "~": "-", |
| "~": "-", |
| "「": "'", |
| "」": "'", |
| } |
|
|
| for k, v in rep_map.items(): |
| text = text.replace(k, v) |
| return text |
|
|
|
|
| class Lexicon: |
| def __init__(self, lexion_filename: str, tokens_filename: str): |
| tokens = dict() |
| with open(tokens_filename, encoding="utf-8") as f: |
| for line in f: |
| s, i = line.split() |
| tokens[s] = int(i) |
|
|
| lexicon = dict() |
| with open(lexion_filename, encoding="utf-8") as f: |
| for line in f: |
| splits = line.split() |
| word_or_phrase = splits[0] |
| phone_tone_list = splits[1:] |
| assert len(phone_tone_list) & 1 == 0, len(phone_tone_list) |
| phone_str = phone_tone_list[: len(phone_tone_list) // 2] |
| phones = [tokens[p] for p in phone_str] |
|
|
| tones = phone_tone_list[len(phone_tone_list) // 2 :] |
| tones = [int(t) for t in tones] |
|
|
| lexicon[word_or_phrase] = (phone_str, phones, tones) |
| lexicon["呣"] = lexicon["母"] |
| lexicon["嗯"] = lexicon["恩"] |
| self.lexicon = lexicon |
|
|
| punctuation = ["!", "?", "…", ",", ".", "'", "-"] |
| for p in punctuation: |
| i = tokens[p] |
| tone = 0 |
| self.lexicon[p] = ([p], [i], [tone]) |
| self.lexicon[" "] = ([" "], [tokens["_"]], [0]) |
|
|
| def g2p_zh_mix_en(self, text: str) -> Tuple[List[int], List[int]]: |
| phone_str = [] |
| phones = [] |
| tones = [] |
|
|
| if text not in self.lexicon: |
| |
| if len(text) > 1: |
| for w in text: |
| |
| s, _, p, t = self.convert(w) |
| if p: |
| phone_str += s |
| phones += p |
| tones += t |
| return phone_str, phones, tones |
|
|
| phone_str, phones, tones = self.lexicon[text] |
| return phone_str, phones, tones |
| |
| |
| def split_zh_en(self, text): |
| if re.search(r'[a-zA-Z]+', text): |
| spliter = '#$&^!@' |
| |
| text = re.sub(r'[a-zA-Z]+', lambda x: f'{spliter}{x.group()}{spliter}', text) |
| texts = text.split(spliter) |
| texts = [t for t in texts if len(t) > 0] |
| return texts |
| else: |
| return [text] |
| |
| |
| def normalize_english(self, text): |
| text = text.lower() |
| text = expand_time_english(text) |
| text = replace_numbers_en(text) |
| text = expand_abbreviations(text) |
| return text |
|
|
| def normalize_chinese(self, text): |
| text = replace_numbers_zh(text) |
| return text |
| |
|
|
| def is_english(self, text): |
| return 1 if re.match(r'[a-zA-Z\s]+', text) else 0 |
|
|
| def convert(self, text: Iterable[str]) -> Tuple[List[int], List[int]]: |
| phone_str = [] |
| yinjie_num = [] |
| phones = [] |
| tones = [] |
|
|
| text = replace_punctuation(text) |
| texts_zh_en = self.split_zh_en(text) |
| en_num = sum([self.is_english(i) for i in texts_zh_en]) |
| if en_num * 2 >= len(texts_zh_en): |
| texts_zh_en = self.split_zh_en(self.normalize_english(text)) |
| else: |
| texts_zh_en = self.split_zh_en(self.normalize_chinese(text)) |
| for text_one_lang in texts_zh_en: |
| if self.is_english(text_one_lang): |
| |
| s, p, t = self.g2p_zh_mix_en(text_one_lang) |
|
|
| phone_str += s |
| yinjie_num.append(len(s)) |
| phones += p |
| tones += t |
| else: |
| |
| for tl in text_one_lang: |
| s, p, t = self.g2p_zh_mix_en(tl) |
|
|
| phone_str += s |
| yinjie_num.append(len(s)) |
| phones += p |
| tones += t |
| |
| return phone_str, yinjie_num, phones, tones |