| | import html |
| | import re |
| |
|
| | import emojiswitch |
| | import ftfy |
| |
|
| | from modules import models |
| | from modules.utils.detect_lang import guess_lang |
| | from modules.utils.HomophonesReplacer import HomophonesReplacer |
| | from modules.utils.html import remove_html_tags as _remove_html_tags |
| | from modules.utils.markdown import markdown_to_text |
| | from modules.utils.zh_normalization.text_normlization import TextNormalizer |
| |
|
| | |
| | |
# Global switch: when True, replace_unk_tokens becomes a no-op and the
# tokenizer vocabulary check is skipped entirely.
DISABLE_UNK_TOKEN_CHECK = False
| |
|
| |
|
# Ordered hook lists: pre-normalize hooks run on raw input text,
# post-normalize hooks run on each normalized sentence.
post_normalize_pipeline = []
pre_normalize_pipeline = []


def _register(pipeline):
    """Build a decorator that appends the decorated function to *pipeline*."""

    def decorator(func):
        pipeline.append(func)
        return func

    return decorator


def post_normalize():
    """Decorator factory: register a function in the post-normalize pipeline."""
    return _register(post_normalize_pipeline)


def pre_normalize():
    """Decorator factory: register a function in the pre-normalize pipeline."""
    return _register(pre_normalize_pipeline)
| |
|
| |
|
def apply_pre_normalize(text):
    """Run *text* through every registered pre-normalize hook, in order."""
    result = text
    for step in pre_normalize_pipeline:
        result = step(result)
    return result
| |
|
| |
|
def apply_post_normalize(text):
    """Run *text* through every registered post-normalize hook, in order."""
    result = text
    for step in post_normalize_pipeline:
        result = step(result)
    return result
| |
|
| |
|
def is_markdown(text):
    """Heuristically detect whether *text* contains Markdown syntax."""
    markdown_patterns = (
        r"(^|\s)#[^#]",       # heading
        r"\*\*.*?\*\*",       # bold
        r"\*.*?\*",           # emphasis
        r"!\[.*?\]\(.*?\)",   # image
        r"\[.*?\]\(.*?\)",    # link
        r"`[^`]+`",           # inline code
        r"```[\s\S]*?```",    # fenced code block
        r"(^|\s)\* ",         # bullet list
        r"(^|\s)\d+\. ",      # ordered list
        r"(^|\s)> ",          # blockquote
        r"(^|\s)---",         # horizontal rule
    )
    return any(re.search(pattern, text, re.MULTILINE) for pattern in markdown_patterns)
| |
|
| |
|
# Punctuation normalization table: maps fullwidth/CJK punctuation and
# assorted ASCII symbols onto a small set of pause characters ("," "。" ".")
# or plain spaces, which the downstream TTS handles more gracefully.
# NOTE(review): "-" and "~" each appear twice; if both occurrences are the
# same code point the first entry is shadowed — verify the intended
# fullwidth variants were not lost in transcription.
character_map = {
    ":": ",",
    ";": ",",
    "!": "。",
    "(": ",",
    ")": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "-": ",",
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
    "~": " ",
    "~": " ",
    "/": " ",
    "·": " ",
}
| |
|
# Spoken replacements for symbols that should be read as words; the spaces
# around the key keep it from matching inside tokens such as "AT&T".
character_to_word = {
    " & ": " and ",
}
| |
|
| | |
| |
|
| |
|
@post_normalize()
def apply_character_to_word(text):
    """Expand symbol shorthands (e.g. " & ") into their spoken words."""
    result = text
    for symbol, word in character_to_word.items():
        result = result.replace(symbol, word)
    return result
| |
|
| |
|
# Build the translation table once at import time; str.translate then maps
# punctuation in a single pass instead of re-deriving the table on every
# call. (Assumes character_map is not mutated after import — TODO confirm.)
_CHARACTER_TRANSLATION_TABLE = str.maketrans(character_map)


@post_normalize()
def apply_character_map(text):
    """Normalize punctuation using the precomputed character_map table."""
    return text.translate(_CHARACTER_TRANSLATION_TABLE)
| |
|
| |
|
@post_normalize()
def apply_emoji_map(text):
    """Replace emoji with their textual names in the detected language."""
    detected = guess_lang(text)
    return emojiswitch.demojize(text, delimiters=("", ""), lang=detected)
| |
|
| |
|
@post_normalize()
def insert_spaces_between_uppercase(s):
    """Insert a space at case/script boundaries (between consecutive capitals,
    lower→upper transitions, and CJK↔capital transitions) so acronyms and
    mixed-script runs are spoken as separate tokens."""
    boundary = (
        r"(?<=[A-Z])(?=[A-Z])"
        r"|(?<=[a-z])(?=[A-Z])"
        r"|(?<=[\u4e00-\u9fa5])(?=[A-Z])"
        r"|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    )
    return re.sub(boundary, " ", s)
| |
|
| |
|
@post_normalize()
def replace_unk_tokens(text):
    """
    Replace characters missing from the TTS tokenizer vocabulary with " , "
    so unknown symbols degrade into a short pause instead of an unk token.
    """
    if DISABLE_UNK_TOKEN_CHECK:
        return text
    chat_tts = models.load_chat_tts()
    if "tokenizer" not in chat_tts.pretrain_models:
        # Tokenizer not loaded yet; skip the check rather than fail.
        return text
    tokenizer = chat_tts.pretrain_models["tokenizer"]
    vocab = tokenizer.get_vocab()
    vocab_set = set(vocab.keys())
    # ASCII letters and whitespace are always treated as speakable.
    vocab_set.update(set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    vocab_set.update(set(" \n\r\t"))
    replaced_chars = [char if char in vocab_set else " , " for char in text]
    output_text = "".join(replaced_chars)
    return output_text
| |
|
| |
|
# Shared replacer that rewrites easily-mispronounced Chinese homophones
# using the mapping file bundled with the project data.
homo_replacer = HomophonesReplacer(map_file_path="./data/homophones_map.json")
| |
|
| |
|
@post_normalize()
def replace_homophones(text):
    """Apply the homophone substitution map, but only for Chinese text."""
    if guess_lang(text) == "zh":
        return homo_replacer.replace(text)
    return text
| |
|
| |
|
| | |
| |
|
| |
|
@pre_normalize()
def html_unescape(text):
    """Unescape HTML entities twice so double-escaped input
    (e.g. "&amp;amp;") is fully decoded as well."""
    for _ in range(2):
        text = html.unescape(text)
    return text
| |
|
| |
|
@pre_normalize()
def fix_text(text):
    """Repair mojibake and other Unicode glitches via ftfy."""
    repaired = ftfy.fix_text(text=text)
    return repaired
| |
|
| |
|
@pre_normalize()
def apply_markdown_to_text(text):
    """Flatten Markdown to plain text when the input looks like Markdown."""
    return markdown_to_text(text) if is_markdown(text) else text
| |
|
| |
|
@pre_normalize()
def remove_html_tags(text):
    """Strip HTML tags (delegates to the shared utility helper)."""
    stripped = _remove_html_tags(text)
    return stripped
| |
|
| |
|
| | |
| | |
@pre_normalize()
def replace_quotes(text):
    """Pull each quoted span onto its own line so quoted speech is split
    into separate sentences downstream."""
    # re.sub expands "\n" in the replacement template to a real newline.
    repl = r"\n\1\n"
    quote_pairs = (
        ('"', '"'),
        ("'", "'"),
        ("“", "”"),
        ("‘", "’"),
    )
    for opening, closing in quote_pairs:
        pattern = rf"({opening}[^{opening}{closing}]+?{closing})"
        text = re.sub(pattern, repl, text)
    return text
| |
|
| |
|
def ensure_suffix(a: str, b: str, c: str):
    """Strip *a* and, unless it already ends with *b*, append *c*."""
    trimmed = a.strip()
    return trimmed if trimmed.endswith(b) else trimmed + c
| |
|
| |
|
# Spoken forms for common e-mail providers; any other domain falls back to
# reading its dots as " dot ".
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}

# Compiled once at import time instead of on every email_detect call.
_EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")


def email_detect(text):
    """Rewrite e-mail addresses in *text* into a speakable form.

    "abc@yahoo.com" becomes "a b c at 雅虎": the local part is spelled out
    letter by letter, "@" is read as "at", and the domain is either looked
    up in email_domain_map or has its dots spoken as " dot ".
    """

    def replace(match):
        email = match.group(1)
        # The pattern forbids "@" in the local part, so split yields 2 items.
        name, domain = email.split("@")
        name = " ".join(name)  # spell the local part character by character
        if domain in email_domain_map:
            domain = email_domain_map[domain]
        domain = domain.replace(".", " dot ")
        return f"{name} at {domain}"

    return _EMAIL_PATTERN.sub(replace, text)
| |
|
| |
|
# Lazily-created shared instance: TextNormalizer builds its rule set on
# construction, so reuse one instead of constructing a new one per sentence.
# NOTE(review): assumes TextNormalizer.normalize is stateless across calls —
# TODO confirm.
_TEXT_NORMALIZER = None


def _get_text_normalizer():
    """Return the shared TextNormalizer, creating it on first use."""
    global _TEXT_NORMALIZER
    if _TEXT_NORMALIZER is None:
        _TEXT_NORMALIZER = TextNormalizer()
    return _TEXT_NORMALIZER


def sentence_normalize(sentence_text: str):
    """Normalize one line of text while passing control tokens through.

    Bracketed spans like "[oral_2]" are TTS control tokens: they are kept
    verbatim (padded with spaces). Every other span is run through zh text
    normalization (when detected as Chinese) and then the post-normalize
    pipeline.
    """
    tx = _get_text_normalizer()

    # group(1): a bracketed control token; group(2): any run without "[".
    pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        # tx.normalize may split a part into several sentences; they are
        # re-joined without a separator, matching the original behavior.
        sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part]
        dest_text = ""
        for sentence in sentences:
            dest_text += apply_post_normalize(sentence)
        return dest_text

    def replace(match):
        if match.group(1):
            return f" {match.group(1)} "
        return normalize_part(match.group(2))

    result = pattern.sub(replace, sentence_text)
    return result
| |
|
| |
|
def text_normalize(text, is_end=False):
    """Pre-normalize *text*, then normalize each non-empty line.

    ``is_end`` is accepted for interface compatibility; it is not used here.
    """
    text = apply_pre_normalize(text)
    normalized_lines = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            normalized_lines.append(sentence_normalize(stripped))
    return "\n".join(normalized_lines)
| |
|
| |
|
if __name__ == "__main__":
    from modules.devices import devices

    # Manual smoke test: run the full normalization pipeline over mixed
    # zh/en inputs, control tokens, emoji, Markdown, and numeric text.
    devices.reset_device()
    test_cases = [
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
        "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
        "I like eating 🍏",
        # Markdown input: should be flattened by apply_markdown_to_text.
        """
# 你好,世界
```js
console.log('1')
```
**加粗**

*一条文本*
        """,
        # Quoted Chinese dialogue: exercises replace_quotes + zh normalization.
        """
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花,它们全都和他的那朵花一样。
“你们是什么花?”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊!”小王子说……。
        """,
        # English prose with emoji: exercises apply_emoji_map on en text.
        """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
        """,
        # Numbers and units: exercises zh TextNormalizer number reading.
        """
120米
有12%的概率会下雨
埃隆·马斯克
        """,
    ]

    for i, test_case in enumerate(test_cases):
        print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)})
| |
|