| import re |
|
|
def read_srt_text(file_path: str) -> str:
    """
    Read an SRT file and extract only the text content, ignoring timestamps.

    The file is treated as a sequence of cues separated by one or more blank
    (or whitespace-only) lines. Within each cue the first two lines are
    assumed to be the cue number and the timestamp line; every remaining
    line is subtitle text. Text lines are stripped of surrounding whitespace,
    and cues that carry no text are skipped so the result never contains
    empty entries.

    Args:
        file_path (str): Path to the SRT file

    Returns:
        str: Text of all cues joined by single spaces ("" if there is none)
    """
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM if present.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    text_parts = []
    cue_lines = []
    # The trailing "" acts as a sentinel so the last cue is flushed even when
    # the file does not end with a blank line.
    for raw_line in content.split("\n") + [""]:
        line = raw_line.strip()
        if line:
            cue_lines.append(line)
            continue
        if not cue_lines:
            # Consecutive blank lines between cues: nothing to flush.
            continue
        # Skip the cue number (line 0) and the timestamp (line 1).
        cue_text = " ".join(cue_lines[2:])
        if cue_text:
            text_parts.append(cue_text)
        cue_lines = []

    return " ".join(text_parts)
|
|
def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
    """
    Preprocess Chinese text for CER calculation.

    ASCII letters, ASCII digits and all whitespace are always removed. Unless
    ``include_punctuation`` is true, common punctuation is removed as well,
    in both its full-width/CJK and its ASCII forms. The remaining characters
    are then separated by single spaces.

    Args:
        text (str): Input Chinese text
        include_punctuation (bool): Whether to include punctuation in the calculation

    Returns:
        str: Preprocessed text with characters separated by spaces
    """
    # Drop ASCII alphanumerics and every kind of whitespace.
    text = re.sub(r"[a-zA-Z0-9\s]+", "", text)

    if not include_punctuation:
        # An explicit deletion table avoids regex/escape pitfalls inside a
        # character class and keeps the two punctuation families visible.
        punctuation = (
            ",。!?:;“”‘’()【】《》、"  # full-width / CJK forms
            ",.!?:;\"'()[]{}"  # ASCII forms
        )
        text = text.translate(str.maketrans("", "", punctuation))

    # Iterating a str yields its characters, so no intermediate list is needed.
    return " ".join(text)
|
|