| | import re |
| | import tiktoken |
| | from typing import List, Tuple, Dict |
| |
|
class TextUtils:
    """Static helpers for text processing.

    Covers token counting, cleaning, sentence splitting, language
    detection, dialogue extraction, truncation, keyword extraction and
    reading-time estimation, with support for mixed Chinese/English text.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Count the number of tokens in *text* for *model*.

        Uses tiktoken's model-specific encoding when available; otherwise
        falls back to a rough heuristic of ~1.5 characters per token for
        CJK ideographs and ~4 characters per token for everything else.
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.  This branch is reached
            # when tiktoken is unavailable or does not know the model.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Normalize whitespace and strip unwanted characters.

        Collapses all runs of whitespace to a single space, then removes
        every character that is not a word character, whitespace, common
        punctuation, CJK punctuation brackets, or a CJK ideograph.
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split *text* into sentences on ASCII/CJK terminal punctuation.

        Splits on runs of `.`, `!`, `?` and their full-width equivalents;
        empty fragments are dropped and each sentence is stripped.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify *text* as "zh", "en", "mixed" or "unknown".

        Counts CJK ideographs vs ASCII letters; returns "unknown" when
        neither is present.  Chinese ratio > 0.3 -> "zh", < 0.1 -> "en",
        otherwise "mixed".  Thresholds are heuristic.
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            return "unknown"

        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue spans from *text*.

        Returns a list of dicts with keys 'content' (quote body),
        'attribution' (speech-verb phrase when the pattern captured one,
        else ''), and 'position' (start offset of the match).

        NOTE(review): `[^said]*` / `[^说道讲告诉问答叫喊]*` are character
        classes ("any char not in this set"), not "anything except the
        word said" — likely not what was intended, but the patterns are
        preserved byte-for-byte to avoid changing match behavior.
        NOTE(review): the bare-quote patterns overlap the attributed
        patterns, so the same quote can appear twice; callers may need
        to deduplicate by position.
        """
        dialogues = []

        if language == "zh":
            # Chinese: attributed quotes first, then bare "…", 「…」, 『…』.
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # English: attributed quotes first, then bare double/single quotes.
            patterns = [
                r'"([^"]+)",?\s*([^said]*(said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*([^said]*(said|asked|replied))",
                r"'([^']+)'"
            ]

        for pattern in patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                dialogue = {
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                }
                dialogues.append(dialogue)

        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate *text* to at most *max_length* characters.

        Appends *ellipsis* when truncation occurs; the result never
        exceeds *max_length* characters.
        """
        if len(text) <= max_length:
            return text

        if max_length < len(ellipsis):
            # Fix: previously text[:max_length - len(ellipsis)] produced a
            # negative slice here, returning a string LONGER than max_length.
            return text[:max_length]

        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return the *top_n* most frequent non-stop-words (naive TF).

        Lowercases, tokenizes on word boundaries, drops English/Chinese
        stop words and words of length <= 2, then ranks by frequency.
        NOTE(review): the length->2 filter also discards most 1–2
        character Chinese words — confirm this is intended for zh text.
        """
        from collections import Counter

        words = re.findall(r'\b\w+\b', text.lower())

        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }

        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

        word_freq = Counter(filtered_words)
        return [word for word, _ in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        Assumes *wpm* words per minute for space-delimited words and a
        fixed 500 characters per minute for CJK ideographs.
        NOTE(review): `\\b\\w+\\b` also matches runs of CJK characters, so
        mixed text is counted by both terms — a slight overestimate.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        reading_time = chinese_chars / 500 + words / wpm

        return max(1, int(reading_time))