| """ |
| AIFinder Feature Extraction |
| Optimized TF-IDF and stylometric features for AI model detection. |
| """ |
|
|
| import re |
| import numpy as np |
| from scipy.sparse import csr_matrix, hstack |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.base import BaseEstimator, TransformerMixin |
| from sklearn.preprocessing import MaxAbsScaler |
|
|
| from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS |
|
|
# Every regular expression used by this module, compiled once at import time
# and looked up by name. Built from (name, pattern, flags) specs; a flags value
# of 0 means "no flags".
_RE_COMPILED = {
    name: re.compile(pattern, flags)
    for name, pattern, flags in (
        # --- used by strip_cot / strip_markdown to clean text ---
        ("cot", r"<think(?:ing)?>.*?</think(?:ing)?>", re.DOTALL),
        ("code_block", r"```[\s\S]*?```", 0),
        ("inline_code", r"`[^`]+`", 0),
        ("bold", r"\*\*([^*]+)\*\*", 0),
        ("italic_ast", r"\*([^*]+)\*", 0),
        ("italic_under", r"__([^_]+)__", 0),
        ("under", r"_([^_]+)_", 0),
        ("header", r"^#{1,6}\s+", re.MULTILINE),
        ("bullet", r"^[\s]*[-*+]\s+", re.MULTILINE),
        ("numbered", r"^\s*\d+[.)]\s+", re.MULTILINE),
        ("link", r"\[([^\]]+)\]\([^)]+\)", 0),
        ("quote", r"^>.*$", re.MULTILINE),
        ("hr", r"^---+$", re.MULTILINE),
        # --- presence markers ---
        ("think_tag", r"<think>", 0),
        ("xml_tag", r"<[^>]+>", 0),
        ("url", r"https?://", 0),
        ("contraction", r"\b\w+'\w+\b", 0),
        # --- markdown constructs that are counted rather than removed ---
        ("markdown_header", r"^#{1,6}\s", re.MULTILINE),
        ("markdown_bold", r"\*\*.*?\*\*", 0),
        ("markdown_code_block", r"```", 0),
        ("markdown_inline_code", r"`[^`]+`", 0),
        ("markdown_bullet", r"^[\s]*[-*+]\s", re.MULTILINE),
        ("markdown_numbered", r"^\s*\d+[.)]\s", re.MULTILINE),
        ("markdown_table", r"\|.*\|", 0),
        (
            "question_start",
            r"^(who|what|when|where|why|how)\b",
            re.IGNORECASE | re.MULTILINE,
        ),
        # --- character classes and surface patterns ---
        ("emoji", r"[\U00010000-\U0010ffff]", 0),
        ("chinese", r"[\u4e00-\u9fff]", 0),
        # NOTE: despite its name, this matches Capitalized words (one
        # upper-case letter followed by lower-case letters).
        ("all_caps", r"\b[A-Z][a-z]+\b", 0),
        ("four_word", r"\b\w+\s+\w+\s+\w+\s+\w+\b", 0),
        ("sent_boundary", r"[.!?]\s+[A-Z]", 0),
        ("paren", r"\([^)]+\)", 0),
        ("colon_def", r"\b\w+:\s+\w+", 0),
        ("double_quote", r'"[^"]*"', 0),
        ("single_quote", r"'[^']*'", 0),
        # --- conversational / assistant-style phrasing ---
        (
            "greeting",
            r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b",
            re.IGNORECASE,
        ),
        (
            "conv_phrase",
            r"\b(great|perfect|sure|definitely|certainly|absolutely|of course|no problem|sounds good|got it|understood|okay|alright)\b",
            re.IGNORECASE,
        ),
        (
            "helpful",
            r"\b(let me know|feel free|happy to|glad to|happy to help|don't hesitate|let me know if|please let me|reach out)\b",
            re.IGNORECASE,
        ),
        (
            "closing_offer",
            r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
            re.IGNORECASE,
        ),
        (
            "self_id_ai",
            r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
            re.IGNORECASE,
        ),
        (
            "provider_mention",
            r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
            re.IGNORECASE,
        ),
        ("special_unicode", r"[^\x00-\x7F]", 0),
    )
}
|
|
# Lower-case personal pronouns grouped by grammatical person; each group is a
# frozenset so membership checks are O(1).
_PRONOUN_SETS = {
    person: frozenset(forms.split())
    for person, forms in (
        ("first", "i me my mine myself we us our ours ourselves"),
        ("second", "you your yours yourself yourselves"),
        ("third", "he she it they them his her its their"),
    )
}
|
|
# Lower-case marker words grouped by rhetorical function. Each group is a
# frozenset built from a whitespace-separated word list.
_DISCOURSE_SETS = {
    group: frozenset(vocabulary.split())
    for group, vocabulary in (
        (
            "conjunctions",
            "and but or nor for yet so because although while if when where",
        ),
        (
            "discourse",
            "however therefore moreover furthermore nevertheless "
            "consequently thus hence",
        ),
        (
            "hedging",
            "perhaps maybe might could possibly seemingly apparently "
            "arguably potentially",
        ),
        (
            "certainty",
            "definitely certainly absolutely clearly obviously undoubtedly "
            "indeed surely",
        ),
        (
            "transition",
            "additionally meanwhile subsequently alternatively specifically "
            "notably importantly essentially",
        ),
        (
            "casual",
            "okay ok hey hi cool awesome wow basically actually literally "
            "right yeah",
        ),
        (
            "formal",
            "regarding concerning pertaining aforementioned respectively "
            "accordingly henceforth whereby notwithstanding pursuant",
        ),
    )
}
|
|
# Punctuation characters treated as strippable from the edges of a token.
_PUNC_STRIP = frozenset(
    {".", ",", "!", "?", ";", ":", "'", '"', "(", ")", "[", "]", "{", "}"}
)
|
|
|
|
def strip_cot(text):
    """Remove chain-of-thought spans from *text* and trim the result.

    Every ``<think>...</think>`` / ``<thinking>...</thinking>`` span (matched
    non-greedily, across newlines) is deleted, then leading and trailing
    whitespace is stripped.
    """
    cot_pattern = _RE_COMPILED["cot"]
    without_reasoning = cot_pattern.sub("", text)
    return without_reasoning.strip()
|
|
|
|
def strip_markdown(text):
    """Return *text* with common Markdown syntax removed.

    Processing order:

    1. Fenced code blocks and inline code are deleted outright.
    2. Line-anchored constructs are handled: header markers, bullet markers
       and numbered-list markers are removed, block-quote lines are dropped
       entirely, and ``---`` rules are removed.
    3. Inline emphasis (``**bold**``, ``*italic*``, ``__x__``, ``_x_``) and
       links are replaced by their visible text.

    Step 2 must run before step 3. The italic pattern can span newlines, so
    if emphasis is stripped first, the markers of two consecutive ``* item``
    bullets are consumed as one italic pair and the list is mangled
    (``"* a\\n* b"`` would become ``"a\\n b"`` instead of ``"a\\nb"``).

    The result is stripped of leading and trailing whitespace.
    """
    rc = _RE_COMPILED

    # 1. Code: dropped with its contents.
    text = rc["code_block"].sub("", text)
    text = rc["inline_code"].sub("", text)

    # 2. Line-level markers, before any emphasis pattern can swallow a
    #    leading "*" bullet marker.
    text = rc["header"].sub("", text)
    text = rc["bullet"].sub("", text)
    text = rc["numbered"].sub("", text)
    text = rc["quote"].sub("", text)
    text = rc["hr"].sub("", text)

    # 3. Inline markup: keep the inner text, drop the delimiters.
    text = rc["bold"].sub(r"\1", text)
    text = rc["italic_ast"].sub(r"\1", text)
    text = rc["italic_under"].sub(r"\1", text)
    text = rc["under"].sub(r"\1", text)
    text = rc["link"].sub(r"\1", text)

    return text.strip()
|
|
|
|
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each text to a fixed-length style vector.

    ``transform`` returns a sparse float32 matrix with one row per input text
    and one column per feature (103 columns, in the order of the list
    returned by ``_extract``). The features cover word/sentence/paragraph
    length statistics, punctuation densities, Markdown usage, character-class
    ratios, pronoun and discourse-marker ratios, sentence starters, and
    various binary presence flags.

    Nothing is learned from data, so ``fit`` is a no-op.
    """

    # The capturing group makes ``split`` return the terminator runs as well
    # (at the odd indices), so a sentence's closing punctuation is recoverable.
    _SENTENCE_SPLIT = re.compile(r"([.!?]+)")

    # Characters stripped from both ends of a token before vocabulary lookups.
    _WORD_STRIP_CHARS = ".,!?;:'\"()[]{}"

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for the estimator API)."""
        return self

    def transform(self, X):
        """Extract features for every text in *X*.

        Args:
            X: iterable of strings.

        Returns:
            ``scipy.sparse.csr_matrix`` of dtype float32 with shape
            ``(number of texts, number of features)``.
        """
        rows = [self._extract(t) for t in X]
        if not rows:
            # np.array([]) is 1-D and csr_matrix would turn it into a (1, 0)
            # matrix; return a correctly shaped empty matrix instead.
            n_features = len(self._extract(""))
            return csr_matrix((0, n_features), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the feature list for a single string.

        Counts used as denominators (characters, words, sentences, non-empty
        lines) are floored at 1 so that empty input yields zeros instead of
        raising ``ZeroDivisionError``.

        Returns:
            list of 103 numbers; the order is part of the contract because
            downstream models index features by position.
        """
        _rc = _RE_COMPILED
        _ps = _PRONOUN_SETS
        _ds = _DISCOURSE_SETS

        # ------------------------------------------------------------------
        # Segmentation
        # ------------------------------------------------------------------
        n_chars = max(len(text), 1)
        words = text.split()
        n_words = max(len(words), 1)

        # Even indices are sentence bodies, odd indices are the runs of
        # ".", "!" or "?" that closed the body just before them.
        pieces = self._SENTENCE_SPLIT.split(text)
        bodies = pieces[0::2]
        terminators = pieces[1::2]
        sentences = [s.strip() for s in bodies if s.strip()]
        n_sentences = max(len(sentences), 1)

        non_empty_paras = [p for p in text.split("\n\n") if p.strip()]
        n_paragraphs = len(non_empty_paras)

        non_empty_lines = [ln for ln in text.split("\n") if ln.strip()]
        n_lines = max(len(non_empty_lines), 1)

        word_lens = [len(w) for w in words]
        sent_lens = [len(s.split()) for s in sentences]

        words_lower = [w.lower().strip(self._WORD_STRIP_CHARS) for w in words]
        sentence_starters = [(s.split() or [""])[0].lower() for s in sentences]

        # Small local helpers for the recurring computations.
        def n_matches(key):
            return len(_rc[key].findall(text))

        def flag(key):
            return 1.0 if _rc[key].search(text) else 0.0

        def vocab_ratio(vocabulary):
            return sum(1 for w in words_lower if w in vocabulary) / n_words

        def starter_ratio(*tokens):
            return sum(1 for s in sentence_starters if s in tokens) / n_sentences

        def any_char(chars):
            return 1.0 if any(c in text for c in chars) else 0.0

        # ------------------------------------------------------------------
        # Word / sentence length statistics
        # ------------------------------------------------------------------
        avg_word_len = np.mean(word_lens) if words else 0.0
        word_len_std = np.std(word_lens) if len(words) > 1 else 0.0
        median_word_len = np.median(word_lens) if words else 0.0
        avg_sent_len = n_words / n_sentences

        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0
        sent_len_max = max(sent_lens) if sent_lens else 0
        sent_len_min = min(sent_lens) if sent_lens else 0
        sent_len_median = np.median(sent_lens) if sent_lens else 0.0
        sent_len_range = sent_len_max - sent_len_min

        # ------------------------------------------------------------------
        # Punctuation densities (per character) and ratios between them
        # ------------------------------------------------------------------
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars
        n_parens = (text.count("(") + text.count(")")) / n_chars
        n_quotes = (text.count('"') + text.count("'")) / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_period = text.count(".") / n_chars
        n_ellipsis = (text.count("...") + text.count("…")) / n_chars

        # The 0.001 offset keeps the ratios finite when the divisor is zero.
        comma_colon_ratio = n_commas / (n_colons + 0.001)
        comma_period_ratio = n_commas / (n_period + 0.001)
        excl_question_ratio = n_exclaim / (n_question + 0.001)
        commas_per_sentence = text.count(",") / n_sentences

        # ------------------------------------------------------------------
        # Markdown usage (per sentence)
        # ------------------------------------------------------------------
        n_headers = n_matches("markdown_header") / n_sentences
        n_bold = n_matches("markdown_bold") / n_sentences
        n_code_blocks = n_matches("markdown_code_block") / n_sentences
        n_inline_code = n_matches("markdown_inline_code") / n_sentences
        n_bullet = n_matches("markdown_bullet") / n_sentences
        n_numbered = n_matches("markdown_numbered") / n_sentences
        n_tables = n_matches("markdown_table") / n_sentences

        has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
        list_items = n_bullet + n_numbered

        # ------------------------------------------------------------------
        # Whitespace and character classes
        # ------------------------------------------------------------------
        newline_density = text.count("\n") / n_chars
        double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
        space_ratio = sum(1 for c in text if c.isspace()) / n_chars

        unique_chars = len(set(text)) / n_chars
        unique_chars_ratio = len(set(text.lower())) / n_chars

        # ------------------------------------------------------------------
        # Presence flags
        # ------------------------------------------------------------------
        has_think = flag("think_tag")
        has_xml = flag("xml_tag")
        has_hr = flag("hr")
        has_url = flag("url")

        # ------------------------------------------------------------------
        # Vocabulary: pronouns, lexical diversity, contractions
        # ------------------------------------------------------------------
        first_person_ratio = vocab_ratio(_ps["first"])
        second_person_ratio = vocab_ratio(_ps["second"])
        third_person_ratio = vocab_ratio(_ps["third"])

        word_counts = {}
        for w in words_lower:
            word_counts[w] = word_counts.get(w, 0) + 1
        ttr = len(word_counts) / n_words  # type/token ratio
        hapax_ratio = sum(1 for c in word_counts.values() if c == 1) / n_words

        contraction_ratio = n_matches("contraction") / n_words

        # ------------------------------------------------------------------
        # Sentence starters
        # ------------------------------------------------------------------
        starter_vocab = len(set(sentence_starters)) / n_sentences
        and_starts = starter_ratio("and")
        but_starts = starter_ratio("but")
        so_starts = starter_ratio("so")
        the_starts = starter_ratio("the")
        it_starts = starter_ratio("it", "it's")
        i_starts = starter_ratio("i", "i'm", "i've")

        # ------------------------------------------------------------------
        # Word-length buckets (measured on the punctuation-stripped tokens)
        # ------------------------------------------------------------------
        short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
        medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
        long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
        very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words

        # ------------------------------------------------------------------
        # Paragraph statistics
        # ------------------------------------------------------------------
        para_lens = (
            [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
        )
        avg_para_len = np.mean(para_lens)
        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0

        # ------------------------------------------------------------------
        # Discourse markers and register
        # ------------------------------------------------------------------
        conjunction_ratio = vocab_ratio(_ds["conjunctions"])
        discourse_ratio = vocab_ratio(_ds["discourse"])
        hedging_ratio = vocab_ratio(_ds["hedging"])
        certainty_ratio = vocab_ratio(_ds["certainty"])
        transition_ratio = vocab_ratio(_ds["transition"])
        casual_ratio = vocab_ratio(_ds["casual"])
        formal_ratio = vocab_ratio(_ds["formal"])

        question_starts = sum(
            1 for s in sentences if s and _rc["question_start"].search(s.lower())
        )

        # ------------------------------------------------------------------
        # Emoji, capitalisation, parentheses
        # ------------------------------------------------------------------
        emoji_count = n_matches("emoji")
        has_emoji = 1.0 if emoji_count > 0 else 0.0

        all_caps_words = sum(
            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
        )
        all_caps_ratio = all_caps_words / n_words

        # The "all_caps" regex actually matches Capitalized words.
        cap_word_ratio = n_matches("all_caps") / n_words

        paren_ratio = n_matches("paren") / n_sentences

        # ------------------------------------------------------------------
        # Line-level features
        # ------------------------------------------------------------------
        # Lines ending in "?" feed two features with different denominators.
        # A line that ends with "?" is never blank, so scanning only the
        # non-empty lines gives the same count as scanning every line.
        question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
        rhetorical_ratio = len(question_lines) / n_sentences
        question_line_ratio = len(question_lines) / n_lines

        avg_line_len = (
            np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0.0
        )
        short_lines_ratio = (
            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
        )

        # ------------------------------------------------------------------
        # Language / self-identification / closing behaviour
        # ------------------------------------------------------------------
        chinese_chars = n_matches("chinese")
        has_chinese = 1.0 if chinese_chars > 0 else 0.0
        chinese_ratio = chinese_chars / n_chars

        has_self_id_ai = flag("self_id_ai")
        has_provider_mention = flag("provider_mention")

        ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
        has_closing_offer = flag("closing_offer")

        # ------------------------------------------------------------------
        # Phrase-level patterns and special symbols
        # ------------------------------------------------------------------
        phrase_ratio = n_matches("four_word") / n_sentences
        sent_boundary_ratio = n_matches("sent_boundary") / n_sentences

        has_checkmark = any_char("✓✗✔✘")
        has_arrow = any_char("→←➡")
        has_star = any_char("⭐★☆")
        special_unicode = n_matches("special_unicode") / n_chars

        colon_definitions = n_matches("colon_def") / n_sentences
        double_quote_pairs = n_matches("double_quote") / n_sentences
        single_quote_pairs = n_matches("single_quote") / n_sentences

        greeting_ratio = n_matches("greeting") / n_sentences

        # ------------------------------------------------------------------
        # Length buckets
        # ------------------------------------------------------------------
        is_short = 1.0 if n_words < 100 else 0.0
        is_medium = 1.0 if 100 <= n_words < 500 else 0.0
        is_long = 1.0 if n_words >= 500 else 0.0

        # ------------------------------------------------------------------
        # Exclamatory sentences
        # ------------------------------------------------------------------
        # The sentence bodies no longer carry their terminators (the split
        # consumed them), so the check has to look at the terminator run that
        # followed each non-blank body. zip() stops at the shorter sequence,
        # which skips a trailing body that has no terminator.
        excl_sentences = sum(
            1
            for body, closing in zip(bodies, terminators)
            if body.strip() and closing.endswith("!")
        )
        excl_sentence_ratio = excl_sentences / n_sentences

        # ------------------------------------------------------------------
        # Conversational / helpful phrasing
        # ------------------------------------------------------------------
        conv_phrase_ratio = n_matches("conv_phrase") / n_words
        helpful_ratio = n_matches("helpful") / n_sentences

        # Order matters: do not reorder, insert into, or remove from this list.
        return [
            avg_word_len,
            word_len_std,
            median_word_len,
            avg_sent_len,
            sent_len_std,
            sent_len_max,
            sent_len_min,
            sent_len_median,
            sent_len_range,
            commas_per_sentence,
            n_commas,
            n_semicolons,
            n_colons,
            n_dash,
            n_parens,
            n_quotes,
            n_exclaim,
            n_question,
            n_period,
            n_ellipsis,
            comma_colon_ratio,
            comma_period_ratio,
            excl_question_ratio,
            n_headers,
            n_bold,
            n_code_blocks,
            n_inline_code,
            n_bullet,
            n_numbered,
            n_tables,
            has_list,
            newline_density,
            double_newline_ratio,
            uppercase_ratio,
            digit_ratio,
            space_ratio,
            unique_chars,
            unique_chars_ratio,
            list_items,
            n_paragraphs,
            n_lines / n_sentences,
            has_think,
            has_xml,
            has_hr,
            has_url,
            first_person_ratio,
            second_person_ratio,
            third_person_ratio,
            ttr,
            hapax_ratio,
            contraction_ratio,
            short_word_ratio,
            medium_word_ratio,
            long_word_ratio,
            very_long_word_ratio,
            starter_vocab,
            and_starts,
            but_starts,
            so_starts,
            the_starts,
            it_starts,
            avg_para_len,
            para_len_std,
            conjunction_ratio,
            discourse_ratio,
            hedging_ratio,
            certainty_ratio,
            transition_ratio,
            question_starts / n_sentences,
            emoji_count,
            has_emoji,
            special_unicode,
            all_caps_ratio,
            paren_ratio,
            rhetorical_ratio,
            casual_ratio,
            formal_ratio,
            has_chinese,
            chinese_ratio,
            has_self_id_ai,
            has_provider_mention,
            ends_with_question,
            has_closing_offer,
            has_checkmark,
            has_arrow,
            has_star,
            avg_line_len,
            short_lines_ratio,
            cap_word_ratio,
            phrase_ratio,
            sent_boundary_ratio,
            colon_definitions,
            double_quote_pairs,
            single_quote_pairs,
            i_starts,
            greeting_ratio,
            is_short,
            is_medium,
            is_long,
            excl_sentence_ratio,
            question_line_ratio,
            conv_phrase_ratio,
            helpful_ratio,
        ]
|
|
|
|
class StyleOnlyPipeline:
    """Stylometry-only feature pipeline (no TF-IDF vectorizers involved).

    Texts are cleaned with ``strip_cot`` followed by ``strip_markdown``,
    turned into stylometric features, and scaled with a ``MaxAbsScaler``.
    """

    def __init__(self):
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit the scaler on *texts* and return their scaled feature matrix.

        Prints the number of stylometric features with the extraction time,
        and the final matrix shape.
        """
        import time

        cleaned = list(map(strip_markdown, map(strip_cot, texts)))

        started = time.time()
        stylo_features = self.stylo.transform(cleaned)
        elapsed = time.time() - started
        print(f" Stylometric: {stylo_features.shape[1]} features ({elapsed:.1f}s)")

        result = self.scaler.fit_transform(stylo_features)
        print(f" Final feature matrix: {result.shape}")
        return result

    def transform(self, texts):
        """Return the scaled feature matrix for *texts* using the fitted scaler."""
        cleaned = list(map(strip_markdown, map(strip_cot, texts)))
        return self.scaler.transform(self.stylo.transform(cleaned))
|
|
|
|
class FeaturePipeline:
    """Word TF-IDF + character TF-IDF + stylometric features, max-abs scaled.

    Every text is cleaned with ``strip_cot`` then ``strip_markdown``; the
    cleaned text feeds both TF-IDF vectorizers and the stylometric extractor.
    The three feature blocks are stacked column-wise in the order
    word, char, stylometric and scaled with a ``MaxAbsScaler``.

    A TF-IDF vectorizer contributes columns only when ``use_tfidf`` is true
    AND its configured ``max_features`` is a positive integer. A configured
    ``max_features`` of 0 switches that vectorizer off; a disabled vectorizer
    contributes a zero-column block. The same rule is applied to both
    vectorizers, in both ``fit_transform`` and ``transform``.

    Args:
        use_tfidf: set to False to use stylometric features only.
    """

    def __init__(self, use_tfidf=True):
        word_params = dict(TFIDF_WORD_PARAMS)
        char_params = dict(TFIDF_CHAR_PARAMS)

        # 0 is the config's "off" value. It is stored as None on the
        # vectorizer (presumably because scikit-learn rejects
        # max_features=0 — confirm against the installed version); None is
        # then read back as "disabled" by _vectorizer_enabled.
        if word_params.get("max_features", 1) == 0:
            word_params["max_features"] = None
        if char_params.get("max_features", 1) == 0:
            char_params["max_features"] = None

        self.word_tfidf = TfidfVectorizer(**word_params)
        self.char_tfidf = TfidfVectorizer(**char_params)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()
        self.use_tfidf = use_tfidf and (
            TFIDF_WORD_PARAMS.get("max_features", 1) > 0
            or TFIDF_CHAR_PARAMS.get("max_features", 1) > 0
        )

    def _clean_for_tfidf(self, text):
        """Strip chain-of-thought spans and Markdown syntax from *text*."""
        return strip_markdown(strip_cot(text))

    def _vectorizer_enabled(self, vectorizer):
        """Return True when *vectorizer* should contribute feature columns."""
        limit = vectorizer.max_features
        return bool(self.use_tfidf) and limit is not None and limit > 0

    @staticmethod
    def _empty_block(n_rows):
        """Zero-column placeholder so hstack keeps a fixed block layout."""
        return csr_matrix((n_rows, 0), dtype=np.float32)

    def fit_transform(self, texts):
        """Fit the enabled vectorizers and the scaler; return scaled features.

        Progress (input size, per-block feature counts with timings, final
        shape) is printed to stdout.

        Args:
            texts: sized sequence of strings.
        """
        import time

        print(f" Input: {len(texts)} texts", flush=True)

        texts_clean = [self._clean_for_tfidf(t) for t in texts]

        blocks = []
        for label, vectorizer in (
            ("word tfidf", self.word_tfidf),
            ("char tfidf", self.char_tfidf),
        ):
            if self._vectorizer_enabled(vectorizer):
                t0 = time.time()
                features = vectorizer.fit_transform(texts_clean)
                print(
                    f" {label}: {features.shape[1]} features ({time.time() - t0:.1f}s)",
                    flush=True,
                )
            else:
                features = self._empty_block(len(texts_clean))
            blocks.append(features)

        t0 = time.time()
        stylo_features = self.stylo.transform(texts_clean)
        print(
            f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
            flush=True,
        )
        blocks.append(stylo_features)

        combined = self.scaler.fit_transform(hstack(blocks))
        print(f" Combined feature matrix: {combined.shape}", flush=True)
        return combined

    def transform(self, texts):
        """Return scaled features for *texts* using the fitted components.

        Uses the same enable/disable decisions as ``fit_transform`` so the
        column layout matches what the scaler was fitted on.
        """
        texts_clean = [self._clean_for_tfidf(t) for t in texts]

        blocks = [
            vectorizer.transform(texts_clean)
            if self._vectorizer_enabled(vectorizer)
            else self._empty_block(len(texts_clean))
            for vectorizer in (self.word_tfidf, self.char_tfidf)
        ]
        blocks.append(self.stylo.transform(texts_clean))

        return self.scaler.transform(hstack(blocks))
|
|