Spaces:

CompactAI
/

AIFinder

Running

App Files Files Community

AIFinder / features.py

CompactAI

Upload 18 files

bb0efe6 verified 1 day ago

raw

history blame contribute delete

22.8 kB

	"""
	AIFinder Feature Extraction
	Optimized TF-IDF and stylometric features for AI model detection.
	"""

	import re
	import numpy as np
	from scipy.sparse import csr_matrix, hstack
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.preprocessing import MaxAbsScaler

	from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS

	_RE_COMPILED = {
	"cot": re.compile(r"<think(?:ing)?>.*?</think(?:ing)?>", re.DOTALL),
	"code_block": re.compile(r"```[\s\S]*?```"),
	"inline_code": re.compile(r"`[^`]+`"),
	"bold": re.compile(r"\\([^]+)\\*"),
	"italic_ast": re.compile(r"\([^]+)\*"),
	"italic_under": re.compile(r"__([^_]+)__"),
	"under": re.compile(r"_([^_]+)_"),
	"header": re.compile(r"^#{1,6}\s+", re.MULTILINE),
	"bullet": re.compile(r"^[\s][-+]\s+", re.MULTILINE),
	"numbered": re.compile(r"^\s*\d+[.)]\s+", re.MULTILINE),
	"link": re.compile(r"\[([^\]]+)\]$[^)]+$"),
	"quote": re.compile(r"^>.*$", re.MULTILINE),
	"hr": re.compile(r"^---+$", re.MULTILINE),
	"think_tag": re.compile(r"<think>"),
	"xml_tag": re.compile(r"<[^>]+>"),
	"url": re.compile(r"https?://"),
	"contraction": re.compile(r"\b\w+'\w+\b"),
	"markdown_header": re.compile(r"^#{1,6}\s", re.MULTILINE),
	"markdown_bold": re.compile(r"\\.?\\*"),
	"markdown_code_block": re.compile(r"```"),
	"markdown_inline_code": re.compile(r"`[^`]+`"),
	"markdown_bullet": re.compile(r"^[\s][-+]\s", re.MULTILINE),
	"markdown_numbered": re.compile(r"^\s*\d+[.)]\s", re.MULTILINE),
	"markdown_table": re.compile(r"\\|.*\\|"),
	"question_start": re.compile(
	r"^(who\|what\|when\|where\|why\|how)\b", re.IGNORECASE \| re.MULTILINE
	),
	"emoji": re.compile(r"[\U00010000-\U0010ffff]"),
	"chinese": re.compile(r"[\u4e00-\u9fff]"),
	"all_caps": re.compile(r"\b[A-Z][a-z]+\b"),
	"four_word": re.compile(r"\b\w+\s+\w+\s+\w+\s+\w+\b"),
	"sent_boundary": re.compile(r"[.!?]\s+[A-Z]"),
	"paren": re.compile(r"$[^)]+$"),
	"colon_def": re.compile(r"\b\w+:\s+\w+"),
	"double_quote": re.compile(r'"[^"]*"'),
	"single_quote": re.compile(r"'[^']*'"),
	"greeting": re.compile(
	r"\b(hi\|hello\|hey\|hiya\|greetings\|howdy\|yo)\b", re.IGNORECASE
	),
	"conv_phrase": re.compile(
	r"\b(great\|perfect\|sure\|definitely\|certainly\|absolutely\|of course\|no problem\|sounds good\|got it\|understood\|okay\|alright)\b",
	re.IGNORECASE,
	),
	"helpful": re.compile(
	r"\b(let me know\|feel free\|happy to\|glad to\|happy to help\|don't hesitate\|let me know if\|please let me\|reach out)\b",
	re.IGNORECASE,
	),
	"closing_offer": re.compile(
	r"(let me know\|feel free\|happy to help\|don't hesitate\|hope this helps)",
	re.IGNORECASE,
	),
	"self_id_ai": re.compile(
	r"\b(I'm\|I am)\s+(an?\s+)?(AI\|language model\|assistant\|chatbot)\b",
	re.IGNORECASE,
	),
	"provider_mention": re.compile(
	r"\b(Claude\|Anthropic\|GPT\|OpenAI\|ChatGPT\|Gemini\|Google\|Bard\|Grok\|xAI\|DeepSeek\|Kimi\|Moonshot\|Mistral\|MiniMax\|Zhipu\|GLM\|深度求索)\b",
	re.IGNORECASE,
	),
	"special_unicode": re.compile(r"[^\x00-\x7F]"),
	}

	_PRONOUN_SETS = {
	"first": frozenset(
	{"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
	),
	"second": frozenset({"you", "your", "yours", "yourself", "yourselves"}),
	"third": frozenset(
	{"he", "she", "it", "they", "them", "his", "her", "its", "their"}
	),
	}

	_DISCOURSE_SETS = {
	"conjunctions": frozenset(
	{
	"and",
	"but",
	"or",
	"nor",
	"for",
	"yet",
	"so",
	"because",
	"although",
	"while",
	"if",
	"when",
	"where",
	}
	),
	"discourse": frozenset(
	{
	"however",
	"therefore",
	"moreover",
	"furthermore",
	"nevertheless",
	"consequently",
	"thus",
	"hence",
	}
	),
	"hedging": frozenset(
	{
	"perhaps",
	"maybe",
	"might",
	"could",
	"possibly",
	"seemingly",
	"apparently",
	"arguably",
	"potentially",
	}
	),
	"certainty": frozenset(
	{
	"definitely",
	"certainly",
	"absolutely",
	"clearly",
	"obviously",
	"undoubtedly",
	"indeed",
	"surely",
	}
	),
	"transition": frozenset(
	{
	"additionally",
	"meanwhile",
	"subsequently",
	"alternatively",
	"specifically",
	"notably",
	"importantly",
	"essentially",
	}
	),
	"casual": frozenset(
	{
	"okay",
	"ok",
	"hey",
	"hi",
	"cool",
	"awesome",
	"wow",
	"basically",
	"actually",
	"literally",
	"right",
	"yeah",
	}
	),
	"formal": frozenset(
	{
	"regarding",
	"concerning",
	"pertaining",
	"aforementioned",
	"respectively",
	"accordingly",
	"henceforth",
	"whereby",
	"notwithstanding",
	"pursuant",
	}
	),
	}

	_PUNC_STRIP = frozenset(".,!?;:'\"()[]{}")


	def strip_cot(text):
	    """Return *text* with every match of the ``"cot"`` pattern removed.

	    The result is also stripped of leading and trailing whitespace.
	    """
	    without_reasoning = _RE_COMPILED["cot"].sub("", text)
	    return without_reasoning.strip()


	def strip_markdown(text):
	    """Apply the markdown-stripping substitutions to *text*, in fixed order.

	    Each step looks up a pattern in ``_RE_COMPILED`` and substitutes either
	    the empty string (the match is dropped) or ``\\1`` (the match is replaced
	    by its first capture group). The final string is whitespace-stripped.
	    """
	    # (pattern key, replacement) pairs. Order matters: later patterns see
	    # the output of earlier ones.
	    substitutions = (
	        ("code_block", ""),
	        ("inline_code", ""),
	        ("bold", r"\1"),
	        ("italic_ast", r"\1"),
	        ("italic_under", r"\1"),
	        ("under", r"\1"),
	        ("header", ""),
	        ("bullet", ""),
	        ("numbered", ""),
	        ("link", r"\1"),
	        ("quote", ""),
	        ("hr", ""),
	    )
	    for pattern_key, replacement in substitutions:
	        text = _RE_COMPILED[pattern_key].sub(replacement, text)
	    return text.strip()


	class StylometricFeatures(BaseEstimator, TransformerMixin):
	    """Stateless transformer turning each text into a row of style statistics.

	    ``fit`` learns nothing; ``transform`` returns a float32 CSR matrix with
	    one row per input text and one column per value produced by
	    ``_extract``. The column ORDER is a contract with any fitted scaler or
	    model downstream: never reorder, insert or drop entries in the list
	    returned by ``_extract``.
	    """

	    # Sentences are delimited by runs of terminal punctuation.
	    _SENTENCE_SPLIT = re.compile(r"[.!?]+")
	    # Characters trimmed from both ends of a token before vocabulary lookups.
	    _TOKEN_TRIM = ".,!?;:'\"()[]{}"

	    def fit(self, X, y=None):
	        """No-op; present so the class works as a scikit-learn transformer."""
	        return self

	    def transform(self, X):
	        """Extract features for every text in *X*.

	        Args:
	            X: iterable of strings.

	        Returns:
	            ``scipy.sparse.csr_matrix`` of dtype float32 with shape
	            ``(number of texts, number of features)``.
	        """
	        rows = [self._extract(t) for t in X]
	        if not rows:
	            # np.array([]) is 1-D and would become a (1, 0) matrix; build
	            # an explicitly shaped empty matrix so the column count is right.
	            n_features = len(self._extract(""))
	            return csr_matrix((0, n_features), dtype=np.float32)
	        return csr_matrix(np.array(rows, dtype=np.float32))

	    def _extract(self, text):
	        """Compute the feature list for a single string.

	        Args:
	            text: the text to analyse (may be empty).

	        Returns:
	            A list of 103 numbers. Densities are normalised by character,
	            word, sentence or non-empty-line counts, each clamped to at
	            least 1 so that empty input produces zeros without dividing
	            by zero.
	        """
	        _rc = _RE_COMPILED
	        _ps = _PRONOUN_SETS
	        _ds = _DISCOURSE_SETS

	        # ---- basic segmentation and (clamped) denominators ----------------
	        n_chars = max(len(text), 1)
	        words = text.split()
	        n_words = max(len(words), 1)

	        sentences = [
	            s.strip() for s in self._SENTENCE_SPLIT.split(text) if s.strip()
	        ]
	        n_sentences = max(len(sentences), 1)

	        non_empty_paras = [p for p in text.split("\n\n") if p.strip()]
	        n_paragraphs = len(non_empty_paras)  # deliberately NOT clamped

	        non_empty_lines = [ln for ln in text.split("\n") if ln.strip()]
	        n_lines = max(len(non_empty_lines), 1)

	        word_lens = [len(w) for w in words]
	        sent_lens = [len(s.split()) for s in sentences]
	        words_lower = [w.lower().strip(self._TOKEN_TRIM) for w in words]
	        sentences_starters = [
	            s.split()[0].lower() if s.split() else "" for s in sentences
	        ]

	        # ---- small local helpers for the repeated ratio shapes ------------
	        def flag(condition):
	            return 1.0 if condition else 0.0

	        def per_char(*needles):
	            # Non-overlapping occurrences of the given substrings per char.
	            return sum(text.count(needle) for needle in needles) / n_chars

	        def per_sentence(key):
	            # Regex match count per sentence.
	            return len(_rc[key].findall(text)) / n_sentences

	        def vocab_share(vocab):
	            # Fraction of normalised tokens found in *vocab*.
	            return sum(1 for w in words_lower if w in vocab) / n_words

	        def starter_share(*first_words):
	            # Fraction of sentences whose first word is one of *first_words*.
	            hits = sum(1 for s in sentences_starters if s in first_words)
	            return hits / n_sentences

	        # ---- word and sentence length statistics ---------------------------
	        avg_word_len = np.mean(word_lens) if words else 0.0
	        word_len_std = np.std(word_lens) if len(words) > 1 else 0.0
	        median_word_len = np.median(word_lens) if words else 0.0
	        avg_sent_len = n_words / n_sentences

	        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0
	        sent_len_max = max(sent_lens) if sent_lens else 0
	        sent_len_min = min(sent_lens) if sent_lens else 0
	        sent_len_median = np.median(sent_lens) if sent_lens else 0.0
	        sent_len_range = sent_len_max - sent_len_min

	        # ---- punctuation densities -----------------------------------------
	        n_commas = per_char(",")
	        n_semicolons = per_char(";")
	        n_colons = per_char(":")
	        n_dash = per_char("—", "–", "--")
	        n_parens = per_char("(", ")")
	        n_quotes = per_char('"', "'")
	        n_exclaim = per_char("!")
	        n_question = per_char("?")
	        n_period = per_char(".")
	        n_ellipsis = per_char("...", "…")

	        # The 0.001 offset keeps the ratios finite when the divisor is zero.
	        comma_colon_ratio = n_commas / (n_colons + 0.001)
	        comma_period_ratio = n_commas / (n_period + 0.001)
	        excl_question_ratio = n_exclaim / (n_question + 0.001)
	        commas_per_sentence = text.count(",") / n_sentences

	        # ---- markdown structure --------------------------------------------
	        n_headers = per_sentence("markdown_header")
	        n_bold = per_sentence("markdown_bold")
	        n_code_blocks = per_sentence("markdown_code_block")
	        n_inline_code = per_sentence("markdown_inline_code")
	        n_bullet = per_sentence("markdown_bullet")
	        n_numbered = per_sentence("markdown_numbered")
	        n_tables = per_sentence("markdown_table")
	        has_list = flag(n_bullet > 0 or n_numbered > 0)
	        list_items = n_bullet + n_numbered

	        # ---- character-level ratios ----------------------------------------
	        newline_density = text.count("\n") / n_chars
	        double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
	        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
	        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
	        space_ratio = sum(1 for c in text if c.isspace()) / n_chars
	        unique_chars = len(set(text)) / n_chars
	        unique_chars_ratio = len(set(text.lower())) / n_chars
	        special_unicode = len(_rc["special_unicode"].findall(text)) / n_chars

	        # ---- presence flags ------------------------------------------------
	        has_think = flag(_rc["think_tag"].search(text))
	        has_xml = flag(_rc["xml_tag"].search(text))
	        has_hr = flag(_rc["hr"].search(text))
	        has_url = flag(_rc["url"].search(text))
	        has_self_id_ai = flag(_rc["self_id_ai"].search(text))
	        has_provider_mention = flag(_rc["provider_mention"].search(text))
	        has_closing_offer = flag(_rc["closing_offer"].search(text))
	        ends_with_question = flag(text.rstrip().endswith("?"))
	        has_checkmark = flag(any(c in text for c in "✓✗✔✘"))
	        has_arrow = flag(any(c in text for c in "→←➡"))
	        has_star = flag(any(c in text for c in "⭐★☆"))

	        # ---- pronouns and vocabulary richness ------------------------------
	        first_person_ratio = vocab_share(_ps["first"])
	        second_person_ratio = vocab_share(_ps["second"])
	        third_person_ratio = vocab_share(_ps["third"])

	        ttr = len(set(words_lower)) / n_words
	        word_counts = {}
	        for w in words_lower:
	            word_counts[w] = word_counts.get(w, 0) + 1
	        # Hapax legomena: tokens that occur exactly once.
	        hapax = sum(1 for c in word_counts.values() if c == 1)
	        hapax_ratio = hapax / n_words

	        contraction_ratio = len(_rc["contraction"].findall(text)) / n_words

	        # ---- sentence openers ----------------------------------------------
	        starter_vocab = len(set(sentences_starters)) / n_sentences
	        and_starts = starter_share("and")
	        but_starts = starter_share("but")
	        so_starts = starter_share("so")
	        the_starts = starter_share("the")
	        it_starts = starter_share("it", "it's")
	        i_starts = starter_share("i", "i'm", "i've")

	        # ---- word-length buckets (on normalised tokens) --------------------
	        short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
	        medium_word_ratio = (
	            sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
	        )
	        long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
	        very_long_word_ratio = (
	            sum(1 for w in words_lower if len(w) >= 10) / n_words
	        )

	        # ---- paragraphs and lines ------------------------------------------
	        para_lens = (
	            [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
	        )
	        avg_para_len = np.mean(para_lens)
	        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0

	        avg_line_len = (
	            np.mean([len(ln) for ln in non_empty_lines])
	            if non_empty_lines
	            else 0.0
	        )
	        short_lines_ratio = (
	            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
	        )

	        # ---- discourse vocabulary ------------------------------------------
	        conjunction_ratio = vocab_share(_ds["conjunctions"])
	        discourse_ratio = vocab_share(_ds["discourse"])
	        hedging_ratio = vocab_share(_ds["hedging"])
	        certainty_ratio = vocab_share(_ds["certainty"])
	        transition_ratio = vocab_share(_ds["transition"])
	        casual_ratio = vocab_share(_ds["casual"])
	        formal_ratio = vocab_share(_ds["formal"])

	        # ---- questions and exclamations ------------------------------------
	        question_starts = sum(
	            1 for s in sentences if s and _rc["question_start"].search(s.lower())
	        )
	        rhetorical_q = sum(1 for s in text.split("\n") if s.strip().endswith("?"))
	        rhetorical_ratio = rhetorical_q / n_sentences

	        question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
	        question_line_ratio = len(question_lines) / n_lines

	        # NOTE(review): `sentences` is produced by splitting on [.!?]+, so no
	        # element can end with "!" and this column is always 0. Left as-is
	        # because changing it would shift a feature that already-fitted
	        # scalers/models were trained on; fix together with a retrain.
	        excl_sentences = sum(1 for s in sentences if s.strip().endswith("!"))
	        excl_sentence_ratio = excl_sentences / n_sentences

	        # ---- miscellaneous counts ------------------------------------------
	        emoji_count = len(_rc["emoji"].findall(text))
	        has_emoji = flag(emoji_count > 0)

	        # Counted on the raw tokens: purely alphabetic, fully upper-case,
	        # longer than one character.
	        all_caps_words = sum(
	            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
	        )
	        all_caps_ratio = all_caps_words / n_words
	        cap_word_ratio = len(_rc["all_caps"].findall(text)) / n_words

	        paren_ratio = per_sentence("paren")
	        phrase_ratio = per_sentence("four_word")
	        sent_boundary_ratio = per_sentence("sent_boundary")
	        colon_definitions = per_sentence("colon_def")
	        double_quote_pairs = per_sentence("double_quote")
	        single_quote_pairs = per_sentence("single_quote")
	        greeting_ratio = per_sentence("greeting")
	        helpful_ratio = per_sentence("helpful")
	        conv_phrase_ratio = len(_rc["conv_phrase"].findall(text)) / n_words

	        chinese_chars = len(_rc["chinese"].findall(text))
	        has_chinese = flag(chinese_chars > 0)
	        chinese_ratio = chinese_chars / n_chars

	        # ---- length buckets (one-hot on the clamped word count) ------------
	        is_short = flag(n_words < 100)
	        is_medium = flag(100 <= n_words < 500)
	        is_long = flag(n_words >= 500)

	        # Column order below is load-bearing; see the class docstring.
	        return [
	            avg_word_len,
	            word_len_std,
	            median_word_len,
	            avg_sent_len,
	            sent_len_std,
	            sent_len_max,
	            sent_len_min,
	            sent_len_median,
	            sent_len_range,
	            commas_per_sentence,
	            n_commas,
	            n_semicolons,
	            n_colons,
	            n_dash,
	            n_parens,
	            n_quotes,
	            n_exclaim,
	            n_question,
	            n_period,
	            n_ellipsis,
	            comma_colon_ratio,
	            comma_period_ratio,
	            excl_question_ratio,
	            n_headers,
	            n_bold,
	            n_code_blocks,
	            n_inline_code,
	            n_bullet,
	            n_numbered,
	            n_tables,
	            has_list,
	            newline_density,
	            double_newline_ratio,
	            uppercase_ratio,
	            digit_ratio,
	            space_ratio,
	            unique_chars,
	            unique_chars_ratio,
	            list_items,
	            n_paragraphs,
	            n_lines / n_sentences,
	            has_think,
	            has_xml,
	            has_hr,
	            has_url,
	            first_person_ratio,
	            second_person_ratio,
	            third_person_ratio,
	            ttr,
	            hapax_ratio,
	            contraction_ratio,
	            short_word_ratio,
	            medium_word_ratio,
	            long_word_ratio,
	            very_long_word_ratio,
	            starter_vocab,
	            and_starts,
	            but_starts,
	            so_starts,
	            the_starts,
	            it_starts,
	            avg_para_len,
	            para_len_std,
	            conjunction_ratio,
	            discourse_ratio,
	            hedging_ratio,
	            certainty_ratio,
	            transition_ratio,
	            question_starts / n_sentences,
	            emoji_count,
	            has_emoji,
	            special_unicode,
	            all_caps_ratio,
	            paren_ratio,
	            rhetorical_ratio,
	            casual_ratio,
	            formal_ratio,
	            has_chinese,
	            chinese_ratio,
	            has_self_id_ai,
	            has_provider_mention,
	            ends_with_question,
	            has_closing_offer,
	            has_checkmark,
	            has_arrow,
	            has_star,
	            avg_line_len,
	            short_lines_ratio,
	            cap_word_ratio,
	            phrase_ratio,
	            sent_boundary_ratio,
	            colon_definitions,
	            double_quote_pairs,
	            single_quote_pairs,
	            i_starts,
	            greeting_ratio,
	            is_short,
	            is_medium,
	            is_long,
	            excl_sentence_ratio,
	            question_line_ratio,
	            conv_phrase_ratio,
	            helpful_ratio,
	        ]


	class StyleOnlyPipeline:
	    """Feature pipeline using ONLY stylometric features — no TF-IDF.

	    Texts are cleaned with ``strip_cot`` then ``strip_markdown``, turned
	    into stylometric rows, and scaled with a ``MaxAbsScaler``.
	    """

	    def __init__(self):
	        self.stylo = StylometricFeatures()
	        self.scaler = MaxAbsScaler()

	    def fit_transform(self, texts):
	        """Fit the scaler on *texts* and return the scaled feature matrix.

	        Prints the feature count, the extraction time and the final shape.
	        """
	        import time

	        cleaned = [strip_markdown(strip_cot(raw)) for raw in texts]

	        # Only the stylometric extraction is timed, not the cleaning above.
	        started = time.time()
	        raw_features = self.stylo.transform(cleaned)
	        elapsed = time.time() - started
	        n_columns = raw_features.shape[1]
	        print(f" Stylometric: {n_columns} features ({elapsed:.1f}s)")

	        scaled = self.scaler.fit_transform(raw_features)
	        print(f" Final feature matrix: {scaled.shape}")
	        return scaled

	    def transform(self, texts):
	        """Return scaled features for *texts* using the already-fitted scaler."""
	        cleaned = [strip_markdown(strip_cot(raw)) for raw in texts]
	        return self.scaler.transform(self.stylo.transform(cleaned))


	class FeaturePipeline:
	    """Word TF-IDF + char TF-IDF + stylometric features, max-abs scaled.

	    Vectorizer settings come from ``TFIDF_WORD_PARAMS`` and
	    ``TFIDF_CHAR_PARAMS``. In those dicts ``max_features == 0`` means
	    "this vectorizer is disabled"; a missing or ``None`` ``max_features``
	    means "enabled, no vocabulary cap" (the TfidfVectorizer default).

	    Args:
	        use_tfidf: master switch. When false, BOTH TF-IDF vectorizers are
	            skipped and only stylometric features are produced.

	    Attributes:
	        use_tfidf: True when at least one TF-IDF vectorizer is active.
	    """

	    def __init__(self, use_tfidf=True):
	        word_params = dict(TFIDF_WORD_PARAMS)
	        char_params = dict(TFIDF_CHAR_PARAMS)

	        # Decide once, per vectorizer, whether it participates. Both
	        # fit_transform and transform read these flags so they can never
	        # disagree about the column layout.
	        self._use_word_tfidf = bool(use_tfidf) and self._is_enabled(word_params)
	        self._use_char_tfidf = bool(use_tfidf) and self._is_enabled(char_params)

	        # 0 is only our "disabled" marker, not a usable TfidfVectorizer
	        # setting, so replace it before constructing the (unused) vectorizer.
	        for params in (word_params, char_params):
	            if params.get("max_features", 1) == 0:
	                params["max_features"] = None

	        self.word_tfidf = TfidfVectorizer(**word_params)
	        self.char_tfidf = TfidfVectorizer(**char_params)
	        self.stylo = StylometricFeatures()
	        self.scaler = MaxAbsScaler()
	        self.use_tfidf = self._use_word_tfidf or self._use_char_tfidf

	    @staticmethod
	    def _is_enabled(params):
	        """Return True unless *params* sets ``max_features`` to 0 or less."""
	        limit = params.get("max_features")
	        return limit is None or limit > 0

	    def _active_vectorizers(self):
	        """Return ``(use_word, use_char)`` for this instance.

	        Instances unpickled from before the explicit flags existed lack
	        them; for those, reproduce the rule they were fitted under so their
	        column layout still matches the fitted scaler.
	        """
	        try:
	            return self._use_word_tfidf, self._use_char_tfidf
	        except AttributeError:
	            limit = self.word_tfidf.max_features
	            return (limit is not None and limit > 0), bool(self.use_tfidf)

	    def _clean_for_tfidf(self, text):
	        """Remove chain-of-thought blocks, then markdown, from *text*."""
	        return strip_markdown(strip_cot(text))

	    def _empty_block(self, n_rows):
	        """Zero-column placeholder so hstack keeps a stable row count."""
	        return csr_matrix((n_rows, 0), dtype=np.float32)

	    def fit_transform(self, texts):
	        """Fit the active vectorizers and the scaler; return scaled features.

	        Args:
	            texts: sized collection of strings (``len(texts)`` is used).

	        Returns:
	            Sparse matrix with columns ordered word TF-IDF, char TF-IDF,
	            stylometric, as returned by ``MaxAbsScaler.fit_transform``.
	        """
	        import time

	        print(f" Input: {len(texts)} texts", flush=True)

	        texts_clean = [self._clean_for_tfidf(t) for t in texts]
	        use_word, use_char = self._active_vectorizers()

	        if use_word:
	            t0 = time.time()
	            word_features = self.word_tfidf.fit_transform(texts_clean)
	            print(
	                f" word tfidf: {word_features.shape[1]} features ({time.time() - t0:.1f}s)",
	                flush=True,
	            )
	        else:
	            word_features = self._empty_block(len(texts))

	        if use_char:
	            t0 = time.time()
	            char_features = self.char_tfidf.fit_transform(texts_clean)
	            print(
	                f" char tfidf: {char_features.shape[1]} features ({time.time() - t0:.1f}s)",
	                flush=True,
	            )
	        else:
	            char_features = self._empty_block(len(texts))

	        t0 = time.time()
	        stylo_features = self.stylo.transform(texts_clean)
	        print(
	            f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
	            flush=True,
	        )

	        combined = hstack([word_features, char_features, stylo_features])
	        combined = self.scaler.fit_transform(combined)
	        print(f" Combined feature matrix: {combined.shape}", flush=True)
	        return combined

	    def transform(self, texts):
	        """Return scaled features for *texts* using the fitted components.

	        Uses the same vectorizer selection and column order as
	        ``fit_transform``.
	        """
	        texts_clean = [self._clean_for_tfidf(t) for t in texts]
	        use_word, use_char = self._active_vectorizers()

	        if use_word:
	            word_features = self.word_tfidf.transform(texts_clean)
	        else:
	            word_features = self._empty_block(len(texts))

	        if use_char:
	            char_features = self.char_tfidf.transform(texts_clean)
	        else:
	            char_features = self._empty_block(len(texts))

	        stylo_features = self.stylo.transform(texts_clean)
	        combined = hstack([word_features, char_features, stylo_features])
	        return self.scaler.transform(combined)