| """
|
| Modu艂 do ekstrakcji zaawansowanych cech lingwistycznych przy u偶yciu spaCy.
|
| """
|
| import re
|
| import math
|
| from collections import Counter
|
| from statistics import mean, variance
|
| from typing import Dict, List
|
|
|
| import textstat
|
| import spacy
|
|
|
| from ..utils import safe_divide
|
| from ..constants import CAMEL_CASE_PATTERN
|
|
|
|
|
|
|
def analyze_pos_stats(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Compute part-of-speech, punctuation and stopword statistics.

    Counts are taken over the whole ``doc``; the three ``*_ratio`` values
    are normalised by the number of content words (tokens that are not
    punctuation, whitespace or symbols).
    """
    content_tokens = [
        tok for tok in doc
        if not tok.is_punct and not tok.is_space and tok.pos_ != 'SYM'
    ]
    n_words = len(content_tokens)

    # Empty document: return an all-zero feature vector with the same keys.
    if n_words == 0:
        return {'words': 0, 'nouns': 0, 'verbs': 0, 'adjectives': 0, 'adverbs': 0,
                'punctuations': 0, 'symbols': 0, 'stopwords': 0, 'oovs': 0,
                'pos_x': 0, 'pos_num': 0, 'noun_ratio': 0.0, 'verb_ratio': 0.0,
                'adj_ratio': 0.0}

    def n_with_pos(tag: str) -> int:
        # Number of tokens in the full doc carrying the given coarse POS tag.
        return sum(1 for tok in doc if tok.pos_ == tag)

    stats = {
        'words': n_words,
        'nouns': n_with_pos("NOUN"),
        'verbs': n_with_pos("VERB"),
        'adjectives': n_with_pos("ADJ"),
        'adverbs': n_with_pos("ADV"),
        'punctuations': sum(1 for tok in doc if tok.is_punct),
        'symbols': n_with_pos("SYM"),
        'stopwords': sum(1 for tok in doc if tok.is_stop),
        'oovs': sum(1 for tok in doc if tok.is_oov),
        'pos_x': n_with_pos("X"),
        'pos_num': n_with_pos("NUM"),
    }
    for ratio_key, count_key in (('noun_ratio', 'nouns'),
                                 ('verb_ratio', 'verbs'),
                                 ('adj_ratio', 'adjectives')):
        stats[ratio_key] = safe_divide(stats[count_key], n_words)
    return stats
|
|
|
def analyze_doc_level_stats(doc: spacy.tokens.Doc, text: str) -> Dict[str, float]:
    """Compute document-level features (lengths, density, readability).

    NOTE(review): 'capitalized_words' counts fully-uppercase tokens
    (``isupper``), not merely title-cased ones — confirm the name is
    intentional with downstream consumers.
    """
    content_tokens = [
        tok for tok in doc
        if not tok.is_punct and not tok.is_space and tok.pos_ != 'SYM'
    ]
    n_words = len(content_tokens)
    n_sentences = sum(1 for _ in doc.sents)

    total_word_chars = sum(len(tok.text) for tok in content_tokens)
    distinct_lemmas = {tok.lemma_ for tok in content_tokens}

    return {
        'sentences': n_sentences,
        'avg_word_length': safe_divide(total_word_chars, n_words),
        'avg_sentence_length': safe_divide(n_words, n_sentences),
        # Unique-lemma / word ratio (a type-token-style measure).
        'lexical_density': safe_divide(len(distinct_lemmas), n_words),
        # textstat needs non-blank input; fall back to 0.0 for empty text.
        'gunning_fog': textstat.gunning_fog(text) if text.strip() else 0.0,
        'camel_case': sum(1 for tok in content_tokens
                          if CAMEL_CASE_PATTERN.match(tok.text)),
        'capitalized_words': sum(1 for tok in content_tokens
                                 if tok.text.isupper()),
    }
|
|
|
|
|
|
|
def analyze_named_entities(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Analyze recognized named entities (NER).

    Ratios are normalised by the number of alphabetic tokens.  Label names
    (persName, orgName, placeName, locName) follow the Polish NER scheme
    used by the loaded model.
    """
    n_alpha = sum(1 for tok in doc if tok.is_alpha)
    if n_alpha == 0:
        return {"ner_count": 0, "ner_person_ratio": 0.0, "ner_org_ratio": 0.0,
                "ner_loc_ratio": 0.0, "ner_misc_ratio": 0.0}

    label_counts = Counter(ent.label_ for ent in doc.ents)
    n_entities = sum(label_counts.values())

    n_person = label_counts["persName"]
    n_org = label_counts["orgName"]
    n_loc = label_counts["placeName"] + label_counts["locName"]
    # Everything that is not person/org/location counts as miscellaneous.
    n_misc = n_entities - n_person - n_org - n_loc

    return {
        "ner_count": n_entities,
        "ner_person_ratio": safe_divide(n_person, n_alpha),
        "ner_org_ratio": safe_divide(n_org, n_alpha),
        "ner_loc_ratio": safe_divide(n_loc, n_alpha),
        "ner_misc_ratio": safe_divide(n_misc, n_alpha),
    }
|
|
|
def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Measure morphological diversity (distinct Case/Tense/Mood values).

    Each diversity score is the number of distinct feature values observed,
    normalised by the number of alphabetic tokens.
    """
    alpha_tokens = [tok for tok in doc if tok.is_alpha]
    n_alpha = len(alpha_tokens)
    if n_alpha == 0:
        return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0}

    seen = {"Case": set(), "Tense": set(), "Mood": set()}
    for tok in alpha_tokens:
        # Skip tokens with an empty morphological analysis.
        if tok.morph:
            for feature, values in seen.items():
                values.update(tok.morph.get(feature, []))

    return {
        "case_diversity": safe_divide(len(seen["Case"]), n_alpha),
        "tense_diversity": safe_divide(len(seen["Tense"]), n_alpha),
        "mood_diversity": safe_divide(len(seen["Mood"]), n_alpha),
    }
|
|
|
def analyze_dependency_complexity(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Compute the average maximum dependency-tree depth per sentence."""
    sentence_depths = []
    for sentence in doc.sents:
        tokens = list(sentence)
        if not tokens:
            continue
        deepest = 0
        for tok in tokens:
            steps = 0
            node = tok
            # Walk toward the sentence root; the 100-step cap guards
            # against pathological head loops in a malformed parse.
            while node.head != node and steps < 100:
                node = node.head
                steps += 1
            if steps > deepest:
                deepest = steps
        sentence_depths.append(deepest)
    if not sentence_depths:
        return {"avg_dependency_tree_depth": 0.0}
    return {"avg_dependency_tree_depth": mean(sentence_depths)}
|
|
|
def analyze_pos_frequencies(doc: spacy.tokens.Doc, top_k=10) -> Dict[str, float]:
    """Analyze the POS distribution of the ``top_k`` most frequent words.

    Args:
        doc: Parsed spaCy document.
        top_k: How many of the most frequent (lowercased, alphabetic)
            word forms to inspect.

    Returns:
        Ratios of NOUN/VERB/ADJ/other among tokens belonging to the top
        words, plus the proportion of each POS's total count covered by
        the top words. All-zero dict when the doc has no alphabetic tokens.
    """
    tokens = [t for t in doc if t.is_alpha]
    if not tokens:
        return {"top_words_total_count": 0, "top_words_noun_ratio": 0.0, "top_words_verb_ratio": 0.0,
                "top_words_adj_ratio": 0.0, "top_words_other_ratio": 0.0, "top_words_noun_prop_of_all_nouns": 0.0,
                "top_words_verb_prop_of_all_verbs": 0.0, "top_words_adj_prop_of_all_adjs": 0.0,
                "top_words_other_prop_of_all_others": 0.0}

    word_counts = Counter(t.text.lower() for t in tokens)
    # Use a set for O(1) membership tests (was a list -> O(top_k) per token).
    top_words = {w for w, _ in word_counts.most_common(top_k)}

    top_tokens = [t for t in tokens if t.text.lower() in top_words]
    total_top_count = len(top_tokens)

    top_noun = sum(1 for t in top_tokens if t.pos_ == 'NOUN')
    top_verb = sum(1 for t in top_tokens if t.pos_ == 'VERB')
    top_adj = sum(1 for t in top_tokens if t.pos_ == 'ADJ')
    top_other = total_top_count - (top_noun + top_verb + top_adj)

    total_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
    total_verbs = sum(1 for t in tokens if t.pos_ == "VERB")
    total_adjs = sum(1 for t in tokens if t.pos_ == "ADJ")
    total_others = len(tokens) - (total_nouns + total_verbs + total_adjs)

    return {
        "top_words_total_count": total_top_count,
        "top_words_noun_ratio": safe_divide(top_noun, total_top_count),
        "top_words_verb_ratio": safe_divide(top_verb, total_top_count),
        "top_words_adj_ratio": safe_divide(top_adj, total_top_count),
        "top_words_other_ratio": safe_divide(top_other, total_top_count),
        "top_words_noun_prop_of_all_nouns": safe_divide(top_noun, total_nouns),
        "top_words_verb_prop_of_all_verbs": safe_divide(top_verb, total_verbs),
        "top_words_adj_prop_of_all_adjs": safe_divide(top_adj, total_adjs),
        "top_words_other_prop_of_all_others": safe_divide(top_other, total_others),
    }
|
|
|
|
|
|
|
def compute_readability_indices(text: str, sentences: List[str]) -> Dict[str, float]:
    """Compute the LIX and RIX readability indices.

    LIX = words/sentences + 100 * long_words/words
    RIX = long_words/sentences  (Anderson, 1983)

    A "long word" has more than 6 characters. Blank text yields zeros.

    Args:
        text: Raw document text.
        sentences: Pre-split sentences of ``text``.
    """
    if not text.strip():
        return {"lix": 0.0, "rix": 0.0}
    words = re.findall(r'\w+', text)
    num_words = len(words)
    num_sentences = len(sentences)
    long_words = sum(1 for w in words if len(w) > 6)
    lix = safe_divide(num_words, num_sentences) + safe_divide(long_words * 100, num_words)
    # BUG FIX: RIX is the number of long words per *sentence*; the previous
    # formula (100 * long_words / words) just repeated LIX's second term.
    rix = safe_divide(long_words, num_sentences)
    return {"lix": lix, "rix": rix}
|
|
|
def analyze_polish_diacritics_distribution(text: str) -> Dict[str, float]:
    """Compute the population std-dev of Polish diacritic frequencies.

    Only diacritics that actually occur in ``text`` contribute; their
    frequencies are counts divided by the total character length.
    Returns 0.0 for empty text or text with no diacritics.
    """
    DIACRITICS = '膮膰臋艂艅贸艣藕偶膭膯臉艁艃脫艢殴呕'
    length = len(text)
    if length == 0:
        return {"diacritics_std_dev": 0.0}
    char_counts = Counter(text)
    freqs = [char_counts[ch] / length for ch in DIACRITICS if ch in char_counts]
    if not freqs:
        return {"diacritics_std_dev": 0.0}
    avg = mean(freqs)
    # Population variance: mean squared deviation from the mean frequency.
    pop_variance = mean((f - avg) ** 2 for f in freqs)
    return {"diacritics_std_dev": math.sqrt(pop_variance)}
|
|
|
def analyze_question_sentences(sentences: List[str]) -> Dict[str, float]:
    """Compute the ratio of question sentences to all sentences.

    A sentence counts as a question when it ends with '?' after
    stripping surrounding whitespace.
    """
    if not sentences:
        return {"question_sentence_ratio": 0.0}
    question_count = len([s for s in sentences if s.strip().endswith('?')])
    return {"question_sentence_ratio": safe_divide(question_count, len(sentences))}
|
|
|
|
|
|
|
def calculate_all_spacy_features(doc: spacy.tokens.Doc, text: str, sentences: List[str]) -> Dict[str, float]:
    """Aggregate every advanced linguistic feature into one flat dict.

    Later analyzers overwrite earlier ones on key collisions (the feature
    sets are designed to be disjoint).
    """
    partial_results = (
        analyze_pos_stats(doc),
        analyze_doc_level_stats(doc, text),
        analyze_named_entities(doc),
        analyze_morphology(doc),
        analyze_dependency_complexity(doc),
        analyze_pos_frequencies(doc),
        compute_readability_indices(text, sentences),
        analyze_polish_diacritics_distribution(text),
        analyze_question_sentences(sentences),
    )
    merged: Dict[str, float] = {}
    for part in partial_results:
        merged.update(part)
    return merged