| """
|
| Modu艂 do ekstrakcji zaawansowanych cech lingwistycznych przy u偶yciu spaCy.
|
| """
|
| import re
|
| import math
|
| from collections import Counter
|
| from statistics import mean, variance
|
| from typing import Dict, List
|
|
|
| import textstat
|
| import spacy
|
|
|
| from ..utils import safe_divide
|
| from ..constants import CAMEL_CASE_PATTERN
|
|
|
|
|
|
|
def analyze_pos_stats(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Compute part-of-speech, punctuation and stopword statistics.

    Counts are taken over the whole ``doc``; the three ``*_ratio`` values
    are normalised by the number of content words (tokens that are not
    punctuation, whitespace or symbols).
    """
    content_tokens = [
        tok for tok in doc
        if not tok.is_punct and not tok.is_space and tok.pos_ != 'SYM'
    ]
    n_words = len(content_tokens)

    # Empty document: return an all-zero feature vector with the same keys.
    if n_words == 0:
        return {'words': 0, 'nouns': 0, 'verbs': 0, 'adjectives': 0, 'adverbs': 0,
                'punctuations': 0, 'symbols': 0, 'stopwords': 0, 'oovs': 0,
                'pos_x': 0, 'pos_num': 0, 'noun_ratio': 0.0, 'verb_ratio': 0.0,
                'adj_ratio': 0.0}

    def n_with_pos(tag: str) -> int:
        # Number of tokens in the full doc carrying the given coarse POS tag.
        return sum(1 for tok in doc if tok.pos_ == tag)

    stats = {
        'words': n_words,
        'nouns': n_with_pos("NOUN"),
        'verbs': n_with_pos("VERB"),
        'adjectives': n_with_pos("ADJ"),
        'adverbs': n_with_pos("ADV"),
        'punctuations': sum(1 for tok in doc if tok.is_punct),
        'symbols': n_with_pos("SYM"),
        'stopwords': sum(1 for tok in doc if tok.is_stop),
        'oovs': sum(1 for tok in doc if tok.is_oov),
        'pos_x': n_with_pos("X"),
        'pos_num': n_with_pos("NUM"),
    }
    for ratio_key, count_key in (('noun_ratio', 'nouns'),
                                 ('verb_ratio', 'verbs'),
                                 ('adj_ratio', 'adjectives')):
        stats[ratio_key] = safe_divide(stats[count_key], n_words)
    return stats
|
|
|
def analyze_doc_level_stats(doc: spacy.tokens.Doc, text: str) -> Dict[str, float]:
    """Compute document-level features (lengths, density, readability).

    NOTE(review): 'capitalized_words' counts fully-uppercase tokens
    (``isupper``), not merely title-cased ones — confirm the name is
    intentional with downstream consumers.
    """
    content_tokens = [
        tok for tok in doc
        if not tok.is_punct and not tok.is_space and tok.pos_ != 'SYM'
    ]
    n_words = len(content_tokens)
    n_sentences = sum(1 for _ in doc.sents)

    total_word_chars = sum(len(tok.text) for tok in content_tokens)
    distinct_lemmas = {tok.lemma_ for tok in content_tokens}

    return {
        'sentences': n_sentences,
        'avg_word_length': safe_divide(total_word_chars, n_words),
        'avg_sentence_length': safe_divide(n_words, n_sentences),
        # Unique-lemma / word ratio (a type-token-style measure).
        'lexical_density': safe_divide(len(distinct_lemmas), n_words),
        # textstat needs non-blank input; fall back to 0.0 for empty text.
        'gunning_fog': textstat.gunning_fog(text) if text.strip() else 0.0,
        'camel_case': sum(1 for tok in content_tokens
                          if CAMEL_CASE_PATTERN.match(tok.text)),
        'capitalized_words': sum(1 for tok in content_tokens
                                 if tok.text.isupper()),
    }
|
|
|
|
|
|
|
def analyze_named_entities(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Analyze recognized named entities (NER).

    Ratios are normalised by the number of alphabetic tokens.  Label names
    (persName, orgName, placeName, locName) follow the Polish NER scheme
    used by the loaded model.
    """
    n_alpha = sum(1 for tok in doc if tok.is_alpha)
    if n_alpha == 0:
        return {"ner_count": 0, "ner_person_ratio": 0.0, "ner_org_ratio": 0.0,
                "ner_loc_ratio": 0.0, "ner_misc_ratio": 0.0}

    label_counts = Counter(ent.label_ for ent in doc.ents)
    n_entities = sum(label_counts.values())

    n_person = label_counts["persName"]
    n_org = label_counts["orgName"]
    n_loc = label_counts["placeName"] + label_counts["locName"]
    # Everything that is not person/org/location counts as miscellaneous.
    n_misc = n_entities - n_person - n_org - n_loc

    return {
        "ner_count": n_entities,
        "ner_person_ratio": safe_divide(n_person, n_alpha),
        "ner_org_ratio": safe_divide(n_org, n_alpha),
        "ner_loc_ratio": safe_divide(n_loc, n_alpha),
        "ner_misc_ratio": safe_divide(n_misc, n_alpha),
    }
|
|
|
def analyze_morphology(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Measure morphological diversity (distinct Case/Tense/Mood values).

    Each diversity score is the number of distinct feature values observed,
    normalised by the number of alphabetic tokens.
    """
    alpha_tokens = [tok for tok in doc if tok.is_alpha]
    n_alpha = len(alpha_tokens)
    if n_alpha == 0:
        return {"case_diversity": 0.0, "tense_diversity": 0.0, "mood_diversity": 0.0}

    seen = {"Case": set(), "Tense": set(), "Mood": set()}
    for tok in alpha_tokens:
        # Skip tokens with an empty morphological analysis.
        if tok.morph:
            for feature, values in seen.items():
                values.update(tok.morph.get(feature, []))

    return {
        "case_diversity": safe_divide(len(seen["Case"]), n_alpha),
        "tense_diversity": safe_divide(len(seen["Tense"]), n_alpha),
        "mood_diversity": safe_divide(len(seen["Mood"]), n_alpha),
    }
|
|
|
def analyze_dependency_complexity(doc: spacy.tokens.Doc) -> Dict[str, float]:
    """Compute the average maximum dependency-tree depth per sentence."""
    sentence_depths = []
    for sentence in doc.sents:
        tokens = list(sentence)
        if not tokens:
            continue
        deepest = 0
        for tok in tokens:
            steps = 0
            node = tok
            # Walk toward the sentence root; the 100-step cap guards
            # against pathological head loops in a malformed parse.
            while node.head != node and steps < 100:
                node = node.head
                steps += 1
            if steps > deepest:
                deepest = steps
        sentence_depths.append(deepest)
    if not sentence_depths:
        return {"avg_dependency_tree_depth": 0.0}
    return {"avg_dependency_tree_depth": mean(sentence_depths)}
|
|
|
def analyze_pos_frequencies(doc: spacy.tokens.Doc, top_k=10) -> Dict[str, float]:
    """Analyze the POS distribution of the ``top_k`` most frequent words.

    Args:
        doc: Parsed spaCy document.
        top_k: How many of the most frequent (lowercased, alphabetic)
            word forms to inspect.

    Returns:
        Ratios of NOUN/VERB/ADJ/other among tokens belonging to the top
        words, plus the proportion of each POS's total count covered by
        the top words. All-zero dict when the doc has no alphabetic tokens.
    """
    tokens = [t for t in doc if t.is_alpha]
    if not tokens:
        return {"top_words_total_count": 0, "top_words_noun_ratio": 0.0, "top_words_verb_ratio": 0.0,
                "top_words_adj_ratio": 0.0, "top_words_other_ratio": 0.0, "top_words_noun_prop_of_all_nouns": 0.0,
                "top_words_verb_prop_of_all_verbs": 0.0, "top_words_adj_prop_of_all_adjs": 0.0,
                "top_words_other_prop_of_all_others": 0.0}

    word_counts = Counter(t.text.lower() for t in tokens)
    # Use a set for O(1) membership tests (was a list -> O(top_k) per token).
    top_words = {w for w, _ in word_counts.most_common(top_k)}

    top_tokens = [t for t in tokens if t.text.lower() in top_words]
    total_top_count = len(top_tokens)

    top_noun = sum(1 for t in top_tokens if t.pos_ == 'NOUN')
    top_verb = sum(1 for t in top_tokens if t.pos_ == 'VERB')
    top_adj = sum(1 for t in top_tokens if t.pos_ == 'ADJ')
    top_other = total_top_count - (top_noun + top_verb + top_adj)

    total_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
    total_verbs = sum(1 for t in tokens if t.pos_ == "VERB")
    total_adjs = sum(1 for t in tokens if t.pos_ == "ADJ")
    total_others = len(tokens) - (total_nouns + total_verbs + total_adjs)

    return {
        "top_words_total_count": total_top_count,
        "top_words_noun_ratio": safe_divide(top_noun, total_top_count),
        "top_words_verb_ratio": safe_divide(top_verb, total_top_count),
        "top_words_adj_ratio": safe_divide(top_adj, total_top_count),
        "top_words_other_ratio": safe_divide(top_other, total_top_count),
        "top_words_noun_prop_of_all_nouns": safe_divide(top_noun, total_nouns),
        "top_words_verb_prop_of_all_verbs": safe_divide(top_verb, total_verbs),
        "top_words_adj_prop_of_all_adjs": safe_divide(top_adj, total_adjs),
        "top_words_other_prop_of_all_others": safe_divide(top_other, total_others),
    }
|
|
|
|
|
|
|
def compute_readability_indices(text: str, sentences: List[str]) -> Dict[str, float]:
    """Compute the LIX and RIX readability indices.

    LIX = words/sentences + 100 * long_words/words
    RIX = long_words/sentences  (Anderson, 1983)

    A "long word" has more than 6 characters. Blank text yields zeros.

    Args:
        text: Raw document text.
        sentences: Pre-split sentences of ``text``.
    """
    if not text.strip():
        return {"lix": 0.0, "rix": 0.0}
    words = re.findall(r'\w+', text)
    num_words = len(words)
    num_sentences = len(sentences)
    long_words = sum(1 for w in words if len(w) > 6)
    lix = safe_divide(num_words, num_sentences) + safe_divide(long_words * 100, num_words)
    # BUG FIX: RIX is the number of long words per *sentence*; the previous
    # formula (100 * long_words / words) just repeated LIX's second term.
    rix = safe_divide(long_words, num_sentences)
    return {"lix": lix, "rix": rix}
|
|
|
def analyze_polish_diacritics_distribution(text: str) -> Dict[str, float]:
    """Compute the population std-dev of Polish diacritic frequencies.

    Only diacritics that actually occur in ``text`` contribute; their
    frequencies are counts divided by the total character length.
    Returns 0.0 for empty text or text with no diacritics.
    """
    DIACRITICS = '膮膰臋艂艅贸艣藕偶膭膯臉艁艃脫艢殴呕'
    length = len(text)
    if length == 0:
        return {"diacritics_std_dev": 0.0}
    char_counts = Counter(text)
    freqs = [char_counts[ch] / length for ch in DIACRITICS if ch in char_counts]
    if not freqs:
        return {"diacritics_std_dev": 0.0}
    avg = mean(freqs)
    # Population variance: mean squared deviation from the mean frequency.
    pop_variance = mean((f - avg) ** 2 for f in freqs)
    return {"diacritics_std_dev": math.sqrt(pop_variance)}
|
|
|
def analyze_question_sentences(sentences: List[str]) -> Dict[str, float]:
    """Compute the ratio of question sentences to all sentences.

    A sentence counts as a question when it ends with '?' after
    stripping surrounding whitespace.
    """
    if not sentences:
        return {"question_sentence_ratio": 0.0}
    question_count = len([s for s in sentences if s.strip().endswith('?')])
    return {"question_sentence_ratio": safe_divide(question_count, len(sentences))}
|
|
|
|
|
|
|
def calculate_all_spacy_features(doc: spacy.tokens.Doc, text: str, sentences: List[str]) -> Dict[str, float]:
    """Aggregate every advanced linguistic feature into one flat dict.

    Later analyzers overwrite earlier ones on key collisions (the feature
    sets are designed to be disjoint).
    """
    partial_results = (
        analyze_pos_stats(doc),
        analyze_doc_level_stats(doc, text),
        analyze_named_entities(doc),
        analyze_morphology(doc),
        analyze_dependency_complexity(doc),
        analyze_pos_frequencies(doc),
        compute_readability_indices(text, sentences),
        analyze_polish_diacritics_distribution(text),
        analyze_question_sentences(sentences),
    )
    merged: Dict[str, float] = {}
    for part in partial_results:
        merged.update(part)
    return merged