|
|
|
|
| import spacy
|
| from pathlib import Path
|
| import logging
|
| import itertools
|
| import re
|
| import string
|
|
|
|
|
| from src.data_management import storage
|
| from src.data_management import loaders
|
|
|
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Shared spaCy pipeline and its stop-word list. Both keep their "empty"
# defaults (None / empty set) when the model is not installed, so importing
# this module never fails; the functions that need the pipeline check `nlp`
# themselves before using it.
nlp = None
STOP_WORDS = set()
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load signals a missing model package with OSError.
    logging.error("spaCy 'en_core_web_lg' modeli bulunamadı. Lütfen indirin: python -m spacy download en_core_web_lg")
else:
    logging.info("spaCy 'en_core_web_lg' modeli başarıyla yüklendi.")
    STOP_WORDS = nlp.Defaults.stop_words
|
|
|
|
|
# Named-entity labels whose spans are accepted as concepts without any
# further keyword/pattern evidence.
TRUSTED_ENTITY_LABELS = {"PRODUCT", "ORG", "WORK_OF_ART"}

# Named-entity labels that are collected as candidates but still have to pass
# the pattern / keyword / acronym checks before they count as concepts.
OTHER_ENTITY_LABELS = {"PERSON", "EVENT", "LAW", "NORP", "FAC", "GPE", "LOC"}

# Head nouns that mark a phrase as a technical concept when the phrase ends
# with one of them (e.g. "... network", "... algorithm").
_CONCEPT_HEAD_NOUNS = (
    "learning", "network", "model", "algorithm", "system", "technique",
    "approach", "agent", "layer", "architecture", "transformer", "attention",
)
NOUN_CHUNK_PATTERNS = re.compile(
    r".*\b(" + "|".join(_CONCEPT_HEAD_NOUNS) + r")\b$",
    re.IGNORECASE,
)

# Inclusive bounds on the number of whitespace-separated words in a concept.
MIN_CONCEPT_WORDS = 1
MAX_CONCEPT_WORDS = 6

# Lower-case domain vocabulary used as evidence that a phrase is a concept.
AI_KEYWORDS = {
    "artificial intelligence",
    "machine learning",
    "deep learning",
    "neural network",
    "reinforcement learning",
    "transformer",
    "llm",
    "large language model",
    "computer vision",
    "natural language processing",
    "algorithm",
    "model",
    "gpt",
    "bert",
    "agent",
    "attention",
    "supervised",
    "unsupervised",
    "classification",
    "regression",
    "clustering",
}

# Verb lemma (optionally followed by its particle) -> relation type.
RELATION_VERBS = {
    # usage
    "use": "USES",
    "utilize": "USES",
    "apply": "USES",
    "employ": "USES",
    # improvement
    "improve": "IMPROVES",
    "enhance": "IMPROVES",
    "extend": "IMPROVES",
    "outperform": "IMPROVES",
    # derivation
    "base on": "BASED_ON",
    "rely on": "BASED_ON",
    # comparison / loose association
    "compare": "COMPARES_TO",
    "relate": "RELATED_TO",
    "associate": "RELATED_TO",
    "link": "RELATED_TO",
    # contribution
    "propose": "PROPOSES",
    "introduce": "PROPOSES",
    "develop": "PROPOSES",
}
|
|
|
# Determiner-led phrases that refer back to something already mentioned rather
# than naming a concept. Built once at import time instead of on every call.
_GENERIC_PHRASES = frozenset({
    "this approach", "these models", "this technique", "this system",
    "the model", "the algorithm", "the method", "the approach",
    "the system", "the technique", "our model", "our approach",
})


def normalize_and_validate_concept(text: str, is_entity: bool = False, entity_label: str = "") -> str | None:
    """Normalize a candidate concept string and reject unusable candidates.

    Whitespace is normalized first: leading/trailing whitespace is removed and
    every internal run of whitespace (including line breaks) becomes a single
    space. The candidate is then rejected when any of these holds:

    * its word count is outside ``MIN_CONCEPT_WORDS..MAX_CONCEPT_WORDS``;
    * every word in it is a stop word (or it contains no word characters);
    * it consists only of digits, or only of punctuation;
    * it is one of the generic determiner phrases ("the model", ...).

    Args:
        text: Raw surface text of the candidate span.
        is_entity: Whether the candidate came from named-entity recognition.
            Accepted for interface compatibility; the current checks do not
            consult it.
        entity_label: Entity label of the candidate. Accepted for interface
            compatibility; the current checks do not consult it.

    Returns:
        The normalized text, or ``None`` if the candidate was rejected.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so one split gives both the word list and the basis for
    # the normalized string.
    words = text.split()
    cleaned_text = " ".join(words)

    if not (MIN_CONCEPT_WORDS <= len(words) <= MAX_CONCEPT_WORDS):
        return None

    # Note: all() over an empty token list is True, so text without any word
    # characters is rejected here as well.
    tokens = re.findall(r'\b\w+\b', cleaned_text)
    if cleaned_text and all(token.lower() in STOP_WORDS for token in tokens):
        return None

    if cleaned_text.isdigit() or all(ch in string.punctuation for ch in cleaned_text):
        return None

    if cleaned_text.lower() in _GENERIC_PHRASES:
        return None

    return cleaned_text
|
|
|
def _first_verb_on_path(start, stop):
    """Walk head links upward from ``start``; return the first VERB met before ``stop``, else None."""
    node = start
    while node != stop:
        if node.pos_ == "VERB":
            return node
        parent = node.head
        if parent == node:
            # Reached a root without meeting `stop`; bail out instead of looping forever.
            return None
        node = parent
    return None


def find_verb_relation(token1: spacy.tokens.Token, token2: spacy.tokens.Token) -> tuple[str, str] | None:
    """Find a verb-mediated relation between two tokens via the dependency tree.

    The verb is chosen, in order of preference, as:

    1. the lowest common ancestor of the two tokens, if it is a VERB;
    2. the verb that both upward walks (token -> common ancestor) hit first,
       if it is the same verb for both (possible when one token dominates
       the other);
    3. the head of the common ancestor, if that is a VERB.

    The verb's lemma is then looked up in ``RELATION_VERBS``; a key matches if
    it equals the lemma or if the lemma is one of the key's words
    (e.g. lemma "base" matches the key "base on").

    Args:
        token1: First token (the caller passes a mention's root token).
        token2: Second token.

    Returns:
        ``(relation_type, verb_lemma)`` for the first matching entry of
        ``RELATION_VERBS``, or ``None`` if the tokens share no ancestor, no
        verb is found, or the verb is not a known relation verb.
    """
    # Token.ancestors yields the head first, then the grandparent, and so on
    # up to the root. The first ancestor of token1 that token2 also has is
    # therefore the LOWEST common ancestor. Iterating in reverse would always
    # select the sentence root and attribute every pair to the main verb.
    ancestors2 = list(token2.ancestors)
    common_ancestor = next((t for t in token1.ancestors if t in ancestors2), None)
    if common_ancestor is None:
        return None

    verb1 = _first_verb_on_path(token1, common_ancestor)
    verb2 = _first_verb_on_path(token2, common_ancestor)

    if common_ancestor.pos_ == "VERB":
        verb_token = common_ancestor
    elif verb1 is not None and verb1 == verb2:
        verb_token = verb1
    elif common_ancestor.head.pos_ == "VERB":
        verb_token = common_ancestor.head
    else:
        return None

    # RELATION_VERBS keys are lower-case; normalize the lemma so casing
    # cannot prevent a match.
    verb_lemma = verb_token.lemma_.lower()
    for verb, rel_type in RELATION_VERBS.items():
        if verb_lemma == verb or verb_lemma in verb.split():
            logging.debug(
                "Fiil ilişkisi bulundu: %s... %s (%s) ...%s",
                token1.text, verb_lemma, rel_type, token2.text,
            )
            return rel_type, verb_lemma
    return None
|
|
|
def _matches_ai_keyword(*texts: str) -> bool:
    """Return True if any AI_KEYWORDS entry occurs as a whole word or phrase in one of ``texts``."""
    for text in texts:
        # Pad with spaces so " keyword " only matches on word boundaries;
        # this handles single-word and multi-word keywords uniformly.
        padded = f" {' '.join(text.split())} "
        if any(f" {keyword} " in padded for keyword in AI_KEYWORDS):
            return True
    return False


def _is_domain_concept(surface: str, lemma: str, is_entity: bool, label: str) -> bool:
    """Decide whether a validated candidate counts as a concept worth storing."""
    if is_entity and label in TRUSTED_ENTITY_LABELS:
        return True
    if NOUN_CHUNK_PATTERNS.match(surface):
        return True
    if _matches_ai_keyword(lemma, surface.lower()):
        return True
    # Short all-caps strings (2-5 characters) are accepted as acronyms.
    return surface.isupper() and 1 < len(surface) < 6


def extract_entities_and_relations(text: str, doc_id: str) -> None:
    """Extract concepts, mentions and relations from ``text`` and persist them.

    Steps:

    1. Parse ``text`` with the module-level spaCy pipeline.
    2. Collect candidates: named entities whose label is in
       ``TRUSTED_ENTITY_LABELS`` or ``OTHER_ENTITY_LABELS``, plus noun chunks
       that are not fully covered by one of those entities.
    3. Validate each candidate with ``normalize_and_validate_concept`` and
       keep it if it is a trusted entity, ends in a known head noun, contains
       an ``AI_KEYWORDS`` entry (single- or multi-word), or looks like a
       short acronym. Kept candidates are written through
       ``storage.add_concept`` and ``storage.add_mention``.
    4. For every pair of stored mentions of different concepts in the same
       sentence, store one relationship via ``storage.add_relationship``:
       the type found by ``find_verb_relation``, or ``"RELATED_TO"`` as the
       fallback. Each unordered concept pair is related at most once per
       call; the mention that appears earlier in the sentence is the source.

    Args:
        text: Full document text.
        doc_id: Identifier passed through to the storage calls.

    Raises:
        RuntimeError: If the spaCy model could not be loaded at import time.
    """
    if nlp is None:
        raise RuntimeError("spaCy modeli yüklenemedi.")
    spacy_doc = nlp(text)

    # --- 1. candidate spans -------------------------------------------------
    entity_spans = [
        ent for ent in spacy_doc.ents
        if ent.label_ in TRUSTED_ENTITY_LABELS or ent.label_ in OTHER_ENTITY_LABELS
    ]
    candidates = [(ent, True, ent.label_) for ent in entity_spans]
    for chunk in spacy_doc.noun_chunks:
        covered = any(
            ent.start_char <= chunk.start_char and ent.end_char >= chunk.end_char
            for ent in entity_spans
        )
        if not covered:
            candidates.append((chunk, False, ""))

    # --- 2. concepts and mentions -------------------------------------------
    # Mentions are grouped by the token index at which their sentence starts,
    # so the relation pass does not have to rescan every mention per sentence.
    mentions_by_sentence = {}
    for span, is_entity, label in candidates:
        validated_text = normalize_and_validate_concept(span.text, is_entity, label)
        if not validated_text:
            continue

        concept_lemma = span.lemma_.lower().strip() if span.lemma_ else validated_text.lower()
        if not _is_domain_concept(validated_text, concept_lemma, is_entity, label):
            continue

        concept_id = storage.add_concept(validated_text)
        if not concept_id:
            continue

        sentence = span.sent
        mention_id = storage.add_mention(
            doc_id=doc_id, concept_id=concept_id,
            context=sentence.text, start=span.start_char, end=span.end_char,
        )
        if not mention_id:
            continue

        mentions_by_sentence.setdefault(sentence.start, []).append({
            "mention_id": mention_id, "concept_id": concept_id,
            "start_char": span.start_char, "end_char": span.end_char,
            "sentence": sentence, "root_token": span.root,
        })

    # --- 3. relations between co-occurring mentions --------------------------
    added_relations = set()
    for sentence_start in sorted(mentions_by_sentence):  # document order
        # Order by position in the text so the earlier mention becomes the
        # relation source, rather than "all entities before all noun chunks".
        sentence_mentions = sorted(
            mentions_by_sentence[sentence_start], key=lambda m: m["start_char"]
        )
        if len(sentence_mentions) < 2:
            continue

        for first, second in itertools.combinations(sentence_mentions, 2):
            source_id = first["concept_id"]
            target_id = second["concept_id"]
            if source_id == target_id:
                continue

            # Unordered key: A-B and B-A count as the same concept pair.
            pair_key = frozenset((source_id, target_id))
            if pair_key in added_relations:
                continue

            relation_info = find_verb_relation(first["root_token"], second["root_token"])
            # Fall back to a generic co-occurrence link when no known relation verb connects the pair.
            rel_type = relation_info[0] if relation_info else "RELATED_TO"

            storage.add_relationship(
                source_concept_id=source_id, target_concept_id=target_id, rel_type=rel_type,
                mention_id=first["mention_id"], doc_id=doc_id, sentence=first["sentence"].text,
            )
            added_relations.add(pair_key)
|
|
|
def process_documents_for_extraction():
    """Run extraction over every stored document whose status is 'added'.

    For each such document the PDF text is loaded and passed to
    ``extract_entities_and_relations``. The document status is then updated:

    * ``'processed_v3'`` when extraction finished;
    * ``'extraction_failed_v3'`` when extraction raised an exception
      (the exception is logged with its traceback and the loop continues);
    * ``'text_extraction_failed'`` when the loader returned no text.

    A summary with success/failure counts is logged at the end.

    Raises:
        RuntimeError: If the spaCy model could not be loaded at import time.
    """
    if not nlp:
        raise RuntimeError("spaCy modeli yüklenemedi.")

    logging.info("Gelişmiş bilgi çıkarımı için dokümanlar işleniyor...")

    all_documents = storage.load_dataframe('documents', storage.DOC_COLUMNS)
    pending = all_documents[all_documents['status'] == 'added']
    if pending.empty:
        logging.info("Durumu 'added' olan ve işlenecek doküman bulunamadı.")
        return

    processed_count = 0
    failed_count = 0

    for _, row in pending.iterrows():
        doc_id = row['doc_id']
        filepath = Path(row['filepath'])
        logging.info(f"İşleniyor: {filepath.name} (ID: {doc_id})")

        text = loaders.extract_text_from_pdf(filepath)
        if not text:
            # Nothing to analyse; record the failure and move on.
            logging.warning(f"Metin çıkarılamadı: {filepath.name}")
            storage.update_document_status(doc_id, 'text_extraction_failed')
            failed_count += 1
            continue

        try:
            extract_entities_and_relations(text, doc_id)
            storage.update_document_status(doc_id, 'processed_v3')
            processed_count += 1
        except Exception as e:
            # One bad document must not abort the whole batch.
            logging.exception(f"'{filepath.name}' işlenirken BEKLENMEYEN HATA oluştu: {e}")
            storage.update_document_status(doc_id, 'extraction_failed_v3')
            failed_count += 1

    logging.info(f"Gelişmiş bilgi çıkarımı tamamlandı. Başarılı: {processed_count}, Başarısız: {failed_count}")