from collections import Counter, defaultdict
import heapq
import os
import re

import joblib
|
|
def preprocess_text(text):
    """Tokenize *text* for indexing and querying.

    Lowercases the input and extracts maximal runs of word characters
    ([A-Za-z0-9_]); punctuation and whitespace are discarded.

    Args:
        text (str): Raw document or query text.

    Returns:
        list[str]: Lowercased tokens, in order of appearance.
    """
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text.lower())
|
|
def create_inverted_index(wikipedia_dict):
    """Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and
            raw text as values.

    Returns:
        defaultdict[str, set]: An inverted index where each term maps to the
            *set* of document IDs containing it.  (The original docstring
            claimed a list; the implementation has always produced sets.)
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        # set() deduplicates tokens so each (term, doc) pair is added once.
        tokens = set(preprocess_text(text))
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index
|
|
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """Save the inverted index to *filepath* using joblib.

    Creates the parent directory if it does not exist — the original
    implementation raised FileNotFoundError when the default "Baseline/"
    directory was missing.

    Args:
        inverted_index (dict): Term -> set of document IDs.
        filepath (str): Destination path for the pickled index.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        # exist_ok avoids a race with concurrent writers.
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(inverted_index, filepath)
|
|
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """Load a previously saved inverted index from *filepath* via joblib.

    Args:
        filepath (str): Path to the pickled index file.

    Returns:
        The deserialized index, or None when the file does not exist.
    """
    # Guard clause: missing file means "no index yet", not an error.
    if not os.path.exists(filepath):
        return None
    return joblib.load(filepath)
|
|
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """Perform boolean (OR-semantics) retrieval for each query.

    A document is a candidate if it contains at least one query token;
    candidates are ranked by the sum of term frequencies of the query
    tokens (duplicate query tokens weight a term proportionally, exactly
    as the original `list.count` scoring did).

    Args:
        queries_dict (dict): Query IDs mapped to query text.
        inverted_index (dict): Term -> set of document IDs containing it.
        wikipedia_dict (dict): Document IDs mapped to raw document text.
        top_n (int): Number of top documents to return per query.

    Returns:
        dict: Query IDs mapped to a list of up to *top_n* document IDs,
        best score first (ties broken by larger doc ID, as before).
    """
    query_results = {}
    # Cache per-document term-frequency Counters so each document is
    # tokenized at most once across ALL queries.  The original code
    # re-tokenized every candidate document for every query and used
    # O(len(doc)) list.count() per token — accidentally quadratic.
    doc_term_counts = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Union of posting lists: any doc containing any query token.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        doc_scores = []
        for doc_id in relevant_docs:
            counts = doc_term_counts.get(doc_id)
            if counts is None:
                counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
                doc_term_counts[doc_id] = counts
            # Counter[token] is 0 for absent terms — same totals as the
            # original sum(doc_text.count(token) ...), computed in O(1).
            score = sum(counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # nlargest on (score, doc_id) tuples preserves the original
        # tie-breaking (higher doc ID first on equal scores).
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results
|
|
| |
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """Build an inverted index over *wikipedia_dict* and answer all queries.

    Args:
        wikipedia_dict (dict): Document IDs mapped to raw text.
        queries_dict (dict): Query IDs mapped to query text.

    Returns:
        dict: Query IDs mapped to ranked lists of document IDs
        (see `boolean_retrieval`).
    """
    index = create_inverted_index(wikipedia_dict)
    return boolean_retrieval(queries_dict, index, wikipedia_dict)
|
|
def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """Retrieve documents for a single query using the inverted index.

    If no saved index is found at *inverted_index_path*, one is created
    from *wikipedia_dict* and saved there for subsequent calls.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): Document IDs mapped to raw text.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: Up to *top_n* document IDs, best score first (ties broken
        by larger doc ID, matching `boolean_retrieval`).
    """
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    query_tokens = preprocess_text(query)

    # Union of posting lists: any doc containing any query token.
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    doc_scores = []
    for doc_id in relevant_docs:
        # Counter gives O(1) per-token frequency lookups; the original
        # called O(len(doc)) list.count() once per query token.  Scores
        # are identical (absent tokens count as 0).
        counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
        score = sum(counts[token] for token in query_tokens)
        doc_scores.append((score, doc_id))

    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]
|
|
| |
| |
| |
| |
|
|