from collections import Counter, defaultdict
import heapq
import os
import re

import joblib
|
|
def preprocess_text(text):
    """Tokenize *text* for indexing and querying.

    Lowercases the input and extracts maximal runs of word characters
    ([A-Za-z0-9_]); punctuation and whitespace are discarded.

    Args:
        text (str): Raw document or query text.

    Returns:
        list[str]: Lowercased tokens, in order of appearance.
    """
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text.lower())
|
|
def create_inverted_index(wikipedia_dict):
    """Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and
            raw text as values.

    Returns:
        defaultdict[str, set]: An inverted index where each term maps to the
            *set* of document IDs containing it.  (The original docstring
            claimed a list; the implementation has always produced sets.)
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        # set() deduplicates tokens so each (term, doc) pair is added once.
        tokens = set(preprocess_text(text))
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index
|
|
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """Save the inverted index to *filepath* using joblib.

    Creates the parent directory if it does not exist — the original
    implementation raised FileNotFoundError when the default "Baseline/"
    directory was missing.

    Args:
        inverted_index (dict): Term -> set of document IDs.
        filepath (str): Destination path for the pickled index.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        # exist_ok avoids a race with concurrent writers.
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(inverted_index, filepath)
|
|
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """Load a previously saved inverted index from *filepath* via joblib.

    Args:
        filepath (str): Path to the pickled index file.

    Returns:
        The deserialized index, or None when the file does not exist.
    """
    # Guard clause: missing file means "no index yet", not an error.
    if not os.path.exists(filepath):
        return None
    return joblib.load(filepath)
|
|
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """Perform boolean (OR-semantics) retrieval for each query.

    A document is a candidate if it contains at least one query token;
    candidates are ranked by the sum of term frequencies of the query
    tokens (duplicate query tokens weight a term proportionally, exactly
    as the original `list.count` scoring did).

    Args:
        queries_dict (dict): Query IDs mapped to query text.
        inverted_index (dict): Term -> set of document IDs containing it.
        wikipedia_dict (dict): Document IDs mapped to raw document text.
        top_n (int): Number of top documents to return per query.

    Returns:
        dict: Query IDs mapped to a list of up to *top_n* document IDs,
        best score first (ties broken by larger doc ID, as before).
    """
    query_results = {}
    # Cache per-document term-frequency Counters so each document is
    # tokenized at most once across ALL queries.  The original code
    # re-tokenized every candidate document for every query and used
    # O(len(doc)) list.count() per token — accidentally quadratic.
    doc_term_counts = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Union of posting lists: any doc containing any query token.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        doc_scores = []
        for doc_id in relevant_docs:
            counts = doc_term_counts.get(doc_id)
            if counts is None:
                counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
                doc_term_counts[doc_id] = counts
            # Counter[token] is 0 for absent terms — same totals as the
            # original sum(doc_text.count(token) ...), computed in O(1).
            score = sum(counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # nlargest on (score, doc_id) tuples preserves the original
        # tie-breaking (higher doc ID first on equal scores).
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results
|
|
| |
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """Build an inverted index over *wikipedia_dict* and answer all queries.

    Args:
        wikipedia_dict (dict): Document IDs mapped to raw text.
        queries_dict (dict): Query IDs mapped to query text.

    Returns:
        dict: Query IDs mapped to ranked lists of document IDs
        (see `boolean_retrieval`).
    """
    index = create_inverted_index(wikipedia_dict)
    return boolean_retrieval(queries_dict, index, wikipedia_dict)
|
|
def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """Retrieve documents for a single query using the inverted index.

    If no saved index is found at *inverted_index_path*, one is created
    from *wikipedia_dict* and saved there for subsequent calls.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): Document IDs mapped to raw text.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: Up to *top_n* document IDs, best score first (ties broken
        by larger doc ID, matching `boolean_retrieval`).
    """
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    query_tokens = preprocess_text(query)

    # Union of posting lists: any doc containing any query token.
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    doc_scores = []
    for doc_id in relevant_docs:
        # Counter gives O(1) per-token frequency lookups; the original
        # called O(len(doc)) list.count() once per query token.  Scores
        # are identical (absent tokens count as 0).
        counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
        score = sum(counts[token] for token in query_tokens)
        doc_scores.append((score, doc_id))

    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]
|
|
| |
| |
| |
| |
|
|