Spaces:
Running
Running
| from .llm import nemotron_llama | |
| from .embeddings import get_embeddings | |
| from .retriever import vector_db_retriever | |
| import pickle | |
| import os | |
# Project root: four levels above this file (src/pkg/subpkg/module layout).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


def get_path(folder, filename):
    """Resolve *filename* inside *folder*, checking both known layouts.

    Looks first under BASE_DIR/volumes (local checkout layout), then
    directly under BASE_DIR (Hugging Face Spaces layout). If neither
    exists, the 'volumes' path is returned anyway so any downstream
    open() error names the primary expected location.
    """
    candidates = (
        os.path.join(BASE_DIR, "volumes", folder, filename),
        os.path.join(BASE_DIR, folder, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return candidates[0]
# Load the pickled corpus metadata (per-chunk title / page / paragraph text).
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")
# Fallback binding: without this, a missing pickle leaves `metadata`
# undefined and RAG() later dies with NameError instead of a clear miss.
metadata = {}
if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # NOTE(review): pickle.load on an external file trusts its contents;
    # acceptable here only because the file ships with the project.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)
| # ids = list(metadata.keys()) | |
| # def RAG(query, chat_history): | |
| # query_embeddings = get_embeddings([query]) | |
| # result = vector_db_retriever(query_embeddings, 10) | |
| # indexes = result[0][0] | |
| # context = "" | |
| # for idx in indexes: | |
| # hash_id = ids[idx] | |
| # retrieved_results = metadata[hash_id] | |
| # context+="Title:"+retrieved_results['title']+"\n"+"Date:"+retrieved_results['date']+"\n"+"Page Number:"+str(retrieved_results['page_no'])+"\n"+"Corpus:"+retrieved_results['text']+"\n\n" | |
| # completion = nemotron_llama(query, context, chat_history) | |
| # # for chunk in completion: | |
| # # if chunk.choices[0].delta.content is not None: | |
| # # print(chunk.choices[0].delta.content, end = '') | |
| # return completion | |
# RAG("explain the seventh amendment act", chat_history=[])
| import re as _re | |
| def _clean_corpus(text: str) -> str: | |
| """Collapse PDF extraction artifacts: newlines between words become spaces, | |
| but preserve intentional paragraph breaks (two+ newlines).""" | |
| # Preserve double newlines (paragraph breaks) as a placeholder | |
| text = text.replace('\r\n', '\n').replace('\r', '\n') | |
| # Replace single newlines (mid-sentence line-wraps from PDF) with a space | |
| text = _re.sub(r'(?<!\n)\n(?!\n)', ' ', text) | |
| # Collapse multiple spaces into one | |
| text = _re.sub(r' {2,}', ' ', text) | |
| return text.strip() | |
def RAG(query, chat_history, role="General"):
    """Retrieve diverse context for *query* and generate an LLM answer.

    Embeds the query, pulls 30 candidate chunks from the vector index,
    keeps at most 2 chunks per source PDF (diversity) up to 10 chunks
    total, then passes the assembled context to the LLM.
    """
    embedded = get_embeddings([query])
    # Over-fetch (30) so the per-title diversity filter can still
    # yield 10 usable chunks.
    candidate_ids = vector_db_retriever(embedded, 30)[0][0]

    per_title = {}      # chunks contributed so far by each PDF title
    context_parts = []  # formatted chunk blocks, concatenated at the end

    for candidate in candidate_ids:
        # FAISS pads results with -1 when fewer than top_k vectors exist.
        if candidate == -1:
            continue
        record = metadata[candidate]
        doc_title = record['title']
        # Diversity rule: no more than 2 chunks from the same PDF.
        if per_title.get(doc_title, 0) >= 2:
            continue
        paragraph = _clean_corpus(record['paragraphs'])
        context_parts.append(
            f"Title: {doc_title}\nPage Number: {record['page']}\nCorpus: {paragraph}\n\n"
        )
        per_title[doc_title] = per_title.get(doc_title, 0) + 1
        # Cap the context at 10 diverse chunks.
        if len(context_parts) >= 10:
            break

    completion = nemotron_llama(query, "".join(context_parts), chat_history, role=role)
    return completion
# Import-time confirmation that the module loaded (and metadata was read).
# Fixed typo in the message: "sucessfully" -> "successfully".
print("imported successfully")