Spaces:
Running
Running
| from .llm import nemotron_llama | |
| from .embeddings import get_embeddings | |
| from .retriever import vector_db_retriever | |
| import pickle | |
| import os | |
# Project root: four levels above this file (src/pkg/subpkg/module layout).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


def get_path(folder, filename):
    """Resolve *filename* inside *folder*, checking both known layouts.

    Looks first under BASE_DIR/volumes (local checkout layout), then
    directly under BASE_DIR (Hugging Face Spaces layout). If neither
    exists, the 'volumes' path is returned anyway so any downstream
    open() error names the primary expected location.
    """
    candidates = (
        os.path.join(BASE_DIR, "volumes", folder, filename),
        os.path.join(BASE_DIR, folder, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return candidates[0]
# Load the pickled corpus metadata (per-chunk title / page / paragraph text).
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")
# Fallback binding: without this, a missing pickle leaves `metadata`
# undefined and RAG() later dies with NameError instead of a clear miss.
metadata = {}
if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # NOTE(review): pickle.load on an external file trusts its contents;
    # acceptable here only because the file ships with the project.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)
| # ids = list(metadata.keys()) | |
| # def RAG(query, chat_history): | |
| # query_embeddings = get_embeddings([query]) | |
| # result = vector_db_retriever(query_embeddings, 10) | |
| # indexes = result[0][0] | |
| # context = "" | |
| # for idx in indexes: | |
| # hash_id = ids[idx] | |
| # retrieved_results = metadata[hash_id] | |
| # context+="Title:"+retrieved_results['title']+"\n"+"Date:"+retrieved_results['date']+"\n"+"Page Number:"+str(retrieved_results['page_no'])+"\n"+"Corpus:"+retrieved_results['text']+"\n\n" | |
| # completion = nemotron_llama(query, context, chat_history) | |
| # # for chunk in completion: | |
| # # if chunk.choices[0].delta.content is not None: | |
| # # print(chunk.choices[0].delta.content, end = '') | |
| # return completion | |
# RAG("explain the seventh amendment act", chat_history=[])
| import re as _re | |
| def _clean_corpus(text: str) -> str: | |
| """Collapse PDF extraction artifacts: newlines between words become spaces, | |
| but preserve intentional paragraph breaks (two+ newlines).""" | |
| # Preserve double newlines (paragraph breaks) as a placeholder | |
| text = text.replace('\r\n', '\n').replace('\r', '\n') | |
| # Replace single newlines (mid-sentence line-wraps from PDF) with a space | |
| text = _re.sub(r'(?<!\n)\n(?!\n)', ' ', text) | |
| # Collapse multiple spaces into one | |
| text = _re.sub(r' {2,}', ' ', text) | |
| return text.strip() | |
def RAG(query, chat_history, role="General"):
    """Retrieve diverse context for *query* and generate an LLM answer.

    Embeds the query, pulls 30 candidate chunks from the vector index,
    keeps at most 2 chunks per source PDF (diversity) up to 10 chunks
    total, then passes the assembled context to the LLM.
    """
    embedded = get_embeddings([query])
    # Over-fetch (30) so the per-title diversity filter can still
    # yield 10 usable chunks.
    candidate_ids = vector_db_retriever(embedded, 30)[0][0]

    per_title = {}      # chunks contributed so far by each PDF title
    context_parts = []  # formatted chunk blocks, concatenated at the end

    for candidate in candidate_ids:
        # FAISS pads results with -1 when fewer than top_k vectors exist.
        if candidate == -1:
            continue
        record = metadata[candidate]
        doc_title = record['title']
        # Diversity rule: no more than 2 chunks from the same PDF.
        if per_title.get(doc_title, 0) >= 2:
            continue
        paragraph = _clean_corpus(record['paragraphs'])
        context_parts.append(
            f"Title: {doc_title}\nPage Number: {record['page']}\nCorpus: {paragraph}\n\n"
        )
        per_title[doc_title] = per_title.get(doc_title, 0) + 1
        # Cap the context at 10 diverse chunks.
        if len(context_parts) >= 10:
            break

    completion = nemotron_llama(query, "".join(context_parts), chat_history, role=role)
    return completion
# Import-time confirmation that the module loaded (and metadata was read).
# Fixed typo in the message: "sucessfully" -> "successfully".
print("imported successfully")