Spaces:
Sleeping
Sleeping
| import os | |
| import logging | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain_openai import ChatOpenAI | |
| import json | |
# Root logging: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger shared by the statements in this script.
logger = logging.getLogger(__name__)

# Locations are resolved from this file's path, not the current directory.
working_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(working_dir)
# Input folder is the parent directory itself (kept with a trailing slash).
data_dir = f"{parent_dir}/"
# NOTE(review): not referenced by the rest of the visible script - confirm
# whether the vector store was meant to be persisted here.
vector_db_dir = f"{parent_dir}/vector_db"
logger.info("Reading Files Process Started...")
# Flat list of every record gathered from the JSON files found in data_dir.
all_records = []
for file_name in os.listdir(data_dir):
    # Anything that is not a .json file is ignored.
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # A file holds either a list of records or one bare record; a bare
    # record is wrapped so both shapes are appended the same way.
    all_records += data if isinstance(data, list) else [data]
print("Total drug records:", len(all_records))
# Record sections that are turned into retrievable documents.
# Defined once up front: the list does not change between records.
sections = [
    "indications_and_usage",
    "warnings_and_cautions",
    "adverse_reactions",
    "drug_interactions",
]

# One Document per text entry of each selected section, tagged with the
# upper-cased generic drug name and the section it came from.
documents = []
# Iterate over every collected record. Looping over `data` instead would only
# cover the payload of the last JSON file read (and `data` is undefined when
# no JSON file was found), silently dropping all earlier files.
for record in all_records:
    # Fall back to "UNKNOWN" when the name is missing, None, or an empty list;
    # tolerate a bare string so it is not reduced to its first character.
    names = record.get("generic_name") or ["UNKNOWN"]
    if isinstance(names, str):
        names = [names]
    drug = names[0].upper()

    for section in sections:
        texts = record.get(section) or []
        # A bare string is treated as a single text rather than being
        # iterated character by character.
        if isinstance(texts, str):
            texts = [texts]
        documents.extend(
            Document(
                page_content=text,
                metadata={
                    "generic_name": drug,
                    "section": section,
                },
            )
            for text in texts
        )
print("Documents created:", len(documents))
logger.info("Split chunk Files Process Started...")
# Split documents with chunk_size=800 and chunk_overlap=150 so neighbouring
# chunks share some context.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=800)
chunked_docs = splitter.split_documents(documents)
print("Chunks created:", len(chunked_docs))

logger.info("Embeddings Files Process Started...")
# Sentence-transformers model used to embed each chunk.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#%%
print("Chroma ready ✅")
logger.info(" VectorDB Process Started...")
# Embed all chunks and write them to a Chroma store.
# NOTE(review): "./chroma_db" is relative to the current working directory,
# so the store location depends on where the script is launched - confirm
# this matches whatever later loads the store.
vectordb = Chroma.from_documents(
    chunked_docs,
    embedding=embeddings,
    persist_directory="./chroma_db",
)
print("Vector DB created successfully ✅")
logger.info("VectorDB Process Completed...")