"""Build a Chroma vector store from drug-record JSON files.

Pipeline, executed at import/run time:

1. Read every ``.json`` file located directly inside ``data_dir``.
2. Turn the selected text sections of each record into ``Document`` objects
   tagged with the drug's generic name and the section name.
3. Split the documents into overlapping chunks.
4. Embed the chunks and persist them into a Chroma collection.
"""
import json
import logging
import os

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI  # NOTE(review): unused in this script; kept as-is
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Set up logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Get a logger for this module
logger = logging.getLogger(__name__)

# Record sections that are indexed for retrieval.
RAG_SECTIONS = (
    "indications_and_usage",
    "warnings_and_cautions",
    "adverse_reactions",
    "drug_interactions",
)


def _load_records(directory):
    """Return all records found in the ``.json`` files directly in *directory*.

    A file whose top-level value is a list contributes each element as a
    record; any other top-level value is treated as a single record.

    Raises:
        OSError: if the directory or a file cannot be read.
        json.JSONDecodeError: if a file is not valid JSON.
    """
    records = []
    # Sorted so the ingestion order is reproducible; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(directory, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            records.extend(data)
        else:
            records.append(data)
    return records


def _drug_name(record):
    """Return the upper-cased generic name of *record*, or ``"UNKNOWN"``.

    Accepts either a list of names (the first one is used) or a plain string.
    A missing key or an empty list falls back to ``"UNKNOWN"`` instead of
    raising ``IndexError``.
    """
    raw = record.get("generic_name")
    if isinstance(raw, str):
        name = raw
    elif isinstance(raw, (list, tuple)) and raw:
        name = raw[0]
    else:
        name = "UNKNOWN"
    return str(name).upper()


def _build_documents(records):
    """Create one ``Document`` per text entry of every selected section.

    Each document carries ``generic_name`` and ``section`` metadata. Records
    that are not JSON objects, sections that are neither a string nor a list,
    and entries that are not non-blank strings are skipped.
    """
    docs = []
    for record in records:
        if not isinstance(record, dict):
            logger.warning(
                "Skipping record of unexpected type %s", type(record).__name__
            )
            continue
        drug = _drug_name(record)
        for section in RAG_SECTIONS:
            texts = record.get(section)
            if isinstance(texts, str):
                # A bare string must stay one text, not be iterated per character.
                texts = [texts]
            elif not isinstance(texts, (list, tuple)):
                continue
            for text in texts:
                if not isinstance(text, str) or not text.strip():
                    continue
                docs.append(
                    Document(
                        page_content=text,
                        metadata={
                            "generic_name": drug,
                            "section": section,
                        },
                    )
                )
    return docs


working_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(working_dir)
data_dir = f"{parent_dir}/"
# NOTE(review): vector_db_dir is defined but the store below persists to
# "./chroma_db" — confirm which location downstream readers expect.
vector_db_dir = f"{parent_dir}/vector_db"

logger.info("Reading Files Process Started...")

all_records = _load_records(data_dir)
if not all_records:
    logger.warning("No JSON records found in %s", data_dir)

print("Total drug records:", len(all_records))

# Build from all_records: the previous loop iterated only the payload of the
# last file that was read, dropping the records of every other file.
documents = _build_documents(all_records)

print("Documents created:", len(documents))

logger.info("Split chunk Files Process Started...")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

chunked_docs = splitter.split_documents(documents)

print("Chunks created:", len(chunked_docs))

logger.info("Embeddings Files Process Started...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

#%%
print("Chroma ready ✅")

logger.info(" VectorDB Process Started...")
vectordb = Chroma.from_documents(
    documents=chunked_docs,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

print("Vector DB created successfully ✅")
logger.info("VectorDB Process Completed...")