Medical-Labelling / src /ETL_VectorDB.py
kramachan's picture
Upload 2 files
0fc1003 verified
import os
import logging
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
import json
# Root logging for the whole run: INFO and above, each line carrying a
# timestamp and the level name ahead of the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by the stages of this script.
logger = logging.getLogger(__name__)

# Paths are anchored on this file's location, not the current working
# directory: working_dir holds this file, parent_dir is the directory above.
working_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(working_dir)

# JSON inputs are looked up directly in parent_dir (note the trailing slash).
data_dir = parent_dir + "/"
vector_db_dir = data_dir + "vector_db"
logger.info("Reading Files Process Started...")

# Accumulates the records found across every JSON file in data_dir.
all_records = []

for entry in os.listdir(data_dir):
    # Only names ending in ".json" are treated as input files.
    if not entry.endswith(".json"):
        continue

    with open(os.path.join(data_dir, entry), "r", encoding="utf-8") as handle:
        data = json.load(handle)

    # A top-level JSON array contributes each of its elements as a record;
    # any other top-level value is stored as one single record.
    all_records += data if isinstance(data, list) else [data]

print("Total drug records:", len(all_records))
# Record sections copied into the vector store, one Document per text entry.
# Defined once, outside the loop, because the list is the same for every record.
sections = [
    "indications_and_usage",
    "warnings_and_cautions",
    "adverse_reactions",
    "drug_interactions",
]

documents = []

# Walk every record collected from ALL input files. Iterating `data` here
# would only cover the last file parsed, and would iterate dict keys (then
# fail on `.get`) when that file's top level is a single object.
for record in all_records:
    if not isinstance(record, dict):
        # A JSON file may hold something other than record objects; skip it
        # instead of aborting the whole run.
        logger.warning(
            "Skipping record of unexpected type %s", type(record).__name__
        )
        continue

    # "generic_name" is normally a list whose first entry is the name.
    # Tolerate a missing key, an empty list, or a bare string so that one
    # odd record cannot raise IndexError or be reduced to its first letter.
    raw_name = record.get("generic_name")
    if isinstance(raw_name, (list, tuple)):
        raw_name = raw_name[0] if raw_name else None
    drug = str(raw_name).upper() if raw_name else "UNKNOWN"

    for section in sections:
        texts = record.get(section)
        if isinstance(texts, str):
            # A lone string is one passage, not a sequence of characters.
            texts = [texts]
        elif not isinstance(texts, (list, tuple)):
            # Section absent (None) or of an unusable type.
            continue

        for text in texts:
            # Only non-empty strings are turned into Documents.
            if not isinstance(text, str) or not text:
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "generic_name": drug,
                        "section": section,
                    },
                )
            )

print("Documents created:", len(documents))
logger.info("Split chunk Files Process Started...")

# Split every Document into overlapping chunks: chunk_size=800 with
# chunk_overlap=150, measured by the splitter's default length function.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=800)
chunked_docs = splitter.split_documents(documents)

print("Chunks created:", len(chunked_docs))
logger.info("Embeddings Files Process Started...")

# Embedding model applied to every chunk before it is stored.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

#%%
# NOTE(review): this message is printed before the Chroma store below has
# been built — confirm the wording/position is intended.
print("Chroma ready ✅")

logger.info(" VectorDB Process Started...")

# NOTE(review): persist_directory is relative to the current working
# directory, while vector_db_dir (computed from this file's location) is not
# used here — confirm which location the consumers of the store expect.
vectordb = Chroma.from_documents(
    persist_directory="./chroma_db",
    embedding=embeddings,
    documents=chunked_docs,
)

print("Vector DB created successfully ✅")
logger.info("VectorDB Process Completed...")