| | |
| | from langchain_community.document_loaders import DirectoryLoader |
| | from langchain.text_splitter import RecursiveCharacterTextSplitter |
| | from langchain.schema import Document |
| | |
| | from langchain_huggingface import HuggingFaceEmbeddings |
| | from langchain_community.vectorstores import Chroma |
| | from dotenv import load_dotenv |
| | import os |
| | import shutil |
| | import logging |
| |
|
# --- Module-level configuration -------------------------------------------
# Filesystem locations: where the persisted Chroma index is written, and
# where the source PDF corpus is read from.
CHROMA_PATH = "chroma"
DATA_PATH = "data/"

# Module logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)

# Load environment variables (e.g. API keys) from a local .env file, if any.
load_dotenv()
| |
|
| |
|
def main() -> None:
    """Entry point: rebuild the Chroma vector store from the PDF corpus."""
    generate_data_store()
| |
|
| |
|
def generate_data_store() -> None:
    """Run the full ingestion pipeline: load PDFs, chunk them, index them."""
    logger.info("Loading documents..")
    docs = load_documents()
    save_to_chroma(split_text(docs))
| |
|
| |
|
def load_documents() -> list[Document]:
    """Load all PDF files found directly under DATA_PATH.

    Returns:
        The list of loaded ``Document`` objects (may be empty if the
        directory contains no matching files).
    """
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    documents = loader.load()
    # Lazy %-style args (not eager str.format) so formatting is skipped
    # when INFO logging is disabled — matches stdlib logging convention.
    logger.info("Found %d documents..", len(documents))
    return documents
| |
|
| |
|
def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping character chunks for embedding.

    Args:
        documents: Loaded documents to split.

    Returns:
        The resulting chunks, each carrying a ``start_index`` in its
        metadata (``add_start_index=True``).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1800,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    logger.info("Split %d documents into %d chunks.", len(documents), len(chunks))

    # Debug aid: log one sample chunk. The original indexed chunks[10]
    # unconditionally, which raises IndexError whenever fewer than 11
    # chunks are produced — clamp the index and skip when empty.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        logger.debug("Sample chunk content: %s", sample.page_content)
        logger.debug("Sample chunk metadata: %s", sample.metadata)

    return chunks
| |
|
| |
|
def save_to_chroma(chunks: list[Document]) -> None:
    """Embed chunks and persist them to a fresh Chroma store at CHROMA_PATH.

    Args:
        chunks: Document chunks to embed and index.

    Side effects:
        Deletes any existing index directory at CHROMA_PATH before
        writing, so the store reflects only the current run's data.
    """
    # Clear out any previous index; Chroma would otherwise append to it.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        chunks,
        HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        persist_directory=CHROMA_PATH,
    )
    db.persist()
    # Use the module logger (lazy %-args) rather than print, consistent
    # with the rest of this module's logging.
    logger.info("Saved %d chunks to %s.", len(chunks), CHROMA_PATH)
| |
|
| |
|
# Script entry point: rebuild the vector store when run directly
# (not when imported as a module).
if __name__ == "__main__":
    main()