| from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.embeddings import SentenceTransformerEmbeddings |
| from langchain.vectorstores import Chroma |
| import os |
| from constants import CHROMA_SETTINGS |
|
|
# Directory handed to Chroma as ``persist_directory`` when the vector store is
# built in main(); a relative path, so it resolves against the current
# working directory of the process.
persist_directory = "db"
|
|
def main(source_dir="docs", chunk_size=500, chunk_overlap=50):
    """Index every PDF found under *source_dir* into the Chroma store.

    Walks ``source_dir`` recursively, loads each file whose name ends in
    ``.pdf`` with ``PDFMinerLoader``, splits the combined text into chunks,
    embeds the chunks with the ``all-MiniLM-L6-v2`` sentence-transformer
    model and persists them to the Chroma database at ``persist_directory``.

    Args:
        source_dir: Root directory that is searched for PDF files.
        chunk_size: Upper bound on the size of each text chunk handed to the
            splitter.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Keep this well below ``chunk_size``; an overlap equal to the
            chunk size makes neighbouring chunks almost entirely redundant.

    Returns:
        None. When no PDF is found a message is printed and nothing is
        written to the vector store.
    """
    # Gather the documents of *all* PDFs first so that none is dropped and the
    # expensive steps below (model load, index build) run exactly once.
    documents = []
    for root, _dirs, files in os.walk(source_dir):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue
            print(filename)
            loader = PDFMinerLoader(os.path.join(root, filename))
            documents.extend(loader.load())

    # Nothing to index: bail out before loading the embedding model or
    # touching the on-disk store.
    if not documents:
        print(f"No PDF documents found under {source_dir!r}; nothing to ingest.")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    db = Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=persist_directory,
        client_settings=CHROMA_SETTINGS,
    )
    # Flush the freshly built index to disk.
    db.persist()
|
|
# Run the ingestion only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()