"""
src/document_loader.py
───────────────────────────────────────────────────────────────────────────────
Responsible for:
  1. Loading documents from a folder (PDF, TXT, DOCX)
  2. Splitting them into overlapping chunks suitable for embedding
  3. Returning a list of LangChain Document objects

Why chunking?
  LLMs have a limited context window. We split documents into small pieces
  so each chunk can fit alongside the user query into the model's context.
  Overlap between chunks avoids losing information at chunk boundaries.
"""

import os
from pathlib import Path
from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
)

# ── Supported file extensions and their loaders ──────────────────────────────
# Keys are lower-case suffixes (including the dot); values are loader classes
# that are instantiated with the file path as their single argument.
LOADER_MAP = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}


def load_documents(data_folder: str) -> List[Document]:
    """
    Load all supported documents found directly inside `data_folder`.

    Only the top level of the folder is scanned (no recursion). Entries are
    processed in sorted order so the returned list is reproducible across
    runs and platforms. Sub-directories and other non-file entries are
    skipped, as are files whose extension is not in `LOADER_MAP`.

    Args:
        data_folder: Path to the folder containing raw documents.

    Returns:
        A flat list of LangChain Document objects (one per page or file).
        Each document's ``metadata["source"]`` is set to the file name.

    Raises:
        FileNotFoundError: If `data_folder` does not exist.
        NotADirectoryError: If `data_folder` exists but is not a folder.
    """
    folder = Path(data_folder)
    if not folder.exists():
        raise FileNotFoundError(f"Data folder not found: {data_folder}")
    if not folder.is_dir():
        # Fail early with a clear message instead of deep inside iterdir().
        raise NotADirectoryError(f"Data path is not a folder: {data_folder}")

    documents: List[Document] = []

    # iterdir() yields entries in arbitrary order; sorting keeps the document
    # (and therefore chunk) order deterministic.
    for file_path in sorted(folder.iterdir()):
        # A sub-directory named e.g. "scans.pdf" must not reach a file loader.
        if not file_path.is_file():
            print(f"[DocumentLoader] Skipping non-file entry: {file_path.name}")
            continue

        # Lower-case the suffix so ".PDF" and ".pdf" are treated alike.
        suffix = file_path.suffix.lower()
        loader_class = LOADER_MAP.get(suffix)
        if loader_class is None:
            print(f"[DocumentLoader] Skipping unsupported file: {file_path.name}")
            continue

        print(f"[DocumentLoader] Loading: {file_path.name}")
        # NOTE(review): loaders are built with their default options, so text
        # decoding for .txt files relies on the loader's default encoding —
        # confirm this is acceptable for non-ASCII content.
        docs = loader_class(str(file_path)).load()

        # Attach the source filename as metadata for traceability
        for doc in docs:
            doc.metadata["source"] = file_path.name

        documents.extend(docs)

    print(f"[DocumentLoader] Total pages/sections loaded: {len(documents)}")
    return documents


def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Split documents into smaller overlapping chunks.

    Args:
        documents: List of Document objects (raw, full pages).
        chunk_size: Max number of characters per chunk.
        chunk_overlap: Number of characters shared between adjacent chunks.
            This ensures context is not lost at boundaries.

    Returns:
        List of smaller Document chunks ready for embedding.
    """
    splitter = RecursiveCharacterTextSplitter(
        # Try to split on paragraph → sentence → word → character
        separators=["\n\n", "\n", ".", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Sizes are measured in characters, not tokens.
        length_function=len,
    )

    chunks = splitter.split_documents(documents)
    print(f"[DocumentLoader] Total chunks after splitting: {len(chunks)}")
    return chunks