"""
src/document_loader.py
──────────────────────────────────────────────────────────────────────────────

Responsible for:
    1. Loading documents from a folder (PDF, TXT, DOCX)
    2. Splitting them into overlapping chunks suitable for embedding
    3. Returning a list of LangChain Document objects

Why chunking?
    LLMs have a limited context window. We split documents into small pieces
    so each chunk can fit alongside the user query into the model's context.
    Overlap between chunks avoids losing information at chunk boundaries.
"""
import os
from pathlib import Path
from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
)
# ── Supported file extensions and their loaders ──────────────────────────────
# Maps a lowercased file suffix to the LangChain loader class able to parse it.
# Add a new entry here to support another document format.
LOADER_MAP = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}
def load_documents(data_folder: str) -> List[Document]:
    """
    Load all supported documents found in ``data_folder``.

    Files whose extension is not registered in ``LOADER_MAP`` are skipped
    with a notice; non-file entries (e.g. sub-directories) are ignored.
    Directory entries are iterated in sorted order so the resulting document
    (and therefore chunk) order is reproducible across runs and filesystems.

    Args:
        data_folder: Path to the folder containing raw documents.

    Returns:
        A flat list of LangChain Document objects (one per page or file),
        each tagged with its source filename in ``metadata["source"]``.

    Raises:
        FileNotFoundError: If ``data_folder`` does not exist.
        NotADirectoryError: If ``data_folder`` exists but is not a directory.
    """
    folder = Path(data_folder)
    if not folder.exists():
        raise FileNotFoundError(f"Data folder not found: {data_folder}")
    # Fail early with a clear message instead of letting iterdir() raise.
    if not folder.is_dir():
        raise NotADirectoryError(f"Not a directory: {data_folder}")

    documents: List[Document] = []
    # sorted() makes load order deterministic (iterdir() order is
    # filesystem-dependent).
    for file_path in sorted(folder.iterdir()):
        if not file_path.is_file():
            # Skip sub-directories and other non-regular entries; a directory
            # named e.g. "notes.pdf" would otherwise crash the loader.
            continue
        suffix = file_path.suffix.lower()
        if suffix not in LOADER_MAP:
            print(f"[DocumentLoader] Skipping unsupported file: {file_path.name}")
            continue
        print(f"[DocumentLoader] Loading: {file_path.name}")
        loader_class = LOADER_MAP[suffix]
        loader = loader_class(str(file_path))
        docs = loader.load()
        # Attach the source filename as metadata for traceability
        for doc in docs:
            doc.metadata["source"] = file_path.name
        documents.extend(docs)
    print(f"[DocumentLoader] Total pages/sections loaded: {len(documents)}")
    return documents
def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Break full documents into smaller overlapping chunks for embedding.

    Args:
        documents: Raw Document objects (typically one per page or file).
        chunk_size: Maximum number of characters allowed in a single chunk.
        chunk_overlap: Number of characters shared between adjacent chunks,
            so context straddling a boundary is not lost.

    Returns:
        A list of smaller Document chunks ready for embedding.
    """
    # Prefer the largest natural boundary first: paragraph, then line,
    # then sentence, then word, and finally individual characters.
    separator_preference = ["\n\n", "\n", ".", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separator_preference,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunk_list = text_splitter.split_documents(documents)
    print(f"[DocumentLoader] Total chunks after splitting: {len(chunk_list)}")
    return chunk_list