"""
src/document_loader.py
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Responsible for:
1. Loading documents from a folder (PDF, TXT, DOCX)
2. Splitting them into overlapping chunks suitable for embedding
3. Returning a list of LangChain Document objects
Why chunking?
LLMs have a limited context window. We split documents into small pieces
so each chunk can fit alongside the user query into the model's context.
Overlap between chunks avoids losing information at chunk boundaries.
"""
import os
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyPDFLoader,
TextLoader,
Docx2txtLoader,
)
# ββ Supported file extensions and their loaders ββββββββββββββββββββββββββββββ
# Maps each supported (lowercase) file extension to the LangChain loader
# class that knows how to parse it. `load_documents` looks files up here
# and skips anything whose suffix is absent.
LOADER_MAP = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}
def load_documents(data_folder: str) -> List[Document]:
    """
    Load all supported documents found in `data_folder`.

    Files are visited in sorted name order so runs are deterministic,
    and non-file entries (e.g. sub-directories) are skipped.

    Args:
        data_folder: Path to the folder containing raw documents.

    Returns:
        A flat list of LangChain Document objects (one per page or file),
        each tagged with its source filename in `metadata["source"]`.

    Raises:
        FileNotFoundError: If `data_folder` does not exist.
    """
    folder = Path(data_folder)
    if not folder.exists():
        raise FileNotFoundError(f"Data folder not found: {data_folder}")

    documents: List[Document] = []
    # sorted() fixes iterdir()'s arbitrary, filesystem-dependent order so
    # the resulting document list is reproducible across runs/platforms.
    for file_path in sorted(folder.iterdir()):
        if not file_path.is_file():
            # A directory named e.g. "notes.txt" would otherwise be handed
            # to a loader and crash — only regular files are loadable.
            continue
        suffix = file_path.suffix.lower()
        if suffix not in LOADER_MAP:
            print(f"[DocumentLoader] Skipping unsupported file: {file_path.name}")
            continue
        print(f"[DocumentLoader] Loading: {file_path.name}")
        loader_class = LOADER_MAP[suffix]
        loader = loader_class(str(file_path))
        docs = loader.load()
        # Attach the source filename as metadata for traceability
        for doc in docs:
            doc.metadata["source"] = file_path.name
        documents.extend(docs)

    print(f"[DocumentLoader] Total pages/sections loaded: {len(documents)}")
    return documents
def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Break full documents into smaller overlapping chunks for embedding.

    Args:
        documents: Raw Document objects (typically one per page or file).
        chunk_size: Maximum number of characters in each chunk.
        chunk_overlap: Characters shared between neighbouring chunks so
            context is not lost at chunk boundaries.

    Returns:
        The resulting list of chunk-sized Document objects, ready for
        embedding.
    """
    # Separator priority: paragraph breaks first, then line breaks,
    # sentence ends, word gaps, and finally individual characters.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    result = text_splitter.split_documents(documents)
    print(f"[DocumentLoader] Total chunks after splitting: {len(result)}")
    return result