File size: 3,333 Bytes
659d6ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
src/document_loader.py
───────────────────────────────────────────────────────────────────────────────
Responsible for:
  1. Loading documents from a folder (PDF, TXT, DOCX)
  2. Splitting them into overlapping chunks suitable for embedding
  3. Returning a list of LangChain Document objects

Why chunking?
  LLMs have a limited context window. We split documents into small pieces
  so each chunk can fit alongside the user query into the model's context.
  Overlap between chunks avoids losing information at chunk boundaries.
"""

import os
from pathlib import Path
from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
)


# ── Supported file extensions and their loaders ──────────────────────────────
# Maps a lowercase file suffix to the LangChain loader class used to read it.
# `load_documents` looks suffixes up here and skips anything not listed.
LOADER_MAP = {
    ".pdf":  PyPDFLoader,      # one Document per PDF page
    ".txt":  TextLoader,       # one Document per text file
    ".docx": Docx2txtLoader,   # one Document per Word file
}


def load_documents(data_folder: str) -> List[Document]:
    """
    Load all supported documents found in `data_folder`.

    Supported extensions are the keys of LOADER_MAP (.pdf, .txt, .docx);
    anything else is skipped with a console notice. Entries are processed
    in sorted name order so the result is deterministic across runs and
    operating systems (Path.iterdir yields in arbitrary order).

    Args:
        data_folder: Path to the folder containing raw documents.

    Returns:
        A flat list of LangChain Document objects (one per page or file).

    Raises:
        FileNotFoundError: If `data_folder` does not exist or is not a
            directory.
    """
    folder = Path(data_folder)
    # is_dir() (not exists()) also rejects a path that is a plain file,
    # which would otherwise crash later on iterdir() with NotADirectoryError.
    if not folder.is_dir():
        raise FileNotFoundError(f"Data folder not found: {data_folder}")

    documents: List[Document] = []

    # sorted() makes the loading (and therefore chunk) order deterministic.
    for file_path in sorted(folder.iterdir()):
        # Skip subdirectories: a folder named e.g. "notes.txt" would match
        # a loader by suffix and crash when the loader tried to read it.
        if not file_path.is_file():
            continue

        suffix = file_path.suffix.lower()
        if suffix not in LOADER_MAP:
            print(f"[DocumentLoader] Skipping unsupported file: {file_path.name}")
            continue

        print(f"[DocumentLoader] Loading: {file_path.name}")
        loader_class = LOADER_MAP[suffix]
        loader = loader_class(str(file_path))
        docs = loader.load()

        # Attach the source filename as metadata for traceability
        for doc in docs:
            doc.metadata["source"] = file_path.name

        documents.extend(docs)

    print(f"[DocumentLoader] Total pages/sections loaded: {len(documents)}")
    return documents


def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Break full-page documents into smaller overlapping chunks.

    Args:
        documents:     Raw Document objects (typically one per page/file).
        chunk_size:    Maximum number of characters allowed in one chunk.
        chunk_overlap: Characters repeated between neighbouring chunks so
                       that context is not lost at a chunk boundary.

    Returns:
        The chunked Document objects, ready to be embedded.
    """
    # Separator priority, most- to least-preferred split point:
    # paragraph break, line break, sentence end, word boundary, and
    # finally any character position as a last resort.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    pieces = text_splitter.split_documents(documents)
    print(f"[DocumentLoader] Total chunks after splitting: {len(pieces)}")
    return pieces