"""
src/document_loader.py
──────────────────────────────────────────────────────────────────────────────

Responsible for:
    1. Loading documents from a folder (PDF, TXT, DOCX)
    2. Splitting them into overlapping chunks suitable for embedding
    3. Returning a list of LangChain Document objects

Why chunking?
    LLMs have a limited context window. We split documents into small pieces
    so each chunk can fit alongside the user query into the model's context.
    Overlap between chunks avoids losing information at chunk boundaries.
"""
import os
from pathlib import Path
from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
)
# ── Supported file extensions and their loaders ──────────────────────────────
# Maps a lowercased file suffix to the LangChain loader class able to parse it.
# Add a new entry here to support another document format.
LOADER_MAP = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}
def load_documents(data_folder: str) -> List[Document]:
    """
    Load all supported documents found in ``data_folder``.

    Files whose extension is not registered in ``LOADER_MAP`` are skipped
    with a notice; non-file entries (e.g. sub-directories) are ignored.
    Directory entries are iterated in sorted order so the resulting document
    (and therefore chunk) order is reproducible across runs and filesystems.

    Args:
        data_folder: Path to the folder containing raw documents.

    Returns:
        A flat list of LangChain Document objects (one per page or file),
        each tagged with its source filename in ``metadata["source"]``.

    Raises:
        FileNotFoundError: If ``data_folder`` does not exist.
        NotADirectoryError: If ``data_folder`` exists but is not a directory.
    """
    folder = Path(data_folder)
    if not folder.exists():
        raise FileNotFoundError(f"Data folder not found: {data_folder}")
    # Fail early with a clear message instead of letting iterdir() raise.
    if not folder.is_dir():
        raise NotADirectoryError(f"Not a directory: {data_folder}")

    documents: List[Document] = []
    # sorted() makes load order deterministic (iterdir() order is
    # filesystem-dependent).
    for file_path in sorted(folder.iterdir()):
        if not file_path.is_file():
            # Skip sub-directories and other non-regular entries; a directory
            # named e.g. "notes.pdf" would otherwise crash the loader.
            continue
        suffix = file_path.suffix.lower()
        if suffix not in LOADER_MAP:
            print(f"[DocumentLoader] Skipping unsupported file: {file_path.name}")
            continue
        print(f"[DocumentLoader] Loading: {file_path.name}")
        loader_class = LOADER_MAP[suffix]
        loader = loader_class(str(file_path))
        docs = loader.load()
        # Attach the source filename as metadata for traceability
        for doc in docs:
            doc.metadata["source"] = file_path.name
        documents.extend(docs)
    print(f"[DocumentLoader] Total pages/sections loaded: {len(documents)}")
    return documents
def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Break full documents into smaller overlapping chunks for embedding.

    Args:
        documents: Raw Document objects (typically one per page or file).
        chunk_size: Maximum number of characters allowed in a single chunk.
        chunk_overlap: Number of characters shared between adjacent chunks,
            so context straddling a boundary is not lost.

    Returns:
        A list of smaller Document chunks ready for embedding.
    """
    # Prefer the largest natural boundary first: paragraph, then line,
    # then sentence, then word, and finally individual characters.
    separator_preference = ["\n\n", "\n", ".", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separator_preference,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunk_list = text_splitter.split_documents(documents)
    print(f"[DocumentLoader] Total chunks after splitting: {len(chunk_list)}")
    return chunk_list