| """Document processing module for loading and splitting documents""" |
|
|
| from typing import List, Union |
| from langchain_community.document_loaders import WebBaseLoader |
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
| |
| from langchain_core.documents import Document |
| from pathlib import Path |
| from langchain_community.document_loaders import ( |
| WebBaseLoader, |
| PyPDFLoader, |
| TextLoader, |
| PyPDFDirectoryLoader |
| ) |
|
|
class DocumentProcessor:
    """Handles document loading (web/PDF/TXT) and splitting into chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def process_pdf(self, file_paths: List[str]) -> List[Document]:
        """
        Load multiple PDFs and return a combined list of split chunks.

        Files that fail to load are skipped with a printed diagnostic
        (deliberate best-effort behavior preserved).

        Args:
            file_paths: Paths of PDF files to load

        Returns:
            Combined list of document chunks from all readable PDFs
        """
        all_documents: List[Document] = []
        for path in file_paths:
            try:
                loader = PyPDFLoader(path)
                # BUG FIX: the attribute set in __init__ is self.splitter;
                # the old self.text_splitter raised AttributeError that the
                # broad except silently swallowed, so this always returned [].
                chunks = loader.load_and_split(text_splitter=self.splitter)
                all_documents.extend(chunks)
            except Exception as e:
                print(f"Error loading PDF {path}: {e}")
        return all_documents

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL"""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory"""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file"""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """
        Load document(s) from a single PDF file.

        BUG FIX: previously ignored *file_path* and loaded every PDF in a
        hard-coded "data" directory via PyPDFDirectoryLoader.
        """
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files, PDF directories, or TXT files

        Args:
            sources: List of URLs, directory paths, or .pdf/.txt file paths

        Returns:
            List of loaded documents

        Raises:
            ValueError: If a source is none of the supported kinds
        """
        docs: List[Document] = []
        for src in sources:
            # BUG FIX: the URL branch previously fell through into the path
            # checks (no continue/elif), and the path was hard-coded to
            # Path("data") instead of being derived from the current source.
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                continue
            path = Path(src)
            if path.is_dir():
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            elif path.suffix.lower() == ".pdf":
                # Generalization: single .pdf files now route to load_from_pdf
                # instead of falling into the "unsupported" branch.
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents

        Args:
            urls: List of URLs to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)