| | """Simple reader that reads files of different formats from a directory.""" |
| | import logging |
| | from pathlib import Path |
| | from typing import Callable, Dict, List, Optional, Union |
| |
|
| | from application.parser.file.base import BaseReader |
| | from application.parser.file.base_parser import BaseParser |
| | from application.parser.file.docs_parser import DocxParser, PDFParser |
| | from application.parser.file.epub_parser import EpubParser |
| | from application.parser.file.html_parser import HTMLParser |
| | from application.parser.file.markdown_parser import MarkdownParser |
| | from application.parser.file.rst_parser import RstParser |
| | from application.parser.file.tabular_parser import PandasCSVParser |
| | from application.parser.schema.base import Document |
| |
|
# Fallback mapping from file suffix (including the leading dot) to the parser
# instance used for that suffix. SimpleDirectoryReader uses this mapping when
# the caller does not pass its own `file_extractor`. Each suffix gets its own
# parser instance, so ".md" and ".mdx" do not share state.
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = dict(
    [
        (".pdf", PDFParser()),
        (".docx", DocxParser()),
        (".csv", PandasCSVParser()),
        (".epub", EpubParser()),
        (".md", MarkdownParser()),
        (".rst", RstParser()),
        (".html", HTMLParser()),
        (".mdx", MarkdownParser()),
    ]
)
| |
|
| |
|
class SimpleDirectoryReader(BaseReader):
    """Simple directory reader.

    Can read files into separate documents, or concatenates
    files into one document text.

    Args:
        input_dir (str): Path to the directory.
        input_files (List): List of file paths to read
            (Optional; overrides input_dir).
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles)
            while scanning ``input_dir``. True by default.
        errors (str): how encoding and decoding errors are to be handled,
            see https://docs.python.org/3/library/functions.html#open
        recursive (bool): Whether to recursively search in subdirectories.
            True by default.
        required_exts (Optional[List[str]]): List of required extensions
            (compared against ``Path.suffix``, so include the leading dot).
            Default is None.
        file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file
            extension to a BaseParser class that specifies how to convert that
            file to text. See DEFAULT_FILE_EXTRACTOR.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Only applied while scanning ``input_dir``; an explicit
            ``input_files`` list is not truncated. Default is None.
        file_metadata (Optional[Callable[[str], Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            Default is None.
        chunk_size_max (int): Accepted for backward compatibility; this class
            does not read it.

    Raises:
        ValueError: If neither ``input_dir`` nor ``input_files`` is provided.
    """

    def __init__(
        self,
        input_dir: Optional[str] = None,
        input_files: Optional[List] = None,
        exclude_hidden: bool = True,
        errors: str = "ignore",
        recursive: bool = True,
        required_exts: Optional[List[str]] = None,
        file_extractor: Optional[Dict[str, BaseParser]] = None,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
        chunk_size_max: int = 2048,
    ) -> None:
        """Initialize with parameters."""
        super().__init__()

        if not input_dir and not input_files:
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        self.errors = errors

        # These must be set before `_add_files` runs, because it reads them.
        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit

        if input_files:
            # An explicit file list wins over `input_dir`; no filtering
            # (hidden files, extensions, limit) is applied to it.
            self.input_files = [Path(path) for path in input_files]
            logging.debug(
                "> [SimpleDirectoryReader] Using %d explicit input files",
                len(self.input_files),
            )
        elif input_dir:
            self.input_dir = Path(input_dir)
            self.input_files = self._add_files(self.input_dir)

        self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR
        self.file_metadata = file_metadata

    def _add_files(self, input_dir: Path) -> List[Path]:
        """Collect the files under ``input_dir`` that pass the filters.

        Entries are visited in sorted order. Files directly inside
        ``input_dir`` come first, followed by the files of each subdirectory
        (when ``self.recursive`` is set).

        Args:
            input_dir (Path): Directory to scan.

        Returns:
            List[Path]: Accepted file paths, truncated to
            ``self.num_files_limit`` when that limit is a positive integer.
        """
        new_input_files: List[Path] = []
        dirs_to_explore: List[Path] = []
        for input_file in sorted(input_dir.iterdir()):
            if input_file.is_dir():
                # NOTE: directories are queued before the hidden/extension
                # checks, so dot-directories are still traversed.
                if self.recursive:
                    dirs_to_explore.append(input_file)
            elif self.exclude_hidden and input_file.name.startswith("."):
                continue
            elif (
                self.required_exts is not None
                and input_file.suffix not in self.required_exts
            ):
                continue
            else:
                new_input_files.append(input_file)

        for dir_to_explore in dirs_to_explore:
            new_input_files.extend(self._add_files(dir_to_explore))

        # The limit is re-applied at every recursion level, so the outermost
        # call never returns more than `num_files_limit` paths.
        if self.num_files_limit is not None and self.num_files_limit > 0:
            new_input_files = new_input_files[: self.num_files_limit]

        logging.debug(
            "> [SimpleDirectoryReader] Total files added: %d",
            len(new_input_files),
        )

        return new_input_files

    def load_data(self, concatenate: bool = False) -> List[Document]:
        """Load data from the input directory.

        Args:
            concatenate (bool): whether to concatenate all files into one
                document. If set to True, file metadata is ignored.
                False by default.

        Returns:
            List[Document]: A list of documents.

        """
        data_list: List[str] = []
        metadata_list: List[Dict] = []
        for input_file in self.input_files:
            data: Union[str, List[str]]
            if input_file.suffix in self.file_extractor:
                parser = self.file_extractor[input_file.suffix]
                # Parsers are configured on first use.
                if not parser.parser_config_set:
                    parser.init_parser()
                data = parser.parse_file(input_file, errors=self.errors)
            else:
                # No parser registered for this suffix: read it as plain text.
                with open(input_file, "r", errors=self.errors) as f:
                    data = f.read()

            # A parser may return several texts (or none) for a single file.
            texts = list(data) if isinstance(data, list) else [str(data)]
            data_list.extend(texts)

            if self.file_metadata is not None:
                # Emit one metadata entry per produced text so that
                # `data_list` and `metadata_list` stay aligned for the zip
                # below. The callback is still invoked once per file; the
                # texts of one file share the same metadata object.
                metadata = self.file_metadata(str(input_file))
                metadata_list.extend(metadata for _ in texts)

        if concatenate:
            return [Document("\n".join(data_list))]
        if self.file_metadata is not None:
            return [
                Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)
            ]
        return [Document(d) for d in data_list]
| |
|