Source code for langchain.document_loaders.directory

"""Loading logic for loading documents from a directory."""
import logging
from pathlib import Path
from typing import List, Type, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader

FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]
logger = logging.getLogger(__name__)


def _is_visible(p: Path) -> bool:
    parts = p.parts
    for _p in parts:
        if _p.startswith("."):
            return False
    return True


[docs]class DirectoryLoader(BaseLoader): """Loading logic for loading documents from a directory.""" def __init__( self, path: str, glob: str = "**/[!.]*", silent_errors: bool = False, load_hidden: bool = False, loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader, loader_kwargs: Union[dict, None] = None, recursive: bool = False, show_progress: bool = False, ): """Initialize with path to directory and how to glob over it.""" if loader_kwargs is None: loader_kwargs = {} self.path = path self.glob = glob self.load_hidden = load_hidden self.loader_cls = loader_cls self.loader_kwargs = loader_kwargs self.silent_errors = silent_errors self.recursive = recursive self.show_progress = show_progress
[docs] def load(self) -> List[Document]: """Load documents.""" p = Path(self.path) docs = [] items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob)) pbar = None if self.show_progress: try: from tqdm import tqdm pbar = tqdm(total=len(items)) except ImportError as e: logger.warning( "To log the progress of DirectoryLoader you need to install tqdm, " "`pip install tqdm`" ) if self.silent_errors: logger.warning(e) else: raise e for i in items: if i.is_file(): if _is_visible(i.relative_to(p)) or self.load_hidden: try: sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load() docs.extend(sub_docs) except Exception as e: if self.silent_errors: logger.warning(e) else: raise e finally: if pbar: pbar.update(1) if pbar: pbar.close() return docs