| from abc import ABC, abstractmethod |
| from collections.abc import Sequence |
| from typing import Any, Optional |
|
|
| from pydantic import BaseModel, Field |
|
|
|
|
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata."""

    # The text payload of the document. Required: it has no default.
    page_content: str

    # Optional list of floats attached to the text; None when not supplied.
    # NOTE(review): presumably the embedding of page_content -- confirm with callers.
    vector: Optional[list[float]] = None

    # Arbitrary metadata about the page content (e.g., source, relationships to
    # other documents, etc.). default_factory builds a fresh dict per instance,
    # so instances never share one mapping; Optional still permits an explicit None.
    metadata: Optional[dict] = Field(default_factory=dict)

    # Provider label; "dify" unless the caller overrides it.
    # NOTE(review): the meaning of other values is not visible in this file.
    provider: Optional[str] = "dify"
|
|
|
|
class BaseDocumentTransformer(ABC):
    """Interface for components that turn one batch of Documents into another.

    An implementation receives a sequence of Documents and hands back a
    sequence of transformed Documents. Both a synchronous and an asynchronous
    entry point must be provided by concrete subclasses.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """

    @abstractmethod
    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Apply the transformation to ``documents`` synchronously.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword options; their meaning is left to the
                concrete implementation.

        Returns:
            The transformed Documents, as a sequence.
        """

    @abstractmethod
    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Apply the transformation to ``documents`` as a coroutine.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword options; their meaning is left to the
                concrete implementation.

        Returns:
            The transformed Documents, as a sequence.
        """
|
|