| | """ |
| | Core Document Intelligence Schemas |
| | |
| | Pydantic models for OCR regions, layout regions, chunks, and evidence. |
| | These form the foundation of the document processing pipeline. |
| | """ |
| |
|
import hashlib
import json
from datetime import datetime, timezone
from enum import Enum
from typing import List, Dict, Any, Optional, Tuple

from pydantic import BaseModel, Field, field_validator
| |
|
| |
|
class BoundingBox(BaseModel):
    """
    Bounding box in normalized coordinates (0-1) or pixel coordinates.

    Uses xyxy format: (x_min, y_min, x_max, y_max). Field validators
    enforce x_max >= x_min and y_max >= y_min at construction time, so
    width/height/area are always non-negative.
    """

    x_min: float = Field(..., description="Left edge coordinate")
    y_min: float = Field(..., description="Top edge coordinate")
    x_max: float = Field(..., description="Right edge coordinate")
    y_max: float = Field(..., description="Bottom edge coordinate")

    # Coordinate-space bookkeeping: when `normalized` is True the four
    # coordinates are fractions of the page size; page_width/page_height
    # record the pixel dimensions used for the conversion (when known).
    normalized: bool = Field(default=False, description="True if coordinates are 0-1 normalized")
    page_width: Optional[int] = Field(default=None, description="Original page width in pixels")
    page_height: Optional[int] = Field(default=None, description="Original page height in pixels")

    @field_validator('x_max')
    @classmethod
    def x_max_greater_than_x_min(cls, v, info):
        # info.data only holds previously-validated fields, so guard for
        # the case where x_min itself failed validation.
        if 'x_min' in info.data and v < info.data['x_min']:
            raise ValueError('x_max must be >= x_min')
        return v

    @field_validator('y_max')
    @classmethod
    def y_max_greater_than_y_min(cls, v, info):
        if 'y_min' in info.data and v < info.data['y_min']:
            raise ValueError('y_max must be >= y_min')
        return v

    @property
    def width(self) -> float:
        """Horizontal extent; >= 0 by construction."""
        return self.x_max - self.x_min

    @property
    def height(self) -> float:
        """Vertical extent; >= 0 by construction."""
        return self.y_max - self.y_min

    @property
    def area(self) -> float:
        """Box area in the box's own coordinate space."""
        return self.width * self.height

    @property
    def center(self) -> Tuple[float, float]:
        """(x, y) midpoint of the box."""
        return ((self.x_min + self.x_max) / 2, (self.y_min + self.y_max) / 2)

    def to_xyxy(self) -> Tuple[float, float, float, float]:
        """Return as (x_min, y_min, x_max, y_max) tuple."""
        return (self.x_min, self.y_min, self.x_max, self.y_max)

    def to_xywh(self) -> Tuple[float, float, float, float]:
        """Return as (x, y, width, height) tuple."""
        return (self.x_min, self.y_min, self.width, self.height)

    def normalize(self, width: int, height: int) -> "BoundingBox":
        """Convert pixel coordinates to normalized (0-1) coordinates.

        Args:
            width: Page width in pixels; must be > 0.
            height: Page height in pixels; must be > 0.

        Returns:
            A new normalized BoundingBox, or ``self`` unchanged if the box
            is already normalized.

        Raises:
            ValueError: If width or height is not positive (previously a
                zero dimension surfaced as a bare ZeroDivisionError).
        """
        if self.normalized:
            return self
        if width <= 0 or height <= 0:
            raise ValueError(f"Page dimensions must be positive, got {width}x{height}")
        return BoundingBox(
            x_min=self.x_min / width,
            y_min=self.y_min / height,
            x_max=self.x_max / width,
            y_max=self.y_max / height,
            normalized=True,
            page_width=width,
            page_height=height,
        )

    def denormalize(self, width: int, height: int) -> "BoundingBox":
        """Convert normalized coordinates to pixel coordinates.

        Returns ``self`` unchanged if the box is already in pixel space.
        """
        if not self.normalized:
            return self
        return BoundingBox(
            x_min=self.x_min * width,
            y_min=self.y_min * height,
            x_max=self.x_max * width,
            y_max=self.y_max * height,
            normalized=False,
            page_width=width,
            page_height=height,
        )

    def iou(self, other: "BoundingBox") -> float:
        """Calculate Intersection over Union with another bbox.

        Both boxes are assumed to be in the same coordinate space
        (both normalized or both pixel). Returns 0.0 for disjoint boxes
        and when the union area is zero (degenerate boxes).
        """
        x1 = max(self.x_min, other.x_min)
        y1 = max(self.y_min, other.y_min)
        x2 = min(self.x_max, other.x_max)
        y2 = min(self.y_max, other.y_max)

        # No overlap at all (touching edges count as zero-area overlap).
        if x2 < x1 or y2 < y1:
            return 0.0

        intersection = (x2 - x1) * (y2 - y1)
        union = self.area + other.area - intersection
        return intersection / union if union > 0 else 0.0

    def contains(self, other: "BoundingBox") -> bool:
        """Check if this bbox fully contains another (edges inclusive)."""
        return (
            self.x_min <= other.x_min and
            self.y_min <= other.y_min and
            self.x_max >= other.x_max and
            self.y_max >= other.y_max
        )
|
| |
|
class OCRRegion(BaseModel):
    """
    Result from OCR processing for a single text region.
    Includes text, confidence, and precise location.
    """

    text: str = Field(..., description="Recognized text content")
    confidence: float = Field(..., ge=0.0, le=1.0, description="OCR confidence score")
    bbox: BoundingBox = Field(..., description="Bounding box of the text region")
    # Optional tighter outline for skewed or curved text; points are (x, y) pairs.
    polygon: Optional[List[Tuple[float, float]]] = Field(
        default=None,
        description="Polygon points for non-rectangular regions"
    )
    page: int = Field(..., ge=0, description="Zero-indexed page number")
    # Optional line/word grouping assigned by the OCR engine.
    line_id: Optional[int] = Field(default=None, description="Line grouping ID")
    word_id: Optional[int] = Field(default=None, description="Word index within line")

    # Provenance of the recognition result.
    engine: str = Field(default="unknown", description="OCR engine used (paddle/tesseract)")
    language: Optional[str] = Field(default=None, description="Detected language code")

    def __hash__(self):
        # Hash on text plus coarse position so regions can be used in
        # sets/dicts. NOTE(review): pydantic's generated __eq__ compares
        # all fields, so equal hashes do not imply equal regions here.
        return hash((self.text, self.page, self.bbox.x_min, self.bbox.y_min))
|
| |
|
class LayoutType(str, Enum):
    """Document layout region types.

    String-valued so members serialize cleanly in JSON / pydantic models.
    """

    # Textual content
    TEXT = "text"
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"
    # Structured / visual content
    TABLE = "table"
    FIGURE = "figure"
    CHART = "chart"
    FORMULA = "formula"
    # Page furniture
    HEADER = "header"
    FOOTER = "footer"
    PAGE_NUMBER = "page_number"
    # Supporting text
    CAPTION = "caption"
    FOOTNOTE = "footnote"
    # Decorative / non-content marks
    WATERMARK = "watermark"
    LOGO = "logo"
    SIGNATURE = "signature"
    # Fallback when the detector cannot classify the region
    UNKNOWN = "unknown"
|
| |
|
class LayoutRegion(BaseModel):
    """
    Result from layout detection for a document region.
    Identifies structural elements like tables, figures, paragraphs.
    """

    id: str = Field(..., description="Unique region identifier")
    type: LayoutType = Field(..., description="Region type classification")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Detection confidence")
    bbox: BoundingBox = Field(..., description="Bounding box of the region")
    page: int = Field(..., ge=0, description="Zero-indexed page number")

    # Position of this region in the page's linear reading order, when known.
    reading_order: Optional[int] = Field(
        default=None,
        description="Position in reading order (0 = first)"
    )

    # Hierarchy links (e.g. a caption nested under a figure); values refer
    # to other LayoutRegion.id strings.
    parent_id: Optional[str] = Field(default=None, description="Parent region ID")
    children_ids: List[str] = Field(default_factory=list, description="Child region IDs")

    # Positional indices into the document's OCR region list -- TODO confirm
    # these are list indices rather than OCRRegion line/word IDs.
    ocr_region_ids: List[int] = Field(
        default_factory=list,
        description="Indices of OCR regions within this layout region"
    )

    # Free-form, type-specific metadata (e.g. table structure details).
    extra: Dict[str, Any] = Field(default_factory=dict, description="Type-specific metadata")

    def __hash__(self):
        # `id` is the unique identifier, so it alone determines the hash.
        return hash(self.id)
|
| |
|
class ChunkType(str, Enum):
    """Document chunk types for semantic segmentation.

    String-valued so members serialize cleanly in JSON / pydantic models.
    """

    # Prose content
    TEXT = "text"
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST_ITEM = "list_item"
    # Structured / visual content
    TABLE = "table"
    TABLE_CELL = "table_cell"
    FIGURE = "figure"
    CHART = "chart"
    FORMULA = "formula"
    # Supporting text
    CAPTION = "caption"
    FOOTNOTE = "footnote"
    # Page furniture and document-level info
    HEADER = "header"
    FOOTER = "footer"
    METADATA = "metadata"
| |
|
| |
|
class DocumentChunk(BaseModel):
    """
    Semantic chunk of a document for retrieval and processing.
    Contains text, location evidence, and metadata for grounding.
    """

    chunk_id: str = Field(..., description="Unique chunk identifier")
    chunk_type: ChunkType = Field(..., description="Semantic type of chunk")
    text: str = Field(..., description="Text content of the chunk")
    bbox: BoundingBox = Field(..., description="Bounding box covering the chunk")
    page: int = Field(..., ge=0, description="Zero-indexed page number")

    # Provenance.
    document_id: str = Field(..., description="Parent document identifier")
    source_path: Optional[str] = Field(default=None, description="Original file path")

    # Position in the document's linear reading order.
    sequence_index: int = Field(..., ge=0, description="Position in document reading order")

    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Chunk extraction confidence"
    )

    # Table-specific location (populated only for table chunks).
    table_cell_ids: Optional[List[str]] = Field(
        default=None,
        description="Cell IDs if this is a table chunk"
    )
    row_index: Optional[int] = Field(default=None, description="Table row index")
    col_index: Optional[int] = Field(default=None, description="Table column index")

    # Cross-references to related content (e.g. a figure's caption).
    caption: Optional[str] = Field(default=None, description="Associated caption text")
    references: List[str] = Field(
        default_factory=list,
        description="References to other chunks"
    )

    # Optional dense vector for similarity search.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Vector embedding for retrieval"
    )

    extra: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    @property
    def content_hash(self) -> str:
        """Generate hash of chunk content for deduplication.

        Uses ``chunk_type.value`` explicitly: interpolating the enum member
        itself is version-dependent (Python 3.11 changed ``format()`` of
        mixed-in enums to include the class name), which would silently
        produce different dedup hashes across interpreter versions.
        """
        content = f"{self.text}:{self.page}:{self.chunk_type.value}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def to_retrieval_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for vector store metadata."""
        return {
            "chunk_id": self.chunk_id,
            "chunk_type": self.chunk_type.value,
            "page": self.page,
            "document_id": self.document_id,
            "source_path": self.source_path,
            "bbox_xyxy": self.bbox.to_xyxy(),
            "sequence_index": self.sequence_index,
            "confidence": self.confidence,
        }

    def __hash__(self):
        # chunk_id is unique per chunk, so it alone determines the hash.
        return hash(self.chunk_id)
| |
|
| |
|
class EvidenceRef(BaseModel):
    """
    Evidence reference for grounding extracted information.
    Links extracted data back to source document locations.
    """

    chunk_id: str = Field(..., description="Referenced chunk ID")
    page: int = Field(..., ge=0, description="Page number")
    bbox: BoundingBox = Field(..., description="Bounding box of evidence")
    source_type: str = Field(..., description="Type of source (text/table/figure)")
    snippet: str = Field(..., max_length=500, description="Text snippet as evidence")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Evidence confidence")

    # Optional visual proof: a cropped image of the evidence region.
    image_base64: Optional[str] = Field(
        default=None,
        description="Base64-encoded crop of the evidence region"
    )

    extra: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    def to_citation(self) -> str:
        """Format as a human-readable citation.

        The page is shown 1-indexed. The snippet is capped at 100
        characters, and an ellipsis is appended only when truncation
        actually occurred (the original appended "..." unconditionally,
        misrepresenting short snippets as truncated).
        """
        shown = self.snippet[:100]
        if len(self.snippet) > 100:
            shown += "..."
        return f"[Page {self.page + 1}, {self.source_type}]: \"{shown}\""
| |
|
| |
|
class ExtractionResult(BaseModel):
    """
    Result of a field extraction or analysis task.
    Always includes evidence for grounding.
    """

    data: Dict[str, Any] = Field(..., description="Extracted data dictionary")
    evidence: List[EvidenceRef] = Field(
        default_factory=list,
        description="Evidence supporting the extraction"
    )
    warnings: List[str] = Field(
        default_factory=list,
        description="Warnings or issues encountered"
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Overall extraction confidence"
    )

    # Fields deliberately left unextracted (low confidence / no evidence).
    abstained_fields: List[str] = Field(
        default_factory=list,
        description="Fields where extraction was abstained due to low confidence"
    )

    # Diagnostics about how the result was produced.
    processing_time_ms: Optional[float] = Field(
        default=None,
        description="Processing time in milliseconds"
    )
    model_used: Optional[str] = Field(default=None, description="Model used for extraction")

    @property
    def is_grounded(self) -> bool:
        """True when at least one evidence ref exists and no field was abstained."""
        return len(self.evidence) > 0 and len(self.abstained_fields) == 0

    def add_warning(self, warning: str):
        """Add a warning message."""
        self.warnings.append(warning)

    def abstain(self, field: str, reason: str):
        """Mark a field as abstained with reason.

        Idempotent on ``abstained_fields``: repeated calls for the same
        field no longer append duplicates (previously each call added the
        field again). Every call still records its reason as a warning so
        no diagnostic information is lost.
        """
        if field not in self.abstained_fields:
            self.abstained_fields.append(field)
        self.warnings.append(f"Abstained from extracting '{field}': {reason}")
| |
|
| |
|
class DocumentMetadata(BaseModel):
    """Metadata about a processed document.

    Covers source-file identity, page geometry, processing timestamps and
    aggregate quality statistics filled in by the pipeline.
    """

    document_id: str = Field(..., description="Unique document identifier")
    source_path: str = Field(..., description="Original file path")
    filename: str = Field(..., description="Original filename")
    file_type: str = Field(..., description="File type (pdf/image/etc)")
    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")

    # Page geometry.
    num_pages: int = Field(..., ge=1, description="Total number of pages")
    page_dimensions: List[Tuple[int, int]] = Field(
        default_factory=list,
        description="(width, height) for each page"
    )

    # Timestamps. Timezone-aware UTC: datetime.utcnow() is deprecated
    # (Python 3.12+) and returns a naive datetime that is easy to misuse.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    processed_at: Optional[datetime] = Field(default=None)

    # Aggregate content statistics.
    total_chunks: int = Field(default=0, description="Number of chunks extracted")
    total_characters: int = Field(default=0, description="Total character count")

    # Language detection results.
    detected_language: Optional[str] = Field(default=None, description="Primary language")
    language_confidence: Optional[float] = Field(default=None)

    # Mean confidence across the OCR / layout stages, when computed.
    ocr_confidence_avg: Optional[float] = Field(default=None)
    layout_confidence_avg: Optional[float] = Field(default=None)

    # Free-form additional metadata.
    extra: Dict[str, Any] = Field(default_factory=dict)
| |
|
| |
|
class ProcessedDocument(BaseModel):
    """
    Complete processed document with all extracted information.
    This is the main output of the document processing pipeline.
    """

    metadata: DocumentMetadata = Field(..., description="Document metadata")

    # Raw extraction layers.
    ocr_regions: List[OCRRegion] = Field(description="All OCR regions", default_factory=list)
    layout_regions: List[LayoutRegion] = Field(description="All layout regions", default_factory=list)

    # Retrieval-ready chunks and linearized text.
    chunks: List[DocumentChunk] = Field(description="Document chunks for retrieval", default_factory=list)
    full_text: str = Field(default="", description="Full text in reading order")

    # Pipeline outcome.
    status: str = Field(default="completed", description="Processing status")
    errors: List[str] = Field(default_factory=list, description="Processing errors")
    warnings: List[str] = Field(default_factory=list, description="Processing warnings")

    def get_page_chunks(self, page: int) -> List[DocumentChunk]:
        """Get all chunks for a specific page."""
        return list(filter(lambda chunk: chunk.page == page, self.chunks))

    def get_chunks_by_type(self, chunk_type: ChunkType) -> List[DocumentChunk]:
        """Get all chunks of a specific type."""
        return list(filter(lambda chunk: chunk.chunk_type == chunk_type, self.chunks))

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON string."""
        serialized = self.model_dump_json(indent=indent)
        return serialized

    @classmethod
    def from_json(cls, json_str: str) -> "ProcessedDocument":
        """Deserialize from JSON string."""
        return cls.model_validate_json(json_str)

    def save(self, path: str):
        """Save to JSON file."""
        payload = self.to_json()
        with open(path, "w", encoding="utf-8") as handle:
            handle.write(payload)

    @classmethod
    def load(cls, path: str) -> "ProcessedDocument":
        """Load from JSON file."""
        with open(path, "r", encoding="utf-8") as handle:
            raw = handle.read()
        return cls.from_json(raw)
| |
|