| | """ |
| | Evidence Builder for Document Grounding |
| | |
| | Creates evidence references for extracted information. |
| | Handles image cropping and base64 encoding. |
| | """ |
| |
|
| | import base64 |
| | import io |
| | from typing import List, Optional, Dict, Any, Tuple |
| | from pydantic import BaseModel, Field |
| | import numpy as np |
| | from PIL import Image |
| | from loguru import logger |
| |
|
| | from ..schemas.core import ( |
| | BoundingBox, |
| | DocumentChunk, |
| | EvidenceRef, |
| | OCRRegion, |
| | ) |
| |
|
| |
|
class GroundingConfig(BaseModel):
    """Configuration for grounding and evidence generation.

    Controls the visual side of evidence (whether page crops are attached,
    how much padding they get, how large they may be, and how they are
    encoded) and the textual side (snippet length). Consumed by
    ``create_evidence_ref`` and ``EvidenceBuilder`` in this module.
    """

    # Whether create_evidence_ref attaches a base64-encoded crop of the
    # source region to each evidence reference.
    include_images: bool = Field(
        default=True,
        description="Include cropped images in evidence"
    )
    # Extra pixels added on every side of a crop region before it is
    # clamped to the page bounds (see crop_region_image).
    crop_padding: int = Field(
        default=10,
        ge=0,
        description="Padding around crop regions in pixels"
    )
    # Crops larger than this in either dimension are downscaled,
    # preserving aspect ratio.
    max_image_size: int = Field(
        default=512,
        ge=64,
        description="Maximum dimension for cropped images"
    )
    # Encoding format; encode_image_base64 treats anything other than
    # "JPEG" (case-insensitive) as PNG.
    image_format: str = Field(
        default="PNG",
        description="Image format for encoding (PNG/JPEG)"
    )
    # Only consulted when image_format is JPEG.
    image_quality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="JPEG quality if using JPEG format"
    )

    # Snippets longer than this are truncated and marked with "...".
    max_snippet_length: int = Field(
        default=200,
        ge=50,
        description="Maximum length of text snippets"
    )
    # NOTE(review): include_context is not referenced anywhere in this
    # module — presumably honored by callers that build snippets; confirm.
    include_context: bool = Field(
        default=True,
        description="Include surrounding context in snippets"
    )
|
| |
|
def crop_region_image(
    image: np.ndarray,
    bbox: "BoundingBox",
    padding: int = 10,
    max_size: Optional[int] = None,
) -> np.ndarray:
    """
    Crop a region from an image.

    Args:
        image: Source image (RGB, HWC format)
        bbox: Bounding box to crop (x_min/y_min/x_max/y_max in pixels)
        padding: Padding around the crop, clamped to the image bounds
        max_size: Maximum dimension (will resize if larger)

    Returns:
        Cropped image as numpy array. May be zero-sized if the bounding
        box lies entirely outside the image.
    """
    height, width = image.shape[:2]

    # Expand the box by `padding` and clamp to the image bounds.
    x1 = max(0, int(bbox.x_min) - padding)
    y1 = max(0, int(bbox.y_min) - padding)
    x2 = min(width, int(bbox.x_max) + padding)
    y2 = min(height, int(bbox.y_max) + padding)

    # Guard against inverted/degenerate regions: a bbox entirely outside
    # the image would otherwise yield x2 < x1 (or y2 < y1), and the
    # negative slice end would silently crop the wrong area instead of
    # returning an empty crop.
    x2 = max(x2, x1)
    y2 = max(y2, y1)

    cropped = image[y1:y2, x1:x2]

    # Downscale if either dimension exceeds max_size; thumbnail preserves
    # aspect ratio and never upscales.
    if max_size and max(cropped.shape[:2]) > max_size:
        pil_img = Image.fromarray(cropped)
        pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        cropped = np.array(pil_img)

    return cropped
| |
|
| |
|
def encode_image_base64(
    image: np.ndarray,
    format: str = "PNG",
    quality: int = 85,
) -> str:
    """
    Serialize an image array into a base64-encoded string.

    Args:
        image: Image as numpy array
        format: Image format (PNG/JPEG)
        quality: JPEG quality if applicable

    Returns:
        Base64-encoded string
    """
    pil_img = Image.fromarray(image)

    # Both encoders expect RGB; normalize other modes (e.g. RGBA, L).
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # Write the encoded bytes into an in-memory buffer. Anything that is
    # not JPEG (case-insensitive) is written as PNG.
    out = io.BytesIO()
    if format.upper() == "JPEG":
        pil_img.save(out, format="JPEG", quality=quality)
    else:
        pil_img.save(out, format="PNG")

    return base64.b64encode(out.getvalue()).decode("utf-8")
| |
|
| |
|
def create_evidence_ref(
    chunk: DocumentChunk,
    source_type: str = "text",
    snippet: Optional[str] = None,
    confidence: float = 1.0,
    image: Optional[np.ndarray] = None,
    config: Optional[GroundingConfig] = None,
) -> EvidenceRef:
    """
    Create an evidence reference from a document chunk.

    Args:
        chunk: Source chunk
        source_type: Type of source (text/table/figure)
        snippet: Optional specific snippet (defaults to chunk text)
        confidence: Confidence score
        image: Optional page image for cropping
        config: Grounding configuration

    Returns:
        EvidenceRef instance
    """
    cfg = config if config is not None else GroundingConfig()

    # Default snippet: the chunk text truncated to the configured budget,
    # with an ellipsis marking any truncation.
    if snippet is None:
        limit = cfg.max_snippet_length
        was_truncated = len(chunk.text) > limit
        snippet = chunk.text[:limit] + ("..." if was_truncated else "")

    evidence = EvidenceRef(
        chunk_id=chunk.chunk_id,
        page=chunk.page,
        bbox=chunk.bbox,
        source_type=source_type,
        snippet=snippet,
        confidence=confidence,
    )

    # Optionally attach a visual crop; image evidence is best-effort, so
    # cropping/encoding failures are logged rather than raised.
    if cfg.include_images and image is not None:
        try:
            region = crop_region_image(
                image,
                chunk.bbox,
                padding=cfg.crop_padding,
                max_size=cfg.max_image_size,
            )
            evidence.image_base64 = encode_image_base64(
                region,
                format=cfg.image_format,
                quality=cfg.image_quality,
            )
        except Exception as e:
            logger.warning(f"Failed to crop evidence image: {e}")

    return evidence
| |
|
| |
|
class EvidenceBuilder:
    """
    Builder for creating evidence references.

    Handles:
    - Evidence from chunks
    - Evidence from OCR regions
    - Evidence aggregation
    - Image cropping and encoding
    """

    def __init__(self, config: Optional["GroundingConfig"] = None):
        """Initialize evidence builder.

        Args:
            config: Grounding configuration; defaults to GroundingConfig().
        """
        self.config = config or GroundingConfig()

    def from_chunk(
        self,
        chunk: "DocumentChunk",
        image: Optional[np.ndarray] = None,
        additional_context: Optional[str] = None,
    ) -> "EvidenceRef":
        """
        Create evidence reference from a chunk.

        Args:
            chunk: Source chunk
            image: Optional page image for visual evidence
            additional_context: Optional context prepended to the snippet

        Returns:
            EvidenceRef
        """
        # The chunk's own type (text/table/figure/...) becomes source_type.
        source_type = chunk.chunk_type.value

        # Truncate the chunk text to the configured budget, prepend any
        # caller-supplied context, and mark truncation with an ellipsis.
        snippet = chunk.text[:self.config.max_snippet_length]
        if additional_context:
            snippet = f"{additional_context}\n{snippet}"
        if len(chunk.text) > self.config.max_snippet_length:
            snippet += "..."

        return create_evidence_ref(
            chunk=chunk,
            source_type=source_type,
            snippet=snippet,
            confidence=chunk.confidence,
            image=image,
            config=self.config,
        )

    def from_ocr_region(
        self,
        region: "OCRRegion",
        chunk_id: str,
        document_id: str,
        image: Optional[np.ndarray] = None,
    ) -> "EvidenceRef":
        """
        Create evidence reference from an OCR region.

        Wraps the region in a synthetic text DocumentChunk and delegates
        to from_chunk.

        Args:
            region: OCR region
            chunk_id: ID to assign
            document_id: Parent document ID
            image: Optional page image

        Returns:
            EvidenceRef
        """
        # Local import keeps the OCR path optional at module import time.
        from ..schemas.core import DocumentChunk, ChunkType

        chunk = DocumentChunk(
            chunk_id=chunk_id,
            chunk_type=ChunkType.TEXT,
            text=region.text,
            bbox=region.bbox,
            page=region.page,
            document_id=document_id,
            source_path=None,
            sequence_index=0,
            confidence=region.confidence,
        )

        return self.from_chunk(chunk, image)

    def aggregate_evidence(
        self,
        evidence_list: List["EvidenceRef"],
        combine_snippets: bool = True,
    ) -> List["EvidenceRef"]:
        """
        Aggregate and deduplicate evidence references.

        References pointing at the same chunk are collapsed to a single
        reference (the highest-confidence one), and the result is sorted
        in reading order (page, then top-to-bottom, then left-to-right).

        Args:
            evidence_list: List of evidence references
            combine_snippets: Whether to combine snippets from same chunk

        Returns:
            Deduplicated evidence list
        """
        if not evidence_list:
            return []

        # Group references by their source chunk.
        by_chunk: Dict[str, List["EvidenceRef"]] = {}
        for ev in evidence_list:
            by_chunk.setdefault(ev.chunk_id, []).append(ev)

        result = []
        for chunk_id, evidences in by_chunk.items():
            if len(evidences) == 1:
                result.append(evidences[0])
                continue

            # Keep the highest-confidence reference as the representative.
            best = max(evidences, key=lambda e: e.confidence)
            if combine_snippets:
                # Deduplicate snippets while preserving first-seen order.
                # (The previous list(set(...)) produced a nondeterministic
                # order, making the combined snippet unstable across runs.)
                unique_snippets = list(dict.fromkeys(e.snippet for e in evidences))
                combined = " ... ".join(unique_snippets[:3])
                best = EvidenceRef(
                    chunk_id=best.chunk_id,
                    page=best.page,
                    bbox=best.bbox,
                    source_type=best.source_type,
                    snippet=combined[:self.config.max_snippet_length],
                    confidence=best.confidence,
                    image_base64=best.image_base64,
                )
            result.append(best)

        # Reading order: page, then top-to-bottom, then left-to-right.
        result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min))

        return result

    def create_grounding_context(
        self,
        evidence_list: List["EvidenceRef"],
        include_images: bool = False,
    ) -> str:
        """
        Create a text context from evidence for LLM prompting.

        Args:
            evidence_list: Evidence references
            include_images: Whether to include image markers

        Returns:
            Formatted context string (empty if there is no evidence)
        """
        if not evidence_list:
            return ""

        lines = ["Evidence from document:"]
        for i, ev in enumerate(evidence_list, 1):
            # Pages are stored 0-based; display them 1-based.
            lines.append(
                f"\n[{i}] Page {ev.page + 1}, {ev.source_type} "
                f"(confidence: {ev.confidence:.2f}):"
            )
            lines.append(f'  "{ev.snippet}"')

            if include_images and ev.image_base64:
                lines.append("  [Image available]")

        return "\n".join(lines)
|