| """ |
| Unit Tests for Table-Aware Chunker (FG-002) |
| |
| Tests the enhanced table extraction and structure preservation functionality. |
| """ |
|
|
| import pytest |
| import sys |
| from pathlib import Path |
| from typing import List |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) |
|
|
| from src.document.schemas.core import ( |
| BoundingBox, |
| OCRRegion, |
| LayoutRegion, |
| LayoutType, |
| ChunkType, |
| ) |
| from src.document.chunking.chunker import ( |
| SemanticChunker, |
| ChunkerConfig, |
| ) |
|
|
|
|
| |
| |
| |
|
|
@pytest.fixture
def chunker():
    """Return a SemanticChunker built from an explicit table-aware config.

    Table structure preservation and header detection are both enabled, with
    a row threshold of 10.0 and a column threshold of 20.0.
    """
    table_settings = {
        "preserve_table_structure": True,
        "table_row_threshold": 10.0,
        "table_col_threshold": 20.0,
        "detect_table_headers": True,
    }
    return SemanticChunker(ChunkerConfig(**table_settings))
|
|
|
|
@pytest.fixture
def simple_table_regions() -> List[OCRRegion]:
    """Build the nine OCR regions of a simple 3x3 table on page 0.

    Regions are produced in row-major order:

        Name  | Age | City
        Alice | 25  | New York
        Bob   | 30  | London
    """
    # One entry per cell: (text, confidence, x_min, y_min, x_max, y_max).
    cell_specs = [
        # First row (the tests in this module expect it to be the header).
        ("Name", 0.95, 50, 100, 100, 120),
        ("Age", 0.95, 150, 100, 200, 120),
        ("City", 0.95, 250, 100, 300, 120),
        # Second row.
        ("Alice", 0.92, 50, 130, 100, 150),
        ("25", 0.98, 150, 130, 200, 150),
        ("New York", 0.90, 250, 130, 320, 150),
        # Third row.
        ("Bob", 0.94, 50, 160, 100, 180),
        ("30", 0.97, 150, 160, 200, 180),
        ("London", 0.93, 250, 160, 310, 180),
    ]
    return [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=x_min, y_min=y_min, x_max=x_max, y_max=y_max),
            page=0,
        )
        for text, confidence, x_min, y_min, x_max, y_max in cell_specs
    ]
|
|
|
|
@pytest.fixture
def numeric_table_regions() -> List[OCRRegion]:
    """Build OCR regions for a 4x3 table whose data rows are numeric.

    Regions are produced in row-major order on page 0:

        Year | Revenue | Growth
        2021 | $1.5M   | 15%
        2022 | $2.0M   | 33%
        2023 | $2.8M   | 40%
    """
    # Horizontal extent (x_min, x_max) shared by every cell in a column.
    column_spans = [(50, 100), (150, 220), (270, 330)]
    # Every cell box is 20 units tall, starting at its row's y_min.
    row_height = 20
    # (y_min, [(text, confidence), ...]) for each row, top to bottom.
    row_specs = [
        (100, [("Year", 0.95), ("Revenue", 0.95), ("Growth", 0.95)]),
        (130, [("2021", 0.98), ("$1.5M", 0.92), ("15%", 0.94)]),
        (160, [("2022", 0.98), ("$2.0M", 0.93), ("33%", 0.95)]),
        (190, [("2023", 0.98), ("$2.8M", 0.91), ("40%", 0.96)]),
    ]
    return [
        OCRRegion(
            text=label,
            confidence=score,
            bbox=BoundingBox(
                x_min=left, y_min=top, x_max=right, y_max=top + row_height
            ),
            page=0,
        )
        for top, cells in row_specs
        for (label, score), (left, right) in zip(cells, column_spans)
    ]
|
|
|
|
@pytest.fixture
def table_layout_region() -> LayoutRegion:
    """Return a TABLE layout region (id ``table_001``) on page 0.

    Its bounding box (40, 90)-(350, 220) encloses every cell produced by the
    table fixtures in this module.
    """
    table_bounds = BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220)
    return LayoutRegion(
        id="table_001",
        type=LayoutType.TABLE,
        confidence=0.95,
        bbox=table_bounds,
        page=0,
    )
|
|
|
|
| |
| |
| |
|
|
class TestTableStructureReconstruction:
    """Test table structure reconstruction from OCR regions."""

    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
        """A 3x3 grid is rebuilt with its first row reported as the header."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        assert result["row_count"] == 3
        assert result["col_count"] == 3
        # Truthiness check instead of ``== True`` (PEP 8).
        assert result["has_header"]
        assert result["headers"] == ["Name", "Age", "City"]

    def test_detect_rows_correctly(self, chunker, simple_table_regions):
        """Regions are grouped into three rows, each in left-to-right order."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        cells = result["cells"]
        assert len(cells) == 3

        # First row.
        assert cells[0] == ["Name", "Age", "City"]

        # Remaining rows.
        assert cells[1] == ["Alice", "25", "New York"]
        assert cells[2] == ["Bob", "30", "London"]

    def test_detect_columns_correctly(self, chunker, simple_table_regions):
        """Every reconstructed row contains exactly three columns."""
        result = chunker._reconstruct_table_structure(simple_table_regions)

        for row_index, row in enumerate(result["cells"]):
            assert len(row) == 3, f"row {row_index} has {len(row)} cells: {row!r}"

    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
        """A text first row above numeric data rows is reported as the header."""
        result = chunker._reconstruct_table_structure(numeric_table_regions)

        assert result["has_header"]
        assert result["headers"] == ["Year", "Revenue", "Growth"]

    def test_empty_table(self, chunker):
        """No OCR regions yields an empty structure with no header."""
        result = chunker._reconstruct_table_structure([])

        assert result["row_count"] == 0
        assert result["col_count"] == 0
        assert result["cells"] == []
        assert not result["has_header"]
|
|
|
|
| |
| |
| |
|
|
class TestMarkdownGeneration:
    """Test markdown table generation."""

    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
        """Detected headers become the header row of the markdown table."""
        table_data = chunker._reconstruct_table_structure(simple_table_regions)

        # NOTE(review): this reads the "rows" key, while the reconstruction
        # tests in this module read "cells" -- confirm the chunker returns both.
        body_rows = table_data["rows"]
        header_cells = table_data["headers"]
        header_found = table_data["has_header"]
        markdown = chunker._table_to_markdown(body_rows, header_cells, header_found)

        expected_lines = (
            "| Name | Age | City |",
            "| --- | --- | --- |",
            "| Alice | 25 | New York |",
            "| Bob | 30 | London |",
        )
        for expected_line in expected_lines:
            assert expected_line in markdown

    def test_generate_markdown_without_headers(self, chunker):
        """Without headers, generic Col1, Col2, ... column names are emitted."""
        markdown = chunker._table_to_markdown(
            [["A", "B", "C"], ["1", "2", "3"]], [], False
        )

        expected_lines = (
            "| Col1 | Col2 | Col3 |",
            "| A | B | C |",
            "| 1 | 2 | 3 |",
        )
        for expected_line in expected_lines:
            assert expected_line in markdown

    def test_escape_pipe_characters(self, chunker):
        """Pipe characters inside a cell are backslash-escaped in the output."""
        header_cells = ["Header1", "Header2"]
        table_rows = [
            ["Header1", "Header2"],
            ["Value|With|Pipes", "Normal"],
        ]

        markdown = chunker._table_to_markdown(table_rows, header_cells, True)

        assert "Value\\|With\\|Pipes" in markdown

    def test_empty_table_returns_placeholder(self, chunker):
        """An empty table renders as the placeholder text."""
        assert chunker._table_to_markdown([], [], False) == "[Empty Table]"
|
|
|
|
| |
| |
| |
|
|
class TestTableChunkCreation:
    """Test complete table chunk creation."""

    def test_create_table_chunk_with_structure(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A table chunk carries markdown text plus structured table metadata."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        # Chunk-level metadata.
        assert chunk.chunk_type == ChunkType.TABLE
        assert chunk.document_id == "test_doc"
        assert chunk.page == 0

        # The chunk text is a markdown table with a separator row.
        assert "| Name | Age | City |" in chunk.text
        assert "| --- |" in chunk.text

        # Structured representation stored alongside the text.
        assert "table_structure" in chunk.extra
        table_struct = chunk.extra["table_structure"]

        assert table_struct["row_count"] == 3
        assert table_struct["col_count"] == 3
        # Truthiness check instead of ``== True`` (PEP 8).
        assert table_struct["has_header"]
        assert table_struct["headers"] == ["Name", "Age", "City"]
        assert table_struct["cells"] is not None

    def test_create_table_chunk_with_cell_positions(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Each cell position records its text, bbox and confidence."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        cell_positions = chunk.extra["table_structure"]["cell_positions"]

        # A 3x3 grid of per-cell dictionaries is expected.
        assert len(cell_positions) == 3
        for row_positions in cell_positions:
            assert len(row_positions) == 3
            for cell in row_positions:
                assert "text" in cell
                assert "bbox" in cell
                assert "confidence" in cell

    def test_create_table_chunk_searchable_text(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Searchable text has a headers section and includes the cell values."""
        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        searchable = chunk.extra["searchable_text"]

        assert "Headers:" in searchable

        # The earlier form, `"Name: Alice" in s or "Alice" in s`, was logically
        # identical to the bare value check below (the first operand implies
        # the second), so only the values themselves are verified; the
        # "header: value" pairing is not.
        assert "Alice" in searchable
        assert "25" in searchable

    def test_create_empty_table_chunk(self, chunker, table_layout_region):
        """No OCR regions gives the placeholder text with zero confidence."""
        chunk = chunker._create_table_chunk(
            [],
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        assert chunk.text == "[Empty Table]"
        assert chunk.confidence == 0.0
|
|
|
|
| |
| |
| |
|
|
class TestChunkerConfiguration:
    """Test chunker configuration options."""

    def test_disable_table_structure_preservation(
        self, simple_table_regions, table_layout_region
    ):
        """With structure preservation off, no markdown separator row appears."""
        config = ChunkerConfig(preserve_table_structure=False)
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Text is still expected to be pipe-delimited, just not a markdown table.
        assert "|" in chunk.text
        assert "| --- |" not in chunk.text

    def test_disable_header_detection(self, simple_table_regions, table_layout_region):
        """With header detection off, no header is reported for the table."""
        config = ChunkerConfig(
            preserve_table_structure=True,
            detect_table_headers=False,
        )
        chunker = SemanticChunker(config)

        chunk = chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        table_struct = chunk.extra["table_structure"]
        # Falsiness check instead of ``== False`` (PEP 8).
        assert not table_struct["has_header"]
        assert table_struct["headers"] == []

    def test_custom_row_threshold(self):
        """A tighter row threshold keeps vertically close regions in separate rows."""
        config = ChunkerConfig(table_row_threshold=5.0)
        chunker = SemanticChunker(config)

        # The two boxes are offset vertically by 8, which exceeds the 5.0
        # threshold configured above.
        regions = [
            OCRRegion(
                text="A",
                confidence=0.9,
                bbox=BoundingBox(x_min=50, y_min=100, x_max=100, y_max=120),
                page=0,
            ),
            OCRRegion(
                text="B",
                confidence=0.9,
                bbox=BoundingBox(x_min=50, y_min=108, x_max=100, y_max=128),
                page=0,
            ),
        ]

        result = chunker._reconstruct_table_structure(regions)

        assert result["row_count"] == 2
|
|
|
|
| |
| |
| |
|
|
class TestNumericDetection:
    """Test numeric value detection for header identification."""

    def test_detect_pure_number(self, chunker):
        """Plain digit strings are numeric."""
        for sample in ("123", "0", "999999"):
            assert chunker._is_numeric(sample), f"{sample!r} should be numeric"

    def test_detect_currency(self, chunker):
        """Currency-prefixed amounts are numeric."""
        for sample in ("$1,234.56", "€100", "£50.00"):
            assert chunker._is_numeric(sample), f"{sample!r} should be numeric"

    def test_detect_percentage(self, chunker):
        """Percentage values are numeric."""
        for sample in ("15%", "100.5%"):
            assert chunker._is_numeric(sample), f"{sample!r} should be numeric"

    def test_detect_negative_numbers(self, chunker):
        """Negative numbers, with or without parentheses, are numeric."""
        for sample in ("-123", "(-50)"):
            assert chunker._is_numeric(sample), f"{sample!r} should be numeric"

    def test_non_numeric_text(self, chunker):
        """Ordinary words are not numeric."""
        for sample in ("Name", "Alice", "Revenue Growth"):
            assert not chunker._is_numeric(sample), f"{sample!r} should not be numeric"

    def test_mixed_content(self, chunker):
        """Text mixing letters and numbers is not numeric."""
        for sample in ("Q1 2023", "Rev: $100"):
            assert not chunker._is_numeric(sample), f"{sample!r} should not be numeric"
|
|
|
|
| |
| |
| |
|
|
class TestFullChunkingPipeline:
    """Test table handling in full chunking pipeline."""

    def test_chunk_document_with_table(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A document holding only a table yields a single TABLE chunk."""
        chunks = chunker.create_chunks(
            ocr_regions=simple_table_regions,
            layout_regions=[table_layout_region],
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        assert len(chunks) == 1
        only_chunk = chunks[0]
        assert only_chunk.chunk_type == ChunkType.TABLE
        assert "| Name | Age | City |" in only_chunk.text

    def test_chunk_document_mixed_content(self, chunker):
        """A paragraph followed by a table yields one chunk of each type."""
        # Two lines of body text near the top of the page.
        paragraph_ocr = [
            OCRRegion(
                text="Introduction",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=50, x_max=200, y_max=70),
                page=0,
            ),
            OCRRegion(
                text="This document contains data.",
                confidence=0.92,
                bbox=BoundingBox(x_min=50, y_min=80, x_max=300, y_max=100),
                page=0,
            ),
        ]

        # A 2x2 table placed below the paragraph.
        table_ocr = [
            OCRRegion(
                text="Col1",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=150, x_max=100, y_max=170),
                page=0,
            ),
            OCRRegion(
                text="Col2",
                confidence=0.95,
                bbox=BoundingBox(x_min=150, y_min=150, x_max=200, y_max=170),
                page=0,
            ),
            OCRRegion(
                text="A",
                confidence=0.95,
                bbox=BoundingBox(x_min=50, y_min=180, x_max=100, y_max=200),
                page=0,
            ),
            OCRRegion(
                text="B",
                confidence=0.95,
                bbox=BoundingBox(x_min=150, y_min=180, x_max=200, y_max=200),
                page=0,
            ),
        ]

        paragraph_layout = LayoutRegion(
            id="text_001",
            type=LayoutType.PARAGRAPH,
            confidence=0.9,
            bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
            page=0,
        )
        table_layout = LayoutRegion(
            id="table_001",
            type=LayoutType.TABLE,
            confidence=0.95,
            bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
            page=0,
        )

        chunks = chunker.create_chunks(
            ocr_regions=[*paragraph_ocr, *table_ocr],
            layout_regions=[paragraph_layout, table_layout],
            document_id="test_doc",
            source_path=None,
        )

        # One chunk per layout region.
        assert len(chunks) == 2

        produced_types = [produced.chunk_type for produced in chunks]
        assert ChunkType.PARAGRAPH in produced_types
        assert ChunkType.TABLE in produced_types
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failures to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
|
|