| """ |
| Unit Tests for RAG Integration with Document Intelligence |
| |
| Tests the bridge between document_intelligence and RAG subsystems: |
| - DocIntIndexer: Indexing ParseResult into vector store |
| - DocIntRetriever: Semantic retrieval with evidence |
| - RAG Tools: IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool |
| """ |
|
|
| import pytest |
| from unittest.mock import Mock, MagicMock, patch |
| from typing import List |
|
|
|
|
class TestDocIntBridge:
    """Tests for the document intelligence RAG bridge."""

    def test_bridge_imports(self):
        """Test that bridge module imports correctly."""
        from src.rag.docint_bridge import (
            DocIntIndexer,
            DocIntRetriever,
            get_docint_indexer,
            get_docint_retriever,
        )

        assert DocIntIndexer is not None
        assert DocIntRetriever is not None
        # The factory helpers were previously imported but never checked;
        # verify they are exposed and callable as part of the public API.
        assert callable(get_docint_indexer)
        assert callable(get_docint_retriever)

    def test_indexer_creation(self):
        """Test DocIntIndexer creation."""
        from src.rag.docint_bridge import DocIntIndexer
        from src.rag.indexer import IndexerConfig

        config = IndexerConfig(
            batch_size=16,
            include_bbox=True,
            min_chunk_length=5,
        )

        # Store and embedder are mocked: only construction is under test.
        mock_store = Mock()
        mock_embedder = Mock()
        mock_embedder.embed_batch = Mock(return_value=[[0.1] * 768])

        indexer = DocIntIndexer(
            config=config,
            vector_store=mock_store,
            embedding_adapter=mock_embedder,
        )

        # The supplied config must be stored verbatim.
        assert indexer.config.batch_size == 16
        assert indexer.config.include_bbox is True

    def test_retriever_creation(self):
        """Test DocIntRetriever creation."""
        from src.rag.docint_bridge import DocIntRetriever

        mock_store = Mock()
        mock_embedder = Mock()

        retriever = DocIntRetriever(
            vector_store=mock_store,
            embedding_adapter=mock_embedder,
            similarity_threshold=0.6,
        )

        assert retriever.similarity_threshold == 0.6
|
|
|
class TestDocIntIndexer:
    """Exercises DocIntIndexer: full indexing, short-chunk skipping, deletion."""

    @pytest.fixture
    def mock_parse_result(self):
        """Build a small ParseResult (two paragraphs plus one table) for tests."""
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # (chunk_id, type, text, page, bbox coords, confidence) per chunk.
        specs = [
            (
                "chunk_001",
                ChunkType.PARAGRAPH,
                "This is a test paragraph with enough content to index.",
                1,
                (0.1, 0.1, 0.9, 0.2),
                0.9,
            ),
            (
                "chunk_002",
                ChunkType.PARAGRAPH,
                "Second paragraph with different content for testing.",
                1,
                (0.1, 0.3, 0.9, 0.4),
                0.85,
            ),
            (
                "chunk_003",
                ChunkType.TABLE,
                "| Header | Value |\n| --- | --- |\n| A | 100 |",
                2,
                (0.1, 0.1, 0.9, 0.5),
                0.95,
            ),
        ]
        chunks = [
            DocumentChunk(
                chunk_id=cid,
                doc_id="test_doc",
                chunk_type=ctype,
                text=text,
                page=page,
                bbox=BoundingBox(x_min=x0, y_min=y0, x_max=x1, y_max=y1),
                confidence=conf,
                sequence_index=seq,
            )
            for seq, (cid, ctype, text, page, (x0, y0, x1, y1), conf) in enumerate(specs)
        ]

        return ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=chunks,
            num_pages=2,
            processing_time_ms=100,
            markdown_full="# Test Document\n\nContent here.",
        )

    def test_index_parse_result(self, mock_parse_result):
        """Indexing a ParseResult reports every chunk indexed, none skipped."""
        from src.rag.docint_bridge import DocIntIndexer

        store = Mock()
        store.add_chunks = Mock()

        embedder = Mock()
        # One 768-dim vector per chunk in the fixture.
        embedder.embed_batch = Mock(
            return_value=[[v] * 768 for v in (0.1, 0.2, 0.3)]
        )

        indexer = DocIntIndexer(
            vector_store=store,
            embedding_adapter=embedder,
        )
        result = indexer.index_parse_result(mock_parse_result)

        assert result.success is True
        assert result.document_id == "test_doc"
        assert result.num_chunks_indexed == 3
        assert result.num_chunks_skipped == 0

        # All chunks should be persisted in a single batched store call.
        store.add_chunks.assert_called_once()

    def test_index_skips_short_chunks(self, mock_parse_result):
        """A chunk shorter than min_chunk_length is counted as skipped."""
        from src.rag.docint_bridge import DocIntIndexer
        from src.rag.indexer import IndexerConfig
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Append one chunk whose text ("Short") is below the threshold.
        short_chunk = DocumentChunk(
            chunk_id="chunk_short",
            doc_id="test_doc",
            chunk_type=ChunkType.PARAGRAPH,
            text="Short",
            page=1,
            bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            confidence=0.9,
            sequence_index=3,
        )
        mock_parse_result.chunks.append(short_chunk)

        store = Mock()
        store.add_chunks = Mock()

        embedder = Mock()
        # Only the three long-enough chunks should reach the embedder.
        embedder.embed_batch = Mock(
            return_value=[[v] * 768 for v in (0.1, 0.2, 0.3)]
        )

        indexer = DocIntIndexer(
            config=IndexerConfig(min_chunk_length=10),
            vector_store=store,
            embedding_adapter=embedder,
        )
        result = indexer.index_parse_result(mock_parse_result)

        assert result.success is True
        assert result.num_chunks_indexed == 3
        assert result.num_chunks_skipped == 1

    def test_delete_document(self):
        """Deletion is delegated to the store and its count is returned."""
        from src.rag.docint_bridge import DocIntIndexer

        store = Mock()
        store.delete_document = Mock(return_value=5)

        removed = DocIntIndexer(vector_store=store).delete_document("test_doc")

        assert removed == 5
        store.delete_document.assert_called_once_with("test_doc")
|
|
|
class TestDocIntRetriever:
    """Tests for DocIntRetriever functionality."""

    def test_retrieve_chunks(self):
        """Test basic chunk retrieval."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        # Two hits above the 0.5 similarity threshold, on different pages.
        mock_results = [
            VectorSearchResult(
                chunk_id="chunk_001",
                document_id="test_doc",
                text="Relevant content about the query.",
                similarity=0.85,
                page=1,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2},
                metadata={"source_path": "test.pdf", "confidence": 0.9},
            ),
            VectorSearchResult(
                chunk_id="chunk_002",
                document_id="test_doc",
                text="Another relevant chunk.",
                similarity=0.75,
                page=2,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.3, "x_max": 0.9, "y_max": 0.4},
                metadata={"source_path": "test.pdf", "confidence": 0.85},
            ),
        ]

        mock_store = Mock()
        mock_store.search = Mock(return_value=mock_results)

        mock_embedder = Mock()
        mock_embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=mock_store,
            embedding_adapter=mock_embedder,
            similarity_threshold=0.5,
        )

        chunks = retriever.retrieve("test query", top_k=5)

        assert len(chunks) == 2
        # Results keep store ordering (highest similarity first).
        assert chunks[0]["chunk_id"] == "chunk_001"
        assert chunks[0]["similarity"] == 0.85

    def test_retrieve_with_evidence(self):
        """Test retrieval with evidence references."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        mock_results = [
            VectorSearchResult(
                chunk_id="chunk_001",
                document_id="test_doc",
                text="Content with evidence.",
                similarity=0.9,
                page=1,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2},
                metadata={},
            ),
        ]

        mock_store = Mock()
        mock_store.search = Mock(return_value=mock_results)

        mock_embedder = Mock()
        mock_embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=mock_store,
            embedding_adapter=mock_embedder,
        )

        chunks, evidence_refs = retriever.retrieve_with_evidence("query")

        # One evidence reference per retrieved chunk.
        assert len(chunks) == 1
        assert len(evidence_refs) == 1
        assert evidence_refs[0].chunk_id == "chunk_001"
        assert evidence_refs[0].page == 1

    def test_retrieve_with_filters(self):
        """Test retrieval with filters."""
        from src.rag.docint_bridge import DocIntRetriever

        mock_store = Mock()
        mock_store.search = Mock(return_value=[])

        mock_embedder = Mock()
        mock_embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=mock_store,
            embedding_adapter=mock_embedder,
        )

        chunks = retriever.retrieve(
            "query",
            document_id="specific_doc",
            chunk_types=["paragraph", "table"],
            page_range=(1, 5),
        )

        # The store returned no hits, so the retriever must yield nothing.
        # (Previously `chunks` was assigned but never asserted.)
        assert chunks == []

        # The keyword filters must be translated into the store's
        # `filters` dict exactly.
        call_args = mock_store.search.call_args
        filters = call_args.kwargs.get("filters")

        assert filters["document_id"] == "specific_doc"
        assert filters["chunk_type"] == ["paragraph", "table"]
        assert filters["page"] == {"min": 1, "max": 5}

    def test_build_context(self):
        """Test context building from chunks."""
        from src.rag.docint_bridge import DocIntRetriever

        retriever = DocIntRetriever()

        chunks = [
            {
                "chunk_id": "c1",
                "text": "First chunk content.",
                "page": 1,
                "chunk_type": "paragraph",
                "similarity": 0.9,
            },
            {
                "chunk_id": "c2",
                "text": "Second chunk content.",
                "page": 2,
                "chunk_type": "table",
                "similarity": 0.8,
            },
        ]

        context = retriever.build_context(chunks)

        # Context carries numbered citations, page markers, and the raw text.
        assert "[1]" in context
        assert "[2]" in context
        assert "Page 1" in context
        assert "Page 2" in context
        assert "First chunk content" in context
        assert "Second chunk content" in context
|
|
|
class TestRAGTools:
    """Tests for RAG tools in document_intelligence."""

    def test_tool_imports(self):
        """Test that RAG tools import correctly."""
        from src.document_intelligence.tools import (
            IndexDocumentTool,
            RetrieveChunksTool,
            RAGAnswerTool,
            DeleteDocumentTool,
            GetIndexStatsTool,
            get_rag_tool,
            list_rag_tools,
        )

        assert IndexDocumentTool is not None
        assert RetrieveChunksTool is not None
        assert RAGAnswerTool is not None
        # Previously imported but never checked: the remaining tools and
        # the registry helpers are part of the public surface too.
        assert DeleteDocumentTool is not None
        assert GetIndexStatsTool is not None
        assert callable(get_rag_tool)
        assert callable(list_rag_tools)

    def test_list_rag_tools(self):
        """Test listing RAG tools."""
        from src.document_intelligence.tools import list_rag_tools

        tools = list_rag_tools()

        # The three core tools must always be registered.
        assert len(tools) >= 3
        tool_names = [t["name"] for t in tools]
        assert "index_document" in tool_names
        assert "retrieve_chunks" in tool_names
        assert "rag_answer" in tool_names

    def test_get_rag_tool(self):
        """Test getting RAG tool by name."""
        from src.document_intelligence.tools import get_rag_tool

        tool = get_rag_tool("index_document")
        assert tool.name == "index_document"

        tool = get_rag_tool("retrieve_chunks")
        assert tool.name == "retrieve_chunks"

    @patch("src.document_intelligence.tools.rag_tools.RAG_AVAILABLE", False)
    def test_tool_graceful_degradation(self):
        """Test that tools handle missing RAG gracefully."""
        from src.document_intelligence.tools.rag_tools import IndexDocumentTool

        tool = IndexDocumentTool()
        result = tool.execute(path="test.pdf")

        # With RAG unavailable the tool must fail softly, not raise.
        assert result.success is False
        assert "not available" in result.error.lower()
|
|
|
|
class TestAnswerQuestionRAGMode:
    """Exercises AnswerQuestionTool in its keyword (non-RAG) mode."""

    def test_answer_with_keywords(self):
        """Keyword matching (use_rag=False) surfaces the dollar amount."""
        from src.document_intelligence.tools import get_tool
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Single paragraph containing the figure the question asks about.
        paragraph = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="test_doc",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount due is $500.00 as shown on page one.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.2),
            confidence=0.9,
            sequence_index=0,
        )
        parse_result = ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=[paragraph],
            num_pages=1,
            processing_time_ms=100,
            markdown_full="# Test",
        )

        outcome = get_tool("answer_question").execute(
            parse_result=parse_result,
            question="What is the total amount?",
            use_rag=False,
        )

        assert outcome.success is True
        # The answer text should quote the amount from the chunk.
        assert "500" in outcome.data.get("answer", "")
|
|
|
|
class TestAbstentionPolicy:
    """Exercises abstention when the document cannot answer the question."""

    def test_abstains_on_no_results(self):
        """An off-topic question yields an abstention with zero confidence."""
        from src.document_intelligence.tools import get_tool
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Document content shares no keywords with the question below.
        off_topic = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="test_doc",
            chunk_type=ChunkType.PARAGRAPH,
            text="This document discusses weather patterns in Antarctica.",
            page=1,
            bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            confidence=0.9,
            sequence_index=0,
        )
        parse_result = ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=[off_topic],
            num_pages=1,
            processing_time_ms=100,
            markdown_full="# Test",
        )

        outcome = get_tool("answer_question").execute(
            parse_result=parse_result,
            question="What is the invoice number?",
            use_rag=False,
        )

        # The call itself succeeds; the tool signals abstention in the data.
        assert outcome.success is True
        assert outcome.data.get("abstained") is True
        assert outcome.data.get("confidence", 1.0) == 0.0
|
|
|
class TestEvidenceGeneration:
    """Exercises evidence-reference generation during retrieval."""

    def test_evidence_from_retrieval(self):
        """Each retrieved chunk produces a matching evidence reference."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        hit = VectorSearchResult(
            chunk_id="chunk_001",
            document_id="doc_001",
            text="Evidence text here.",
            similarity=0.9,
            page=1,
            chunk_type="paragraph",
            bbox={"x_min": 0.1, "y_min": 0.2, "x_max": 0.9, "y_max": 0.3},
            metadata={"confidence": 0.95},
        )

        store = Mock()
        store.search = Mock(return_value=[hit])

        embedder = Mock()
        embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=store,
            embedding_adapter=embedder,
        )
        chunks, evidence = retriever.retrieve_with_evidence("query")

        assert len(evidence) == 1
        ref = evidence[0]
        # Identity, location, bbox corners, and snippet all flow through
        # from the search hit into the evidence reference.
        assert ref.chunk_id == "chunk_001"
        assert ref.page == 1
        assert ref.bbox.x_min == 0.1
        assert ref.bbox.y_max == 0.3
        assert "Evidence text" in ref.snippet
|
|
|
|
# Allow running this test module directly (python this_file.py) in
# addition to invoking it via the pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
|