| """ |
| Test data validation fixes for MCP paper parsing and PDF processing. |
| This test verifies that malformed data (dicts instead of lists) is handled correctly. |
| """ |
| import sys |
| from datetime import datetime |
| from utils.schemas import Paper |
| from utils.pdf_processor import PDFProcessor |
|
|
|
|
def test_paper_schema_validators():
    """Check that ``Paper`` accepts dict-shaped fields and normalizes them.

    Three scenarios run in order.  Each builds a ``Paper`` with one or more
    fields supplied as a dict, prints what came back, and asserts on the type
    of the resulting attribute(s).

    Returns:
        bool: ``True`` when every scenario completes, ``False`` as soon as one
        raises.  Any ``Exception`` (a failed ``assert`` included) is caught,
        its message is printed, and the remaining scenarios are skipped.

    NOTE(review): failures surface only through the return value, so a runner
    that relies on raised exceptions (pytest, presumably) would not see them
    as failures -- confirm how this file is meant to be executed.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 1: Paper Schema Validators")
    print(rule)

    def authors_given_as_dict():
        # ``authors`` is passed as a mapping; every other field is well-formed.
        candidate = Paper(
            arxiv_id="test.001",
            title="Test Paper",
            authors={"author1": "John Doe", "author2": "Jane Smith"},
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.001.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )
        print(" β Paper created successfully")
        print(f" Authors type: {type(candidate.authors)}")
        print(f" Authors value: {candidate.authors}")
        assert isinstance(candidate.authors, list), "Authors should be normalized to list"
        print(" β Authors correctly normalized to list")

    def categories_given_as_dict():
        # ``categories`` is passed as a mapping; every other field is well-formed.
        candidate = Paper(
            arxiv_id="test.002",
            title="Test Paper 2",
            authors=["John Doe"],
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.002.pdf",
            published=datetime.now(),
            categories={"cat1": "cs.AI", "cat2": "cs.LG"},
        )
        print(" β Paper created successfully")
        print(f" Categories type: {type(candidate.categories)}")
        print(f" Categories value: {candidate.categories}")
        assert isinstance(candidate.categories, list), "Categories should be normalized to list"
        print(" β Categories correctly normalized to list")

    def several_fields_malformed():
        # Everything except ``arxiv_id`` and ``published`` is wrapped in a dict.
        candidate = Paper(
            arxiv_id="test.003",
            title={"title": "Test Paper 3"},
            authors={"names": ["John Doe", "Jane Smith"]},
            abstract={"summary": "Test abstract"},
            pdf_url={"url": "https://arxiv.org/pdf/test.003.pdf"},
            published=datetime.now(),
            categories={"categories": ["cs.AI"]},
        )
        print(" β Paper created successfully")
        print(f" Title type: {type(candidate.title)}, value: {candidate.title}")
        print(f" Authors type: {type(candidate.authors)}, value: {candidate.authors}")
        print(f" Abstract type: {type(candidate.abstract)}, value: {candidate.abstract[:50]}...")
        print(f" PDF URL type: {type(candidate.pdf_url)}, value: {candidate.pdf_url}")
        print(f" Categories type: {type(candidate.categories)}, value: {candidate.categories}")

        # All report lines are printed first; the type checks follow, in this order.
        expectations = (
            ("title", str, "Title should be normalized to string"),
            ("authors", list, "Authors should be normalized to list"),
            ("abstract", str, "Abstract should be normalized to string"),
            ("pdf_url", str, "PDF URL should be normalized to string"),
            ("categories", list, "Categories should be normalized to list"),
        )
        for field_name, wanted_type, complaint in expectations:
            assert isinstance(getattr(candidate, field_name), wanted_type), complaint
        print(" β All fields correctly normalized")

    scenarios = (
        ("\n1. Testing authors as dict (malformed data)...", authors_given_as_dict),
        ("\n2. Testing categories as dict (malformed data)...", categories_given_as_dict),
        ("\n3. Testing multiple fields malformed...", several_fields_malformed),
    )
    for heading, run_scenario in scenarios:
        print(heading)
        try:
            run_scenario()
        except Exception as exc:
            # First failure ends the run; later scenarios are not attempted.
            print(f" β Failed: {exc!s}")
            return False

    print("\n" + rule)
    print("β ALL PAPER SCHEMA VALIDATION TESTS PASSED")
    print(rule)
    return True
|
|
|
|
def test_pdf_processor_resilience():
    """Check that ``PDFProcessor.chunk_text`` emits list-typed author metadata.

    A ``PDFProcessor(chunk_size=100, chunk_overlap=10)`` is asked to chunk a
    short sentence repeated 100 times on behalf of a ``Paper`` whose
    ``authors`` was supplied as a dict.  The first chunk's
    ``metadata['authors']`` must then be a list.

    Returns:
        bool: ``True`` when the scenario completes, ``False`` when anything
        inside it raises (the message and a traceback are printed).

    Note that the processor itself is constructed before the ``try`` block,
    so an error from the ``PDFProcessor`` constructor propagates to the
    caller instead of producing a ``False`` result.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 2: PDFProcessor Resilience")
    print(rule)

    chunker = PDFProcessor(chunk_size=100, chunk_overlap=10)

    print("\n1. Testing PDF processor with validated Paper object...")
    try:
        # ``authors`` is deliberately a dict; the other fields are well-formed.
        source_paper = Paper(
            arxiv_id="test.004",
            title="Test Paper",
            authors={"author1": "John Doe"},
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.004.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )

        # Synthetic body text standing in for extracted PDF content.
        filler_text = "This is a test document. " * 100

        pieces = chunker.chunk_text(filler_text, source_paper)
        print(f" β Created {len(pieces)} chunks successfully")

        # Indexing happens only after the count is reported, so an empty
        # result still prints the line above before failing here.
        lead_piece = pieces[0]
        print(f" First chunk metadata authors type: {type(lead_piece.metadata['authors'])}")
        print(f" First chunk metadata authors: {lead_piece.metadata['authors']}")

        assert isinstance(lead_piece.metadata['authors'], list), "Chunk metadata authors should be list"
        print(" β Chunk metadata correctly contains list for authors")

    except Exception as exc:
        print(f" β Failed: {exc!s}")
        import traceback
        traceback.print_exc()
        return False

    print("\n" + rule)
    print("β PDF PROCESSOR RESILIENCE TESTS PASSED")
    print(rule)
    return True
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run both checks, print a summary, and exit with
    # status 0 when both returned a truthy result, 1 otherwise.
    banner_rule = "=" * 80

    for banner_line in (
        "\n" + banner_rule,
        "DATA VALIDATION FIX VERIFICATION TESTS",
        banner_rule,
        "\nThese tests verify that the fixes for malformed MCP data work correctly:",
        "- Paper schema validators normalize dict fields to proper types",
        "- PDF processor handles validated Paper objects without errors",
        banner_rule,
    ):
        print(banner_line)

    # Both checks always run; the second is not skipped when the first fails.
    test1_pass = test_paper_schema_validators()
    test2_pass = test_pdf_processor_resilience()

    schema_status = 'β PASS' if test1_pass else 'β FAIL'
    processor_status = 'β PASS' if test2_pass else 'β FAIL'

    print("\n" + banner_rule)
    print("FINAL RESULTS")
    print(banner_rule)
    print(f"Paper Schema Validators: {schema_status}")
    print(f"PDF Processor Resilience: {processor_status}")
    print(banner_rule)

    if not (test1_pass and test2_pass):
        print("\nβ SOME TESTS FAILED - Please review the errors above")
        sys.exit(1)

    print("\nβ ALL TESTS PASSED - The data validation fixes are working correctly!")
    print("\nThe system should now handle malformed MCP responses gracefully.")
    sys.exit(0)
|
|