""" Tests du générateur ALTO v4 (Sprint 3 — Session A). Vérifie : - XML produit est valide (parseable par lxml) - Correspondance exacte bbox → HPOS/VPOS/WIDTH/HEIGHT - text_block / margin / rubric → TextBlock - miniature / decorated_initial → Illustration - other → ComposedBlock - Texte OCR présent dans TextBlock quand disponible - Fallback diplomatic_text dans le premier TextBlock si aucun bloc OCR par région - Master sans régions → ALTO valide avec PrintSpace vide - Région invalide → ValueError explicite (jamais ALTO partiel) - Dimensions de page (WIDTH/HEIGHT) issues de master.image - OCRProcessing présent si master.processing, absent sinon - Attribut TYPE sur Illustration (valeur du RegionType) - Page ID contient le page_id du master """ # 1. stdlib import json from datetime import datetime, timezone from pathlib import Path # 2. third-party import pytest from lxml import etree # 3. local from app.schemas.page_master import EditorialInfo, EditorialStatus, OCRResult, PageMaster, ProcessingInfo from app.services.export.alto import generate_alto, write_alto # ── Namespace ALTO v4 ───────────────────────────────────────────────────── _ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#" _A = f"{{{_ALTO_NS}}}" # --------------------------------------------------------------------------- # Helpers / Fixtures # --------------------------------------------------------------------------- def _make_master( page_id: str = "test-ms-0001r", sequence: int = 1, regions: list | None = None, ocr: OCRResult | None = None, width: int = 1500, height: int = 2000, with_processing: bool = False, ) -> PageMaster: if regions is None: regions = [] processing = None if with_processing: processing = ProcessingInfo( provider="google_ai_studio", model_id="gemini-2.0-flash", model_display_name="Gemini 2.0 Flash", prompt_version="prompts/medieval-illuminated/primary_v1.txt", raw_response_path="/data/ai_raw.json", processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc), ) return PageMaster( page_id=page_id, corpus_profile="medieval-illuminated", manuscript_id="ms-test", folio_label="0001r", sequence=sequence, image={ "master": "https://example.com/img.jpg", "derivative_web": "/data/deriv.jpg", "thumbnail": "/data/thumb.jpg", "width": width, "height": height, }, layout={"regions": regions}, ocr=ocr, processing=processing, editorial=EditorialInfo(status=EditorialStatus.MACHINE_DRAFT), ) def _parse(xml_str: str) -> etree._Element: """Parse une chaîne XML et retourne la racine.""" return etree.fromstring(xml_str.encode("utf-8")) def _xpath(root: etree._Element, path: str) -> list: return root.xpath(path, namespaces={"a": _ALTO_NS}) def _one(root: etree._Element, path: str) -> etree._Element: results = _xpath(root, path) assert len(results) == 1, f"Expected 1 match for {path!r}, got {len(results)}" return results[0] # --------------------------------------------------------------------------- # Synthetic master.json fixtures (équivalents aux 3 master.json du Sprint 2) # --------------------------------------------------------------------------- @pytest.fixture def master_text_only(): """Master simulant une page texte (seul TextBlock, OCR global).""" return _make_master( page_id="beatus-hr-0001r", sequence=1, regions=[ {"id": "r1", "type": "text_block", "bbox": [50, 100, 1400, 1800], "confidence": 0.92}, ], ocr=OCRResult( diplomatic_text="Incipit explanatio beati Ieronimi", language="la", confidence=0.92, ), width=1500, height=2000, with_processing=True, ) @pytest.fixture def master_mixed_regions(): """Master simulant un folio enluminé (texte + miniature + initial décoré).""" return _make_master( page_id="beatus-br-0013r", sequence=13, regions=[ {"id": "r1", "type": "miniature", "bbox": [0, 0, 1500, 800], "confidence": 0.95}, {"id": "r2", "type": "decorated_initial", "bbox": [50, 820, 200, 200], "confidence": 0.88}, {"id": "r3", "type": "text_block", "bbox": [260, 820, 1200, 200], "confidence": 0.90}, {"id": "r4", "type": "rubric", "bbox": [50, 1040, 1400, 80], "confidence": 0.85}, {"id": "r5", "type": "margin", "bbox": [0, 100, 45, 1800], "confidence": 0.70}, ], ocr=OCRResult( diplomatic_text="Sequitur de bestia", language="la", confidence=0.90, ), width=1500, height=2000, ) @pytest.fixture def master_with_per_region_ocr(): """Master avec OCR indexé par region_id dans ocr.blocks.""" return _make_master( page_id="chroniques-f016", sequence=16, regions=[ {"id": "r1", "type": "text_block", "bbox": [100, 100, 600, 400], "confidence": 0.91}, {"id": "r2", "type": "text_block", "bbox": [100, 520, 600, 300], "confidence": 0.87}, {"id": "r3", "type": "miniature", "bbox": [720, 100, 700, 800], "confidence": 0.96}, ], ocr=OCRResult( diplomatic_text="[global fallback]", blocks=[ {"region_id": "r1", "text": "Cy commence le prologue", "confidence": 0.91}, {"region_id": "r2", "text": "Des grandes chroniques de France", "confidence": 0.87}, ], language="fr", confidence=0.89, ), width=1500, height=2000, ) # --------------------------------------------------------------------------- # Tests — validité XML # --------------------------------------------------------------------------- def test_generate_alto_returns_string(master_text_only): result = generate_alto(master_text_only) assert isinstance(result, str) def test_generate_alto_valid_xml(master_text_only): """Le XML produit doit être parseable par lxml sans erreur.""" xml_str = generate_alto(master_text_only) root = _parse(xml_str) assert root is not None def test_generate_alto_valid_xml_mixed(master_mixed_regions): xml_str = generate_alto(master_mixed_regions) root = _parse(xml_str) assert root is not None def test_generate_alto_valid_xml_per_region_ocr(master_with_per_region_ocr): xml_str = generate_alto(master_with_per_region_ocr) root = _parse(xml_str) assert root is not None def test_generate_alto_xml_declaration(master_text_only): """Le XML doit commencer par la déclaration XML.""" xml_str = generate_alto(master_text_only) assert xml_str.startswith("