Spaces:
Build error
Build error
Claude
fix: IIIF image fetch performance — retry with backoff, request size reduction, cursor warning
2e30fe9 unverified | """ | |
| Tests du pipeline image : fetch IIIF + normalisation (dérivé + thumbnail). | |
| Tests unitaires : httpx mocké, images créées en mémoire (Pillow). | |
| Tests d'intégration : requêtes réseau réelles, activés via RUN_INTEGRATION_TESTS=1. | |
| """ | |
| # 1. stdlib | |
| import io | |
| import os | |
| from pathlib import Path | |
| from unittest.mock import MagicMock, patch | |
| # 2. third-party | |
| import httpx | |
| import pytest | |
| from PIL import Image | |
| from pydantic import ValidationError | |
| # 3. local | |
| from app.schemas.image import ImageDerivativeInfo | |
| from app.services.image.normalizer import ( | |
| _MAX_DERIVATIVE_PX, | |
| _MAX_THUMBNAIL_PX, | |
| _resize_to_max, | |
| create_derivatives, | |
| fetch_and_normalize, | |
| ) | |
| from app.services.ingest.iiif_fetcher import fetch_iiif_image | |
| # --------------------------------------------------------------------------- | |
| # Marqueur d'intégration — activé seulement si RUN_INTEGRATION_TESTS=1 | |
| # --------------------------------------------------------------------------- | |
# Skip marker for tests that hit the real network.
# The tests run only when RUN_INTEGRATION_TESTS holds an enabling value.
# Unset, empty, "0", "false", "no" and "off" (any case) keep them skipped, so
# RUN_INTEGRATION_TESTS=0 no longer switches the network tests on by accident.
integration = pytest.mark.skipif(
    os.environ.get("RUN_INTEGRATION_TESTS", "").strip().lower()
    in {"", "0", "false", "no", "off"},
    reason="Tests réseau réels : définir RUN_INTEGRATION_TESTS=1 pour les activer",
)
| # IIIF test URLs (BnF Gallica): three URLs covering two manuscripts | |
| _URL_BEATUS_HI = ( | |
| "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432836p/f13/full/max/0/default.jpg" | |
| ) | |
| _URL_BEATUS_LO = ( | |
| "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432836p/f13/full/600,/0/default.jpg" | |
| ) | |
| _URL_GRANDES_CHRONIQUES = ( | |
| "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8427295k/f3/full/max/0/default.jpg" | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Helpers de test | |
| # --------------------------------------------------------------------------- | |
def _make_jpeg_bytes(width: int, height: int, color: tuple[int, int, int] = (200, 150, 100)) -> bytes:
    """Build a solid-colour RGB JPEG in memory and return its encoded bytes.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        color: RGB fill colour.

    Returns:
        The JPEG-encoded image (saved with quality=85).
    """
    stream = io.BytesIO()
    Image.new("RGB", (width, height), color=color).save(stream, format="JPEG", quality=85)
    return stream.getvalue()
def _make_png_rgba_bytes(width: int, height: int) -> bytes:
    """Build a semi-transparent RGBA PNG in memory and return its encoded bytes.

    Used to exercise the RGBA -> RGB conversion path.
    """
    stream = io.BytesIO()
    Image.new("RGBA", (width, height), color=(100, 150, 200, 128)).save(stream, format="PNG")
    return stream.getvalue()
| # --------------------------------------------------------------------------- | |
| # Tests — ImageDerivativeInfo (schéma) | |
| # --------------------------------------------------------------------------- | |
def test_schema_valid():
    """A fully populated payload builds an ImageDerivativeInfo instance."""
    payload = {
        "original_url": "https://example.com/image.jpg",
        "original_width": 3000,
        "original_height": 4000,
        "derivative_path": "/data/corpora/test/derivatives/0001r.jpg",
        "derivative_width": 1125,
        "derivative_height": 1500,
        "thumbnail_path": "/data/corpora/test/derivatives/0001r_thumb.jpg",
        "thumbnail_width": 192,
        "thumbnail_height": 256,
    }
    model = ImageDerivativeInfo(**payload)
    assert model.original_width == 3000
    assert model.derivative_width == 1125
def test_schema_missing_required_field():
    """Validating a payload that only carries original_url raises ValidationError."""
    incomplete = {"original_url": "https://x.com/img.jpg"}
    with pytest.raises(ValidationError):
        ImageDerivativeInfo.model_validate(incomplete)
def test_schema_all_fields_present():
    """The model declares exactly the nine expected fields, no more and no fewer."""
    expected = {
        f"{prefix}_{suffix}"
        for prefix, suffixes in (
            ("original", ("url", "width", "height")),
            ("derivative", ("path", "width", "height")),
            ("thumbnail", ("path", "width", "height")),
        )
        for suffix in suffixes
    }
    declared = set(ImageDerivativeInfo.model_fields.keys())
    assert declared == expected
| # --------------------------------------------------------------------------- | |
| # Tests — _resize_to_max | |
| # --------------------------------------------------------------------------- | |
def test_resize_small_image_not_upscaled():
    """An image already smaller than the limit must not be enlarged."""
    small = Image.new("RGB", (800, 600))
    resized = _resize_to_max(small, _MAX_DERIVATIVE_PX)
    assert resized.size == (800, 600)
def test_resize_exact_max_not_changed():
    """An image whose longest side is 1500 px keeps its size at the derivative limit."""
    at_limit = Image.new("RGB", (1500, 1000))
    resized = _resize_to_max(at_limit, _MAX_DERIVATIVE_PX)
    assert resized.size == (1500, 1000)
def test_resize_landscape_large():
    """Landscape 3000x2000 is reduced to 1500x1000."""
    landscape = Image.new("RGB", (3000, 2000))
    assert _resize_to_max(landscape, 1500).size == (1500, 1000)
def test_resize_portrait_large():
    """Portrait 2000x3000 is reduced to 1000x1500."""
    portrait = Image.new("RGB", (2000, 3000))
    assert _resize_to_max(portrait, 1500).size == (1000, 1500)
def test_resize_square_large():
    """Square 2000x2000 is reduced to 1500x1500."""
    square = Image.new("RGB", (2000, 2000))
    assert _resize_to_max(square, 1500).size == (1500, 1500)
def test_resize_preserves_aspect_ratio():
    """The 4:3 aspect ratio survives the resize (within a 0.01 tolerance)."""
    resized = _resize_to_max(Image.new("RGB", (4000, 3000)), 1500)
    width, height = resized.size
    assert width == 1500
    assert abs(width / height - 4 / 3) < 0.01
def test_resize_returns_copy_when_no_resize_needed():
    """A new object is returned (not the input instance) even when no resize happens."""
    original = Image.new("RGB", (100, 100))
    returned = _resize_to_max(original, 1500)
    assert returned is not original
def test_resize_thumbnail_size():
    """Resizing 1200x800 to the thumbnail limit yields 256x171."""
    resized = _resize_to_max(Image.new("RGB", (1200, 800)), _MAX_THUMBNAIL_PX)
    width, height = resized.size
    assert width == 256
    # round(800 * 256 / 1200) = round(170.67) = 171
    assert height == 171
| # --------------------------------------------------------------------------- | |
| # Tests — create_derivatives | |
| # --------------------------------------------------------------------------- | |
def test_create_derivatives_large_landscape(tmp_path):
    """A 3000x2000 source gives a 1500x1000 derivative and a 256x171 thumbnail."""
    url = "https://x.com/img.jpg"
    result = create_derivatives(_make_jpeg_bytes(3000, 2000), url, "test-corpus", "0001r", tmp_path)

    assert (result.original_width, result.original_height) == (3000, 2000)
    assert (result.derivative_width, result.derivative_height) == (1500, 1000)
    assert (result.thumbnail_width, result.thumbnail_height) == (256, 171)
    assert result.original_url == "https://x.com/img.jpg"
def test_create_derivatives_small_image_not_upscaled(tmp_path):
    """A 600x900 source (below 1500 px) keeps its original size in the derivative."""
    jpeg = _make_jpeg_bytes(600, 900)
    result = create_derivatives(jpeg, "https://x.com/img.jpg", "test-corpus", "0001r", tmp_path)

    assert (result.derivative_width, result.derivative_height) == (600, 900)
    assert (result.original_width, result.original_height) == (600, 900)
def test_create_derivatives_files_exist(tmp_path):
    """Both output files (derivative and thumbnail) exist on disk afterwards."""
    jpeg = _make_jpeg_bytes(2000, 3000)
    result = create_derivatives(jpeg, "https://x.com/img.jpg", "corpus-a", "f001r", tmp_path)

    derivative_file = Path(result.derivative_path)
    thumbnail_file = Path(result.thumbnail_path)
    assert derivative_file.exists()
    assert thumbnail_file.exists()
def test_create_derivatives_path_structure(tmp_path):
    """Output paths are <base>/corpora/<slug>/derivatives/<folio>[_thumb].jpg."""
    jpeg = _make_jpeg_bytes(1000, 1000)
    result = create_derivatives(jpeg, "https://x.com/img.jpg", "beatus-lat8878", "0013r", tmp_path)

    derivatives_dir = tmp_path / "corpora" / "beatus-lat8878" / "derivatives"
    assert result.derivative_path == str(derivatives_dir / "0013r.jpg")
    assert result.thumbnail_path == str(derivatives_dir / "0013r_thumb.jpg")
def test_create_derivatives_output_is_jpeg(tmp_path):
    """Both produced files open with Pillow and report the JPEG format."""
    jpeg = _make_jpeg_bytes(1000, 800)
    result = create_derivatives(jpeg, "https://x.com/img.jpg", "corpus-b", "f002r", tmp_path)

    # Derivative first, then thumbnail.
    for produced in (result.derivative_path, result.thumbnail_path):
        with Image.open(produced) as opened:
            assert opened.format == "JPEG"
def test_create_derivatives_rgba_converted_to_rgb(tmp_path):
    """An RGBA PNG source is processed without error and stored in RGB mode."""
    png = _make_png_rgba_bytes(800, 1000)
    result = create_derivatives(png, "https://x.com/img.png", "corpus-c", "f003r", tmp_path)

    with Image.open(result.derivative_path) as opened:
        assert opened.mode == "RGB"
    assert result.original_width == 800
    assert result.original_height == 1000
def test_create_derivatives_thumbnail_dimensions(tmp_path):
    """For a 3000x4000 source, the thumbnail's longest side equals the thumbnail limit."""
    jpeg = _make_jpeg_bytes(3000, 4000)
    result = create_derivatives(jpeg, "https://x.com/img.jpg", "corpus-d", "f004r", tmp_path)

    longest_side = max(result.thumbnail_width, result.thumbnail_height)
    assert longest_side == _MAX_THUMBNAIL_PX
def test_create_derivatives_creates_parent_dirs(tmp_path):
    """The derivative's parent directory exists afterwards for a brand-new corpus slug."""
    result = create_derivatives(
        _make_jpeg_bytes(500, 500),
        "https://x.com/img.jpg",
        "nouveau-corpus-jamais-vu",
        "f001r",
        tmp_path,
    )
    parent_dir = Path(result.derivative_path).parent
    assert parent_dir.exists()
| # --------------------------------------------------------------------------- | |
| # Tests — fetch_iiif_image | |
| # --------------------------------------------------------------------------- | |
def test_fetch_iiif_image_success():
    """Return the response body when the request succeeds.

    Also checks the keyword arguments passed to the patched ``httpx.get``:
    redirects are followed and the timeout exposes connect=15s / read=60s.
    """
    payload = _make_jpeg_bytes(100, 100)
    ok_response = MagicMock(status_code=200, content=payload)
    ok_response.raise_for_status.return_value = None

    # The HTTP call, time.sleep and time.monotonic are all patched.
    with patch("app.services.ingest.iiif_fetcher.httpx.get", return_value=ok_response) as mock_get, \
            patch("app.services.ingest.iiif_fetcher.time.sleep"), \
            patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
        fetched = fetch_iiif_image("https://example.com/image.jpg")

    assert fetched == payload
    _, call_kwargs = mock_get.call_args
    assert call_kwargs["follow_redirects"] is True
    # The timeout is an object exposing .connect and .read.
    timeout = call_kwargs["timeout"]
    assert timeout.connect == 15.0
    assert timeout.read == 60.0
def test_fetch_iiif_image_http_error():
    """HTTPStatusError propagates when the mocked response is a 404."""
    not_found = httpx.HTTPStatusError(
        "404 Not Found",
        request=MagicMock(),
        response=MagicMock(status_code=404),
    )
    failing_response = MagicMock(status_code=404)
    failing_response.raise_for_status.side_effect = not_found

    with patch("app.services.ingest.iiif_fetcher.httpx.get", return_value=failing_response), \
            patch("app.services.ingest.iiif_fetcher.time.sleep"), \
            patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
        with pytest.raises(httpx.HTTPStatusError):
            fetch_iiif_image("https://example.com/missing.jpg")
def test_fetch_iiif_image_timeout():
    """TimeoutException propagates when every mocked ``httpx.get`` call times out."""
    always_timeout = httpx.TimeoutException("timed out")

    with patch("app.services.ingest.iiif_fetcher.httpx.get", side_effect=always_timeout), \
            patch("app.services.ingest.iiif_fetcher.time.sleep"), \
            patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
        with pytest.raises(httpx.TimeoutException):
            fetch_iiif_image("https://example.com/slow.jpg", timeout=1.0)
def test_fetch_iiif_image_custom_timeout():
    """A caller-supplied timeout reaches ``httpx.get`` as the read timeout."""
    ok_response = MagicMock(status_code=200, content=_make_jpeg_bytes(50, 50))
    ok_response.raise_for_status.return_value = None

    with patch("app.services.ingest.iiif_fetcher.httpx.get", return_value=ok_response) as mock_get, \
            patch("app.services.ingest.iiif_fetcher.time.sleep"), \
            patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
        fetch_iiif_image("https://example.com/img.jpg", timeout=120.0)

    _, call_kwargs = mock_get.call_args
    timeout = call_kwargs["timeout"]
    # Expected: read follows the custom value, connect stays at 15s.
    assert timeout.read == 120.0
    assert timeout.connect == 15.0
| # --------------------------------------------------------------------------- | |
| # Tests — fetch_and_normalize (end-to-end mocké) | |
| # --------------------------------------------------------------------------- | |
def test_fetch_and_normalize_chains_correctly(tmp_path):
    """fetch_and_normalize fetches the URL once, then writes both derivatives.

    The fetch is mocked to return a 2000x1500 JPEG; the derivative is expected
    at 1500x1125.
    """
    source_url = "https://example.com/ms/f001.jpg"
    jpeg = _make_jpeg_bytes(2000, 1500)

    with patch("app.services.image.normalizer.fetch_iiif_image", return_value=jpeg) as mock_fetch:
        result = fetch_and_normalize(source_url, "corpus-test", "0001r", tmp_path)

    mock_fetch.assert_called_once_with("https://example.com/ms/f001.jpg")
    assert result.original_url == "https://example.com/ms/f001.jpg"
    assert (result.original_width, result.original_height) == (2000, 1500)
    assert (result.derivative_width, result.derivative_height) == (1500, 1125)
    assert Path(result.derivative_path).exists()
    assert Path(result.thumbnail_path).exists()
def test_fetch_and_normalize_propagates_http_error(tmp_path):
    """An HTTPStatusError raised by the (mocked) fetch reaches the caller unchanged in type."""
    forbidden = httpx.HTTPStatusError("403", request=MagicMock(), response=MagicMock())
    fetch_patch = patch("app.services.image.normalizer.fetch_iiif_image", side_effect=forbidden)

    with fetch_patch, pytest.raises(httpx.HTTPStatusError):
        fetch_and_normalize("https://example.com/img.jpg", "corpus", "f001", tmp_path)
| # --------------------------------------------------------------------------- | |
| # Tests d'intégration — URLs IIIF BnF réelles (skippés par défaut) | |
| # --------------------------------------------------------------------------- | |
@integration
def test_integration_beatus_high_res(tmp_path):
    """Fetch the full-size Beatus folio (BnF Latin 8878, f.13) over the network.

    Guarded by the ``integration`` skip marker because it issues a real HTTP
    request. Expects a source larger than 1500 px on at least one side, a
    derivative whose longest side is exactly the derivative limit, and both
    output files on disk.
    """
    info = fetch_and_normalize(_URL_BEATUS_HI, "beatus-lat8878", "0013r", tmp_path)
    assert info.original_width > 1500 or info.original_height > 1500
    assert info.derivative_width <= _MAX_DERIVATIVE_PX
    assert info.derivative_height <= _MAX_DERIVATIVE_PX
    assert max(info.derivative_width, info.derivative_height) == _MAX_DERIVATIVE_PX
    assert Path(info.derivative_path).exists()
    assert Path(info.thumbnail_path).exists()
@integration
def test_integration_beatus_low_res(tmp_path):
    """Fetch the 600 px rendition of the Beatus folio (BnF Latin 8878, f.13).

    Guarded by the ``integration`` skip marker because it issues a real HTTP
    request. The source is already small, so the derivative must not be
    upscaled: its longest side stays at or below 600 px.
    """
    info = fetch_and_normalize(_URL_BEATUS_LO, "beatus-lat8878", "0013r-600", tmp_path)
    assert info.derivative_width <= _MAX_DERIVATIVE_PX
    assert info.derivative_height <= _MAX_DERIVATIVE_PX
    # No upscaling: the derivative is no larger than the 600 px bound.
    assert max(info.derivative_width, info.derivative_height) <= 600
    assert Path(info.derivative_path).exists()
@integration
def test_integration_grandes_chroniques(tmp_path):
    """Fetch a Grandes Chroniques de France folio (BnF Français 2813) over the network.

    Guarded by the ``integration`` skip marker because it issues a real HTTP
    request. Checks that the derivative fits within the derivative limit and
    that both output files exist on disk.
    """
    info = fetch_and_normalize(_URL_GRANDES_CHRONIQUES, "grandes-chroniques", "f003", tmp_path)
    assert info.derivative_width <= _MAX_DERIVATIVE_PX
    assert info.derivative_height <= _MAX_DERIVATIVE_PX
    assert Path(info.derivative_path).exists()
    assert Path(info.thumbnail_path).exists()