| """Docling document parsing with figure extraction and markdown export.""" |
|
|
| import os |
| import tempfile |
| from typing import Any |
|
|
|
|
def _caption_for(doc: Any, figure: Any) -> str:
    """Return the text of the first resolvable caption of ``figure``, or ``""``.

    Only caption references whose ``cref`` starts with ``#/texts/`` are
    considered; the trailing path segment is used as an index into
    ``doc.texts``. References that cannot be resolved are skipped (best effort).
    """
    for cap_ref in figure.captions or ():
        try:
            cref = getattr(cap_ref, "cref", None)
            if not isinstance(cref, str) or not cref.startswith("#/texts/"):
                continue
            idx = int(cref.rsplit("/", 1)[-1])
            # Lower bound guards against a negative index silently wrapping
            # around to the end of ``doc.texts``.
            if 0 <= idx < len(doc.texts):
                return doc.texts[idx].text
        except Exception as e:
            # Best effort: a malformed reference must not drop the figure.
            print(f"Docling caption lookup skipped: {e}")
    return ""


def _collect_figures(doc: Any) -> list[dict[str, Any]]:
    """Build the figure dicts for every body-layer picture that has an image.

    Returns an empty list if ``doc`` has no ``pictures`` or if extraction fails
    part-way through (all-or-nothing, matching the previous behaviour).
    """
    figures: list[dict[str, Any]] = []
    try:
        for figure in getattr(doc, "pictures", None) or ():
            # Skip pictures outside the "body" content layer.
            if figure.content_layer.value != "body":
                continue

            # Defaults used when the picture carries no provenance.
            page_num = 0
            bbox_list = None
            if figure.prov:
                first_prov = figure.prov[0]
                page_num = first_prov.page_no - 1
                bbox = first_prov.bbox
                bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

            caption = _caption_for(doc, figure)

            if not figure.image:
                continue
            try:
                pil_image = figure.image.pil_image
            except Exception as e:
                # Best effort: one undecodable image only drops that figure.
                print(f"Docling figure image skipped: {e}")
                continue

            figures.append(
                {
                    "bbox": bbox_list,
                    "page": page_num,
                    "caption": caption,
                    "image": pil_image,
                }
            )
    except Exception as e:
        print(f"Docling figure extraction error: {e}, returning no figures")
        return []
    return figures


def parse_document(pdf_bytes: bytes) -> dict[str, Any]:
    """Parse a PDF with Docling and extract markdown, text, and figure regions.

    The bytes are written to a temporary ``.pdf`` file that is handed to
    Docling's ``DocumentConverter`` by path and removed afterwards.

    Args:
        pdf_bytes: PDF file content as bytes.

    Returns:
        Dictionary with keys:
            - ``html``: The document's markdown export, returned as-is (no
              HTML conversion is applied on the success path).
            - ``text``: Full extracted plain text.
            - ``figures``: List of figure dicts with ``bbox``
              (``[l, t, width, height]`` of the first provenance entry, or
              ``None``), ``page`` (``page_no - 1`` of the first provenance
              entry, or ``0``), ``caption`` (``""`` when none resolves), and
              ``image`` (the picture's PIL image).

        If an ``ImportError`` is raised (e.g. Docling is not installed), a
        fixed placeholder result is returned. On any other ``Exception`` the
        result carries the error message in ``html`` (HTML-escaped) and
        ``text``, with an empty ``figures`` list. ``Exception`` subclasses are
        never propagated to the caller.
    """
    try:
        from docling.datamodel.base_models import InputFormat
        from docling.document_converter import DocumentConverter, PdfFormatOption

        tmp_path = None
        try:
            # delete=False so the file survives the ``with`` and can be opened
            # by path; the path is recorded *before* writing so that a failed
            # write is still cleaned up in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(pdf_bytes)

            # Ask the PDF pipeline to generate picture images (needed for the
            # ``image`` entry of each figure) at images_scale 2.0.
            pdf_format_option = PdfFormatOption()
            pdf_format_option.pipeline_options.generate_picture_images = True
            pdf_format_option.pipeline_options.images_scale = 2.0

            converter = DocumentConverter(
                format_options={InputFormat.PDF: pdf_format_option}
            )
            doc = converter.convert(tmp_path).document

            return {
                "html": doc.export_to_markdown(),
                "text": doc.export_to_text(),
                "figures": _collect_figures(doc),
            }
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except FileNotFoundError:
                    pass
                except OSError as e:
                    # A cleanup failure must not replace a successful result
                    # with the generic error payload below.
                    print(f"Docling temp file cleanup failed for {tmp_path}: {e}")

    except ImportError as e:
        print(f"Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:
        import traceback
        from html import escape

        print(f"Docling parse error: {e}")
        traceback.print_exc()
        return {
            # Escape the message: it may contain markup-significant characters.
            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
            "text": f"Error: {e!s}",
            "figures": [],
        }
|
|