DocAI / src /docling_parse.py
Pengyuan Li
feat: sync improvements from granite-vision-document-intelligence
39af568
"""Docling document parsing with figure extraction and markdown export."""
import os
import tempfile
from typing import Any
def _resolve_caption(doc: Any, figure: Any) -> str:
    """Return the text of the first resolvable caption of *figure*, or ``""``.

    Caption references are expected to look like ``#/texts/<index>`` and are
    resolved against ``doc.texts``. References that are malformed, point
    elsewhere, or are out of range are skipped.
    """
    for cap_ref in figure.captions or ():
        try:
            cref = getattr(cap_ref, "cref", None)
            if not isinstance(cref, str) or not cref.startswith("#/texts/"):
                continue
            idx = int(cref.split("/")[-1])
            # Reject negative indices explicitly: they would otherwise
            # silently index from the end of ``doc.texts``.
            if 0 <= idx < len(doc.texts):
                return doc.texts[idx].text
        except Exception:  # noqa: BLE001
            # Best-effort: a bad reference must not abort figure extraction.
            continue
    return ""


def _extract_figures(doc: Any) -> list[dict[str, Any]]:
    """Collect body-layer pictures of *doc* that carry a rendered image.

    Each entry has the keys ``bbox``, ``page``, ``caption`` and ``image``.
    Pictures outside the ``"body"`` content layer, or without an accessible
    image, are skipped.
    """
    figures: list[dict[str, Any]] = []
    for figure in getattr(doc, "pictures", None) or ():
        if figure.content_layer.value != "body":
            continue

        page_num = 0
        bbox_list = None
        if figure.prov:
            first_prov = figure.prov[0]
            page_num = first_prov.page_no - 1  # Docling is 1-based
            bbox = first_prov.bbox
            # Values are taken as-is from Docling's bbox.
            # NOTE(review): coordinate origin is not normalised here - confirm
            # what consumers of ``bbox`` expect.
            bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

        caption = _resolve_caption(doc, figure)

        if not figure.image:
            continue
        try:
            pil_image = figure.image.pil_image
        except Exception:  # noqa: BLE001
            # Best-effort: skip a picture whose image cannot be materialised.
            continue

        figures.append({
            "bbox": bbox_list,
            "page": page_num,
            "caption": caption,
            "image": pil_image,
        })
    return figures


def parse_document(pdf_bytes: bytes) -> dict[str, Any]:
    """Parse a PDF with Docling and extract markdown, text, and figure regions.

    The bytes are written to a temporary ``.pdf`` file, converted with
    Docling (picture images enabled, image scale 2.0), and the temporary
    file is removed afterwards. This function does not raise on failure; it
    returns a placeholder or error payload with the same keys instead.

    Args:
        pdf_bytes: PDF file content as bytes.

    Returns:
        Dictionary with keys:
        - ``html``: Docling's Markdown export of the document, returned
          unchanged (no HTML wrapping is applied here). On failure, a small
          HTML snippet describing the problem.
        - ``text``: Full extracted plain text, or an error/placeholder message.
        - ``figures``: List of figure dicts with ``bbox``, ``page``,
          ``caption``, and ``image``. ``bbox`` is ``None`` and ``page`` is 0
          when the picture has no provenance. Empty on any failure.
    """
    try:
        # Imported lazily so a missing Docling install degrades to the
        # placeholder result below instead of failing at module import time.
        from docling.datamodel.base_models import InputFormat
        from docling.document_converter import DocumentConverter, PdfFormatOption

        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        # Record the path before writing so the ``finally`` below removes the
        # file even if the write itself fails.
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(pdf_bytes)

            pdf_format_option = PdfFormatOption()
            pdf_format_option.pipeline_options.generate_picture_images = True
            pdf_format_option.pipeline_options.images_scale = 2.0
            converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

            doc = converter.convert(tmp_path).document
            markdown_text = doc.export_to_markdown()
            text = doc.export_to_text()

            try:
                figures = _extract_figures(doc)
            except Exception as fig_err:  # noqa: BLE001
                # Figures are optional: keep the text result, but report why
                # extraction was abandoned instead of failing silently.
                print(f"Docling figure extraction error: {fig_err}")
                figures = []

            return {"html": markdown_text, "text": text, "figures": figures}
        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Cleanup failure must not replace a successful parse result
                # with the error payload.
                pass
    except ImportError as e:
        print(f"Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:  # noqa: BLE001
        import traceback
        from html import escape

        print(f"Docling parse error: {e}")
        traceback.print_exc()
        return {
            # Escape the message: it may contain ``<``/``&`` (paths, markup
            # from the document) and is embedded in HTML.
            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
            "text": f"Error: {e!s}",
            "figures": [],
        }