Spaces:

pengyuan
/

DocAI

Running

DocAI / src /docling_parse.py

Pengyuan Li

feat: sync improvements from granite-vision-document-intelligence

39af568 about 13 hours ago

4.09 kB

	"""Docling document parsing with figure extraction and markdown export."""

	import os
	import tempfile
	from typing import Any


	def parse_document(pdf_bytes: bytes) -> dict[str, Any]:
	    """Parse a PDF with Docling and extract markdown, text, and figure regions.

	    The bytes are written to a temporary ``.pdf`` file because the converter
	    is called with a filesystem path. The file is removed before returning,
	    including when writing or converting fails.

	    Args:
	        pdf_bytes: PDF file content as bytes.

	    Returns:
	        Dictionary with keys:
	        - ``html``: Markdown export of the document, returned as-is (no HTML
	          conversion is applied here). On failure, a short HTML snippet
	          describing the problem instead.
	        - ``text``: Full extracted plain text (or an error/placeholder message).
	        - ``figures``: List of figure dicts with ``bbox``, ``page``,
	          ``caption``, and ``image``; empty on failure.

	    No ``Exception`` subclass propagates to the caller: an ``ImportError``
	    yields a fixed placeholder result, and any other error is reported through
	    the ``html``/``text`` fields.
	    """
	    try:
	        # Imported lazily so that a missing Docling installation is reported
	        # through the placeholder result below instead of failing at import
	        # time of this module.
	        from docling.datamodel.base_models import InputFormat
	        from docling.document_converter import DocumentConverter, PdfFormatOption

	        tmp_path = None
	        try:
	            # delete=False: the file must outlive the ``with`` block so the
	            # converter can open it by path. The path is recorded before
	            # writing so a failed write is still cleaned up in ``finally``.
	            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	                tmp_path = tmp.name
	                tmp.write(pdf_bytes)

	            pdf_format_option = PdfFormatOption()
	            # Ask the pipeline to render picture images so that figures carry
	            # an image, rendered at 2x scale.
	            pdf_format_option.pipeline_options.generate_picture_images = True
	            pdf_format_option.pipeline_options.images_scale = 2.0

	            converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
	            doc = converter.convert(tmp_path).document

	            markdown_text = doc.export_to_markdown()
	            text = doc.export_to_text()
	            figures = _extract_figures(doc)

	            return {"html": markdown_text, "text": text, "figures": figures}
	        finally:
	            if tmp_path is not None and os.path.exists(tmp_path):
	                os.unlink(tmp_path)

	    except ImportError as e:
	        # Covers the lazy imports above, and any ImportError raised later
	        # inside this block.
	        print(f"Docling import error: {e}, using placeholder")
	        return {
	            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
	            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
	            "figures": [],
	        }
	    except Exception as e:  # noqa: BLE001
	        import traceback
	        from html import escape

	        print(f"Docling parse error: {e}")
	        traceback.print_exc()
	        return {
	            # The message is embedded in markup, so it is escaped to keep
	            # characters such as "<" and "&" from being read as HTML.
	            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
	            "text": f"Error: {e!s}",
	            "figures": [],
	        }


	def _extract_figures(doc: Any) -> list[dict[str, Any]]:
	    """Collect body-layer pictures of ``doc`` that carry an image.

	    Each entry has ``bbox`` (``[l, t, width, height]`` read from the first
	    provenance item, or ``None`` when there is none), ``page`` (zero-based,
	    ``0`` when there is no provenance), ``caption`` and ``image``.

	    Extraction is best-effort: an unexpected error while walking the pictures
	    yields an empty list rather than failing the whole parse.
	    """
	    figures: list[dict[str, Any]] = []
	    try:
	        for figure in getattr(doc, "pictures", ()):
	            if figure.content_layer.value != "body":
	                continue

	            page_num = 0
	            bbox_list = None
	            if figure.prov:
	                first_prov = figure.prov[0]
	                page_num = first_prov.page_no - 1  # Docling is 1-based
	                bbox = first_prov.bbox
	                bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

	            caption = _resolve_caption(doc, figure)

	            if not figure.image:
	                continue
	            try:
	                pil_image = figure.image.pil_image
	            except Exception:  # noqa: BLE001
	                # An image that cannot be materialised only drops this figure.
	                continue
	            figures.append({
	                "bbox": bbox_list,
	                "page": page_num,
	                "caption": caption,
	                "image": pil_image,
	            })
	    except Exception:  # noqa: BLE001
	        return []
	    return figures


	def _resolve_caption(doc: Any, figure: Any) -> str:
	    """Return the text of the first caption reference that resolves, else ``""``.

	    Only references of the form ``#/texts/<index>`` are followed; the index is
	    looked up in ``doc.texts``.
	    """
	    for cap_ref in figure.captions or ():
	        try:
	            if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
	                idx = int(cap_ref.cref.split("/")[-1])
	                # Reject negative indices so a malformed reference cannot
	                # silently select an item from the end of the list.
	                if 0 <= idx < len(doc.texts):
	                    return doc.texts[idx].text
	        except Exception:  # noqa: BLE001
	            # A malformed reference is skipped; later ones are still tried.
	            continue
	    return ""