# Docling AIO Converter — Streamlit app (originally hosted on HuggingFace Spaces).
import streamlit as st
import tempfile
import os
import json
import zipfile
import io
import time
import traceback
import html as html_lib
from pathlib import Path
from typing import Optional, List, Dict, Any

# ── Page config ───────────────────────────────────────────────────────────────
# Must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Docling AIO Converter",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)
# ── Global CSS ────────────────────────────────────────────────────────────────
# Injected once per render; styles the hero banner, file cards, result boxes,
# metric boxes, tags, sidebar, and buttons.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
.hero {
  background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
  border: 1px solid rgba(255,255,255,0.08);
  border-radius: 16px;
  padding: 2.5rem 2rem;
  margin-bottom: 2rem;
  position: relative;
  overflow: hidden;
}
.hero::before {
  content: '';
  position: absolute;
  top: -50%;
  right: -20%;
  width: 400px;
  height: 400px;
  background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
  border-radius: 50%;
}
.hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
.hero p { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
.hero .badge {
  display: inline-flex; align-items: center; gap: 6px;
  background: rgba(99,102,241,0.25);
  border: 1px solid rgba(99,102,241,0.5);
  color: #a5b4fc;
  border-radius: 20px;
  padding: 3px 12px;
  font-size: 0.78rem;
  font-weight: 600;
  margin-right: 8px;
  margin-bottom: 1rem;
}
.section-header {
  color: #6366f1;
  font-size: 0.7rem;
  font-weight: 700;
  letter-spacing: 0.12em;
  text-transform: uppercase;
  margin: 1.4rem 0 0.6rem;
  padding-bottom: 4px;
  border-bottom: 1px solid rgba(99,102,241,0.2);
}
.file-card {
  background: #0f172a;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 0.9rem 1.1rem;
  margin-bottom: 0.5rem;
  display: flex;
  align-items: center;
  gap: 12px;
}
.file-card .status-ok { color: #4ade80; }
.file-card .status-err { color: #f87171; }
.file-card .status-wait { color: #94a3b8; }
.result-box {
  background: #0d1117;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 1.2rem;
  font-size: 0.85rem;
  color: #e2e8f0;
  max-height: 520px;
  overflow-y: auto;
  white-space: pre-wrap;
  font-family: 'JetBrains Mono', 'Fira Code', monospace;
  line-height: 1.6;
}
.metric-row {
  display: flex;
  gap: 1rem;
  margin-bottom: 1.2rem;
  flex-wrap: wrap;
}
.metric-box {
  flex: 1;
  min-width: 100px;
  background: #0f172a;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 0.8rem 1rem;
  text-align: center;
}
.metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
.metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }
.tag {
  display: inline-block;
  background: rgba(99,102,241,0.15);
  color: #a5b4fc;
  border-radius: 4px;
  padding: 2px 8px;
  font-size: 0.72rem;
  font-weight: 600;
  margin: 2px;
}
.tag-green { background: rgba(74,222,128,0.12); color: #4ade80; }
.tag-red { background: rgba(248,113,113,0.12); color: #f87171; }
.tag-yellow { background: rgba(251,191,36,0.12); color: #fbbf24; }
[data-testid="stSidebar"] { background: #0a0e1a; }
[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
.stButton>button {
  background: linear-gradient(135deg, #6366f1, #8b5cf6);
  color: white;
  border: none;
  border-radius: 8px;
  font-weight: 600;
  padding: 0.55rem 1.5rem;
  transition: all 0.2s;
}
.stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }
.stDownloadButton>button {
  background: #1e293b;
  color: #a5b4fc;
  border: 1px solid rgba(99,102,241,0.35);
  border-radius: 8px;
  font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
# ── Helpers ───────────────────────────────────────────────────────────────────

# Lowercase file extension -> icon shown next to the uploaded file.
# NOTE(review): icons appear mojibake-damaged from an earlier encoding issue
# ("π" etc.); preserved verbatim to avoid guessing the intended emoji.
SUPPORTED_EXTENSIONS = {
    "pdf": "π", "docx": "π", "doc": "π", "pptx": "π", "ppt": "π",
    "xlsx": "π", "xls": "π", "csv": "π", "html": "π", "htm": "π",
    "md": "π", "txt": "π", "png": "πΌοΈ", "jpg": "πΌοΈ", "jpeg": "πΌοΈ",
    "tiff": "πΌοΈ", "tif": "πΌοΈ", "bmp": "πΌοΈ", "webp": "πΌοΈ",
    "asciidoc": "π", "adoc": "π", "xml": "π", "json": "π",
}

# UI label -> output file extension used by do_export()/downloads.
OUTPUT_FORMATS = {
    "Markdown (.md)": "md",
    "HTML (.html)": "html",
    "JSON (.json)": "json",
    "Plain Text (.txt)": "txt",
    "DocTags (.doctags)": "doctags",
}

# UI label -> docling DocItemLabel value string (see resolve_labels()).
ELEMENT_LABELS = {
    "Paragraphs / Text": "paragraph",
    "Section Headers": "section_header",
    "Titles": "title",
    "Tables": "table",
    "Figures / Pictures": "picture",
    "Captions": "caption",
    "Footnotes": "footnote",
    "Formulas / Equations": "formula",
    "List Items": "list_item",
    "Code Blocks": "code",
    "Page Headers": "page_header",
    "Page Footers": "page_footer",
    "Key-Value Regions": "key_value_region",
    "Form Elements": "form",
    "Document Index": "document_index",
}
def file_icon(filename: str) -> str:
    """Return the display icon for *filename* based on its extension.

    Unknown extensions fall back to the generic document icon.
    """
    ext = Path(filename).suffix.lstrip(".").lower()
    return SUPPORTED_EXTENSIONS.get(ext, "π")
def fmt_bytes(n: int) -> str:
    """Format a byte count as a human-readable string, e.g. ``'1.5 MB'``.

    Uses 1024-based units; anything >= 1024 GB is reported in TB.
    """
    size = float(n)  # work on a copy instead of mutating the parameter
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"
def fmt_time(s: float) -> str:
    """Format a duration in seconds as ``'12.3s'`` or ``'2m 5s'``."""
    return f"{s:.1f}s" if s < 60 else f"{int(s // 60)}m {int(s % 60)}s"
# ── Lazy-load Docling (heavy) ─────────────────────────────────────────────────
def _load_docling():
    """Import docling lazily and return its relevant symbols in a dict.

    Deferring the import keeps app start-up fast; Python's module cache makes
    repeat calls cheap, so no explicit caching is needed.  Symbols that moved
    between docling versions (TableFormerMode, ImageRefMode, DocItemLabel) are
    resolved with fallbacks so the app works across releases.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        TableStructureOptions,
        EasyOcrOptions,
        TesseractCliOcrOptions,
    )
    from docling.datamodel.base_models import InputFormat, ConversionStatus
    # TableFormerMode is absent in older docling releases.
    try:
        from docling.datamodel.pipeline_options import TableFormerMode
    except ImportError:
        TableFormerMode = None
    # These types moved from docling to docling_core at some point.
    try:
        from docling_core.types.doc import ImageRefMode, DocItemLabel
    except ImportError:
        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore
    return {
        "DocumentConverter": DocumentConverter,
        "PdfFormatOption": PdfFormatOption,
        "WordFormatOption": WordFormatOption,
        "PdfPipelineOptions": PdfPipelineOptions,
        "TableStructureOptions": TableStructureOptions,
        "EasyOcrOptions": EasyOcrOptions,
        "TesseractCliOcrOptions": TesseractCliOcrOptions,
        "InputFormat": InputFormat,
        "ConversionStatus": ConversionStatus,
        "TableFormerMode": TableFormerMode,
        "ImageRefMode": ImageRefMode,
        "DocItemLabel": DocItemLabel,
    }
# ── Sidebar Config ────────────────────────────────────────────────────────────
def sidebar() -> Dict[str, Any]:
    """Render all sidebar controls and return the chosen settings as a dict.

    Keys produced: do_ocr, force_full_page_ocr, ocr_engine, ocr_languages,
    do_table_structure, table_mode, do_cell_matching, generate_page_images,
    generate_picture_images, images_scale, generate_table_images,
    selected_labels, output_format, abort_on_error, max_file_mb, and
    (format-dependent) image_mode, strict_text, indent.
    """
    cfg: Dict[str, Any] = {}
    with st.sidebar:
        st.markdown("## βοΈ Configuration")

        # OCR settings --------------------------------------------------------
        st.markdown('<div class="section-header">π OCR Settings</div>', unsafe_allow_html=True)
        cfg["do_ocr"] = st.checkbox("Enable OCR", value=True,
                                    help="Optical Character Recognition for scanned/image-based content.")
        cfg["force_full_page_ocr"] = st.checkbox("Force full-page OCR", value=False,
                                                 help="Run OCR on every page even if text layer exists.")
        cfg["ocr_engine"] = st.radio("OCR Engine", ["EasyOCR", "Tesseract"],
                                     horizontal=True,
                                     help="EasyOCR is pure-Python; Tesseract requires system install.")
        cfg["ocr_languages"] = st.multiselect(
            "OCR Languages",
            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
            default=["en"],
            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")

        # Table extraction ----------------------------------------------------
        st.markdown('<div class="section-header">π Table Extraction</div>', unsafe_allow_html=True)
        cfg["do_table_structure"] = st.checkbox("Extract table structure", value=True,
                                                help="Use TableFormer model to detect rows/columns/cells in tables.")
        cfg["table_mode"] = st.radio("TableFormer mode",
                                     ["Accurate (slower)", "Fast (lighter)"],
                                     index=0, horizontal=True,
                                     help="Accurate uses the full model; Fast is a smaller/faster variant.")
        cfg["do_cell_matching"] = st.checkbox("Cell text matching", value=True,
                                              help="Match detected cells back to underlying PDF text for accuracy.")

        # Image handling ------------------------------------------------------
        st.markdown('<div class="section-header">πΌοΈ Image & Page Rendering</div>', unsafe_allow_html=True)
        cfg["generate_page_images"] = st.checkbox("Generate page images", value=False,
                                                  help="Rasterise each page as an image (needed for embedded page images in output).")
        cfg["generate_picture_images"] = st.checkbox("Generate picture crops", value=True,
                                                     help="Extract figure/picture regions as cropped images.")
        cfg["images_scale"] = st.slider("Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
                                        help="Higher = better quality but slower & more memory.")
        cfg["generate_table_images"] = st.checkbox("Generate table images", value=False,
                                                   help="Also rasterise table regions as images.")

        # Content element filter ---------------------------------------------
        st.markdown('<div class="section-header">π Content Elements to Include</div>', unsafe_allow_html=True)
        st.caption("Uncheck elements you want to exclude from the output.")
        selected_labels = []
        for label_name, label_val in ELEMENT_LABELS.items():
            # Rarely-needed elements default to off.
            default = label_val not in ("page_header", "page_footer", "document_index",
                                        "key_value_region", "form")
            if st.checkbox(label_name, value=default, key=f"lbl_{label_val}"):
                selected_labels.append(label_val)
        cfg["selected_labels"] = selected_labels

        # Output format -------------------------------------------------------
        st.markdown('<div class="section-header">π€ Output Format</div>', unsafe_allow_html=True)
        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))

        # Format-specific options --------------------------------------------
        fmt = OUTPUT_FORMATS[cfg["output_format"]]
        if fmt in ("md", "html"):
            cfg["image_mode"] = st.selectbox(
                "Image handling in output",
                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
                help="How images appear in Markdown / HTML output.")
        if fmt == "md":
            cfg["strict_text"] = st.checkbox("Strict text mode", value=False,
                                             help="Disable Markdown enrichment; output pure text lines.")
            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)

        # Batch behaviour -----------------------------------------------------
        st.markdown('<div class="section-header">π PDF-Specific Options</div>', unsafe_allow_html=True)
        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
        cfg["max_file_mb"] = st.slider("Max file size (MB)", 5, 200, 50,
                                       help="Files larger than this will be skipped with a warning.")

        st.markdown("---")
        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) Β· IBM Research")
    return cfg
# ── Converter logic ───────────────────────────────────────────────────────────
def build_converter(cfg: Dict[str, Any], dl) -> Any:
    """Construct a ``DocumentConverter`` from the sidebar config.

    *dl* is the symbol dict returned by ``_load_docling()``.  Options that do
    not exist in the installed docling version (TableFormerMode,
    force_full_page_ocr, generate_table_images) are applied only when present.
    """
    PdfPipelineOptions = dl["PdfPipelineOptions"]
    TableStructureOptions = dl["TableStructureOptions"]
    EasyOcrOptions = dl["EasyOcrOptions"]
    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
    PdfFormatOption = dl["PdfFormatOption"]
    DocumentConverter = dl["DocumentConverter"]
    TableFormerMode = dl["TableFormerMode"]
    InputFormat = dl["InputFormat"]

    # OCR backend: EasyOCR takes a language list, Tesseract a "+"-joined string.
    ocr_options = None
    if cfg["do_ocr"]:
        if cfg["ocr_engine"] == "EasyOCR":
            ocr_options = EasyOcrOptions(lang=cfg["ocr_languages"])
        else:
            ocr_options = TesseractCliOcrOptions(lang="+".join(cfg["ocr_languages"]))

    # Table structure options.
    tbl_kwargs = {"do_cell_matching": cfg["do_cell_matching"]}
    if TableFormerMode is not None:
        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
                              if "Accurate" in cfg["table_mode"]
                              else TableFormerMode.FAST)
    tbl_opts = TableStructureOptions(**tbl_kwargs)

    # PDF pipeline options.
    pdf_opts_kwargs = dict(
        do_ocr=cfg["do_ocr"],
        do_table_structure=cfg["do_table_structure"],
        table_structure_options=tbl_opts,
        generate_page_images=cfg["generate_page_images"],
        generate_picture_images=cfg["generate_picture_images"],
        images_scale=cfg["images_scale"],
    )
    if cfg["do_ocr"] and ocr_options is not None:
        pdf_opts_kwargs["ocr_options"] = ocr_options
    if cfg["force_full_page_ocr"] and cfg["do_ocr"]:
        pdf_opts_kwargs["force_full_page_ocr"] = True
    if hasattr(PdfPipelineOptions, "generate_table_images"):
        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)
    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)

    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
    }
    return DocumentConverter(format_options=format_options)
def image_ref_mode(cfg, dl):
    """Map the sidebar's image-handling choice to a docling ``ImageRefMode``.

    "Omit images" also maps to PLACEHOLDER; actual omission is handled by the
    element-label filter, not by the image mode.  Unknown/missing choices fall
    back to PLACEHOLDER.
    """
    ImageRefMode = dl["ImageRefMode"]
    choice = cfg.get("image_mode", "Placeholder comment")
    mapping = {
        "Placeholder comment": ImageRefMode.PLACEHOLDER,
        "Embedded (base64)": ImageRefMode.EMBEDDED,
        "Referenced path": ImageRefMode.REFERENCED,
        "Omit images": ImageRefMode.PLACEHOLDER,
    }
    return mapping.get(choice, ImageRefMode.PLACEHOLDER)
def resolve_labels(cfg, dl):
    """Convert selected label strings to ``DocItemLabel`` objects.

    Values the installed docling version does not recognise are skipped.
    Returns ``None`` (meaning "use the exporter's defaults") when nothing
    resolves, rather than an empty list that would filter everything out.
    """
    DocItemLabel = dl["DocItemLabel"]
    labels = []
    for lv in cfg["selected_labels"]:
        try:
            labels.append(DocItemLabel(lv))
        except Exception:
            pass  # label value unknown to this docling version
    return labels if labels else None
def do_export(doc, cfg, dl) -> str:
    """Export a converted docling document to the user-selected format.

    Returns the serialized document as a string.  If the installed docling
    version rejects the optional kwargs (TypeError), retries with a plain
    export.  An unknown format yields "" (should not happen given
    OUTPUT_FORMATS drives the UI).
    """
    fmt = OUTPUT_FORMATS[cfg["output_format"]]
    labels = resolve_labels(cfg, dl)
    try:
        if fmt == "md":
            kwargs = dict(
                image_mode=image_ref_mode(cfg, dl),
                strict_text=cfg.get("strict_text", False),
                indent=cfg.get("indent", 4),
            )
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_markdown(**kwargs)
        elif fmt == "html":
            kwargs = dict(image_mode=image_ref_mode(cfg, dl))
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_html(**kwargs)
        elif fmt == "json":
            return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)
        elif fmt == "txt":
            kwargs = {}
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_text(**kwargs)
        elif fmt == "doctags":
            try:
                return doc.export_to_document_tokens()
            except AttributeError:
                # Older docling versions have no doctags export; degrade to md.
                return doc.export_to_markdown()
    except TypeError:
        # Fallback: this docling version does not accept the kwargs above.
        if fmt == "md":
            return doc.export_to_markdown()
        elif fmt == "html":
            return doc.export_to_html()
        elif fmt == "json":
            return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)
        else:
            return doc.export_to_text()
    return ""
def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
    """Run docling on a single file and export it.

    Returns, on success: ``{"ok": True, "content", "elapsed", "status",
    "meta"}``; on failure: ``{"ok": False, "error", "elapsed"[, "traceback"]}``.
    Never raises — all exceptions are captured into the result dict.
    """
    ConversionStatus = dl["ConversionStatus"]
    t0 = time.time()
    try:
        result = converter.convert(path)
        elapsed = time.time() - t0
        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}
        doc = result.document
        exported = do_export(doc, cfg, dl)

        # Best-effort metadata; each probe tolerates API differences between
        # docling versions by degrading to "N/A".
        meta: Dict[str, Any] = {}
        try:
            meta["pages"] = len(result.document.pages) if hasattr(result.document, "pages") else "N/A"
        except Exception:
            meta["pages"] = "N/A"
        try:
            # iterate_items() yields (item, level) tuples; the original code
            # inspected element [1] (the int level), which never has a label
            # and made the table count always 0.  Inspect the item instead.
            def _label(entry):
                item = entry[0] if isinstance(entry, tuple) else entry
                return str(getattr(item, "label", ""))
            meta["tables"] = sum(1 for entry in doc.iterate_items()
                                 if _label(entry).endswith("table"))
        except Exception:
            meta["tables"] = "N/A"
        try:
            meta["figures"] = len(doc.pictures) if hasattr(doc, "pictures") else "N/A"
        except Exception:
            meta["figures"] = "N/A"

        return {
            "ok": True,
            "content": exported,
            "elapsed": elapsed,
            "status": str(result.status),
            "meta": meta,
        }
    except Exception as e:
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "traceback": traceback.format_exc(),
            "elapsed": time.time() - t0,
        }
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Top-level Streamlit page: upload, convert, and present results."""
    cfg = sidebar()

    # Hero banner.
    st.markdown("""
    <div class="hero">
    <span class="badge">β‘ Powered by Docling</span>
    <span class="badge">π€ HuggingFace Spaces</span>
    <h1>π Docling AIO Converter</h1>
    <p>Parse & convert any document β PDF, DOCX, PPTX, XLSX, images, HTML and more β
    with full control over OCR, tables, figures, and output formatting.</p>
    </div>
    """, unsafe_allow_html=True)

    # Upload area --------------------------------------------------------------
    st.markdown("### π Upload Documents")
    max_mb = cfg.get("max_file_mb", 50)
    uploaded = st.file_uploader(
        f"Drag & drop files here Β· Max {max_mb} MB per file",
        accept_multiple_files=True,
        type=list(SUPPORTED_EXTENSIONS.keys()),
        help="You can upload multiple files at once for batch conversion.",
    )
    if not uploaded:
        st.info("π Upload one or more files to get started. "
                "Adjust all settings in the **sidebar** before converting.", icon="βΉοΈ")
        # Supported formats table.
        with st.expander("π Supported Input Formats"):
            cols = st.columns(4)
            for i, (file_ext, icon) in enumerate(SUPPORTED_EXTENSIONS.items()):
                cols[i % 4].markdown(f"{icon} `.{file_ext}`")
        return

    # File list ----------------------------------------------------------------
    oversized = [f for f in uploaded if f.size > max_mb * 1024 * 1024]
    valid = [f for f in uploaded if f.size <= max_mb * 1024 * 1024]
    st.markdown(f"**{len(uploaded)} file(s) selected** Β· "
                f"<span class='tag tag-green'>{len(valid)} ready</span>"
                + (f" <span class='tag tag-red'>{len(oversized)} oversized</span>"
                   if oversized else ""),
                unsafe_allow_html=True)
    for f in valid[:8]:  # preview only the first 8 files
        st.markdown(
            f"<div class='file-card'>"
            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{f.name}</span>"
            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
            f"</div>",
            unsafe_allow_html=True,
        )
    if len(valid) > 8:
        st.caption(f"β¦and {len(valid)-8} more files")
    for f in oversized:
        st.warning(f"β οΈ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")
    if not valid:
        return

    # Convert button -----------------------------------------------------------
    col_btn, col_fmt, _ = st.columns([2, 2, 4])
    with col_btn:
        run = st.button("π Convert All", use_container_width=True)
    with col_fmt:
        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)
    if not run:
        return

    # Load Docling -------------------------------------------------------------
    with st.spinner("Loading Docling models (first run downloads ~1 GB of models)β¦"):
        try:
            dl = _load_docling()
        except Exception as e:
            st.error(f"Failed to import Docling: {e}\n\n"
                     "Make sure `docling` is installed (`pip install docling`).")
            return
    with st.spinner("Building converter pipelineβ¦"):
        try:
            converter = build_converter(cfg, dl)
        except Exception as e:
            st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
            return

    # Process files ------------------------------------------------------------
    st.markdown("---")
    st.markdown("### βοΈ Processing")
    results: Dict[str, Dict] = {}
    overall_bar = st.progress(0)
    status_area = st.empty()
    with tempfile.TemporaryDirectory() as tmpdir:
        for idx, uf in enumerate(valid):
            fname = uf.name
            status_area.markdown(
                f"<div class='file-card'>"
                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
                f"<span style='flex:1;color:#e2e8f0'>{fname}</span>"
                f"<span class='status-wait'>β³ convertingβ¦</span>"
                f"</div>",
                unsafe_allow_html=True,
            )
            # Each file gets its own subdirectory so two uploads sharing a
            # basename never overwrite each other in the shared tmpdir.
            file_subdir = os.path.join(tmpdir, str(idx))
            os.makedirs(file_subdir, exist_ok=True)
            tmp_path = os.path.join(file_subdir, fname)
            # Always rewind before reading: on Streamlit re-renders the
            # UploadedFile cursor may sit at EOF, and read() would return b"".
            uf.seek(0)
            with open(tmp_path, "wb") as fh:
                fh.write(uf.read())
            result = convert_file(tmp_path, cfg, converter, dl)
            results[fname] = result
            overall_bar.progress((idx + 1) / len(valid))
            if not result["ok"] and cfg.get("abort_on_error"):
                st.error(f"β Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
                break
    status_area.empty()
    overall_bar.empty()

    # Summary metrics ----------------------------------------------------------
    ok_count = sum(1 for r in results.values() if r["ok"])
    err_count = len(results) - ok_count
    total_time = sum(r["elapsed"] for r in results.values())
    st.markdown(
        f"<div class='metric-row'>"
        f"<div class='metric-box'><div class='val'>{len(results)}</div><div class='lbl'>Files processed</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
        f"</div>",
        unsafe_allow_html=True,
    )

    # Per-file results ---------------------------------------------------------
    st.markdown("### π Results")
    ext = OUTPUT_FORMATS[cfg["output_format"]]

    # Build a ZIP of all successful outputs in memory.
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, res in results.items():
            if res["ok"]:
                zf.writestr(Path(fname).stem + f".{ext}", res["content"])
    zip_buf.seek(0)
    dl_col1, _ = st.columns([2, 4])
    with dl_col1:
        st.download_button(
            "β¬οΈ Download All as ZIP",
            data=zip_buf,
            file_name="docling_output.zip",
            mime="application/zip",
            use_container_width=True,
        )
    st.markdown("---")

    # Hoisted out of the loop: constant per run.
    mime_map = {
        "md": "text/markdown",
        "html": "text/html",
        "json": "application/json",
        "txt": "text/plain",
        "doctags": "text/plain",
    }
    for fname, res in results.items():
        icon = file_icon(fname)
        with st.expander(
            f"{icon} **{fname}** "
            + ("β " if res["ok"] else "β")
            + f" Β· {fmt_time(res['elapsed'])}",
            expanded=ok_count == 1,
        ):
            if not res["ok"]:
                st.error(f"**Error:** {res['error']}")
                if "traceback" in res:
                    # st.expander cannot be nested inside another expander
                    # (raises StreamlitAPIException), so show the traceback
                    # directly instead of wrapping it in a second expander.
                    st.code(res["traceback"], language="python")
            else:
                # Metadata strip.
                meta = res.get("meta", {})
                m_cols = st.columns(4)
                m_cols[0].metric("Pages", meta.get("pages", "β"))
                m_cols[1].metric("Tables", meta.get("tables", "β"))
                m_cols[2].metric("Figures", meta.get("figures", "β"))
                m_cols[3].metric("Time", fmt_time(res["elapsed"]))
                content = res["content"]
                out_name = Path(fname).stem + f".{ext}"
                st.download_button(
                    f"β¬οΈ Download {out_name}",
                    data=content.encode("utf-8"),
                    file_name=out_name,
                    mime=mime_map.get(ext, "text/plain"),
                    key=f"dl_{fname}",
                )
                # Preview.
                char_count = len(content)
                word_count = len(content.split())
                st.caption(f"Output: **{char_count:,} chars** Β· **{word_count:,} words**")
                if ext == "md":
                    tab1, tab2 = st.tabs(["π Rendered", "π Raw Markdown"])
                    with tab1:
                        st.markdown(content[:15000] + ("\n\n*[truncated for previewβ¦]*"
                                                       if len(content) > 15000 else ""),
                                    unsafe_allow_html=True)
                    with tab2:
                        # Escape before injecting into HTML: raw doc content
                        # may contain <, >, & that would break the div.
                        preview_raw = html_lib.escape(content[:12000])
                        suffix = "β¦[truncated]" if len(content) > 12000 else ""
                        st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
                                    unsafe_allow_html=True)
                elif ext == "html":
                    tab1, tab2 = st.tabs(["π Rendered", "π HTML Source"])
                    with tab1:
                        st.components.v1.html(content, height=600, scrolling=True)
                    with tab2:
                        st.code(content[:10000], language="html")
                elif ext == "json":
                    try:
                        st.json(json.loads(content), expanded=False)
                    except Exception:
                        st.code(content[:10000], language="json")
                else:
                    # Escape before injecting into HTML — same reason as above.
                    preview_plain = html_lib.escape(content[:12000])
                    suffix = "β¦[truncated]" if len(content) > 12000 else ""
                    st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
                                unsafe_allow_html=True)


if __name__ == "__main__":
    main()