# DoclingAIO / app.py - Streamlit front-end for Docling document conversion.
# (Repository page header: last commit "Update app.py", 69dea31, by thethinkmachine.)
import streamlit as st
import tempfile
import os
import json
import zipfile
import io
import time
import traceback
import html as html_lib
from pathlib import Path
from typing import Optional, List, Dict, Any
# ── Page config ──────────────────────────────────────────────────────────────
# Browser-tab title/icon and overall page layout.
st.set_page_config(
    page_title="Docling AIO Converter",
    # The icon literal was mis-encoded (UTF-8 bytes read through a legacy
    # code page); restored to the intended "page facing up" emoji.
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# ── CSS ───────────────────────────────────────────────────────────────────────
# Inject the app-wide stylesheet. The rules below define the classes used by
# the HTML snippets emitted elsewhere in this file (.hero, .badge,
# .section-header, .file-card, .result-box, .metric-row/.metric-box, .tag*)
# and restyle the sidebar, buttons and download buttons.
# unsafe_allow_html=True is needed so the <style> element is emitted as HTML.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
.hero {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
border: 1px solid rgba(255,255,255,0.08);
border-radius: 16px;
padding: 2.5rem 2rem;
margin-bottom: 2rem;
position: relative;
overflow: hidden;
}
.hero::before {
content: '';
position: absolute;
top: -50%;
right: -20%;
width: 400px;
height: 400px;
background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
border-radius: 50%;
}
.hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
.hero p { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
.hero .badge {
display: inline-flex; align-items: center; gap: 6px;
background: rgba(99,102,241,0.25);
border: 1px solid rgba(99,102,241,0.5);
color: #a5b4fc;
border-radius: 20px;
padding: 3px 12px;
font-size: 0.78rem;
font-weight: 600;
margin-right: 8px;
margin-bottom: 1rem;
}
.section-header {
color: #6366f1;
font-size: 0.7rem;
font-weight: 700;
letter-spacing: 0.12em;
text-transform: uppercase;
margin: 1.4rem 0 0.6rem;
padding-bottom: 4px;
border-bottom: 1px solid rgba(99,102,241,0.2);
}
.file-card {
background: #0f172a;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 0.9rem 1.1rem;
margin-bottom: 0.5rem;
display: flex;
align-items: center;
gap: 12px;
}
.file-card .status-ok { color: #4ade80; }
.file-card .status-err { color: #f87171; }
.file-card .status-wait { color: #94a3b8; }
.result-box {
background: #0d1117;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 1.2rem;
font-size: 0.85rem;
color: #e2e8f0;
max-height: 520px;
overflow-y: auto;
white-space: pre-wrap;
font-family: 'JetBrains Mono', 'Fira Code', monospace;
line-height: 1.6;
}
.metric-row {
display: flex;
gap: 1rem;
margin-bottom: 1.2rem;
flex-wrap: wrap;
}
.metric-box {
flex: 1;
min-width: 100px;
background: #0f172a;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 0.8rem 1rem;
text-align: center;
}
.metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
.metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }
.tag {
display: inline-block;
background: rgba(99,102,241,0.15);
color: #a5b4fc;
border-radius: 4px;
padding: 2px 8px;
font-size: 0.72rem;
font-weight: 600;
margin: 2px;
}
.tag-green { background: rgba(74,222,128,0.12); color: #4ade80; }
.tag-red { background: rgba(248,113,113,0.12); color: #f87171; }
.tag-yellow { background: rgba(251,191,36,0.12); color: #fbbf24; }
[data-testid="stSidebar"] { background: #0a0e1a; }
[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
.stButton>button {
background: linear-gradient(135deg, #6366f1, #8b5cf6);
color: white;
border: none;
border-radius: 8px;
font-weight: 600;
padding: 0.55rem 1.5rem;
transition: all 0.2s;
}
.stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }
.stDownloadButton>button {
background: #1e293b;
color: #a5b4fc;
border: 1px solid rgba(99,102,241,0.35);
border-radius: 8px;
font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
# ── Helpers ───────────────────────────────────────────────────────────────────
# Accepted upload extensions (lower-case, no dot) mapped to the emoji shown
# next to the file name. The keys also feed the uploader's `type=` filter.
# The icon literals had been mis-encoded (UTF-8 read through a legacy code
# page); they are restored here. Icons whose bytes were partly lost ("md",
# the image types) are inferred from context.
SUPPORTED_EXTENSIONS = {
    "pdf": "📕", "docx": "📘", "doc": "📘", "pptx": "📙", "ppt": "📙",
    "xlsx": "📗", "xls": "📗", "csv": "📊", "html": "🌐", "htm": "🌐",
    "md": "📝", "txt": "📄", "png": "🖼️", "jpg": "🖼️", "jpeg": "🖼️",
    "tiff": "🖼️", "tif": "🖼️", "bmp": "🖼️", "webp": "🖼️",
    "asciidoc": "📃", "adoc": "📃", "xml": "📑", "json": "📋",
}
# Caption shown in the "Convert to" selectbox -> short format code. The code
# selects the export branch and is also used as the output file extension.
OUTPUT_FORMATS = {
    "Markdown (.md)": "md",
    "HTML (.html)": "html",
    "JSON (.json)": "json",
    "Plain Text (.txt)": "txt",
    "DocTags (.doctags)":"doctags",
}
# Caption of each "content element" checkbox in the sidebar -> label value.
# The selected values are later passed to DocItemLabel(...) to build the
# label filter handed to the exporters.
ELEMENT_LABELS = {
    "Paragraphs / Text": "paragraph",
    "Section Headers": "section_header",
    "Titles": "title",
    "Tables": "table",
    "Figures / Pictures": "picture",
    "Captions": "caption",
    "Footnotes": "footnote",
    "Formulas / Equations":"formula",
    "List Items": "list_item",
    "Code Blocks": "code",
    "Page Headers": "page_header",
    "Page Footers": "page_footer",
    "Key-Value Regions": "key_value_region",
    "Form Elements": "form",
    "Document Index": "document_index",
}
def file_icon(filename: str) -> str:
    """Return the emoji associated with *filename*'s extension.

    The extension is compared case-insensitively and without the leading
    dot. Unknown or missing extensions get the generic document icon.
    """
    ext = Path(filename).suffix.lstrip(".").lower()
    # The fallback literal was mojibake; restored to the intended emoji.
    return SUPPORTED_EXTENSIONS.get(ext, "📄")
def fmt_bytes(n: int) -> str:
    """Format a byte count with one decimal and a B/KB/MB/GB/TB unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    position = 0
    last = len(units) - 1
    while position < last and not n < 1024:
        n /= 1024
        position += 1
    return f"{n:.1f} {units[position]}"
def fmt_time(s: float) -> str:
    """Format a duration in seconds as ``"12.3s"`` or ``"2m 5s"``.

    Durations under a minute keep one decimal; longer ones are shown as
    whole minutes and whole seconds.
    """
    if s < 60:
        return f"{s:.1f}s"
    minutes = int(s // 60)
    seconds = int(s % 60)
    return f"{minutes}m {seconds}s"
# ── Lazy-load Docling (heavy) ─────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def _load_docling():
    """Import the Docling classes used by this app and return them by name.

    The imports happen inside the function so they run only when this is
    first called; the decorator caches the returned dict afterwards.
    ``TableFormerMode`` is ``None`` when the installed Docling does not
    export it, and ``ImageRefMode`` / ``DocItemLabel`` fall back to
    ``docling.datamodel.base_models`` when ``docling_core`` lacks them.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        TableStructureOptions,
        EasyOcrOptions,
        TesseractCliOcrOptions,
    )
    from docling.datamodel.base_models import InputFormat, ConversionStatus

    # Optional: absent from some Docling versions.
    try:
        from docling.datamodel.pipeline_options import TableFormerMode
    except ImportError:
        TableFormerMode = None

    # Preferred location first, legacy location second.
    try:
        from docling_core.types.doc import ImageRefMode, DocItemLabel
    except ImportError:
        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore

    exports = dict(
        DocumentConverter=DocumentConverter,
        PdfFormatOption=PdfFormatOption,
        WordFormatOption=WordFormatOption,
        PdfPipelineOptions=PdfPipelineOptions,
        TableStructureOptions=TableStructureOptions,
        EasyOcrOptions=EasyOcrOptions,
        TesseractCliOcrOptions=TesseractCliOcrOptions,
        InputFormat=InputFormat,
        ConversionStatus=ConversionStatus,
        TableFormerMode=TableFormerMode,
        ImageRefMode=ImageRefMode,
        DocItemLabel=DocItemLabel,
    )
    return exports
# ── Sidebar Config ────────────────────────────────────────────────────────────
def sidebar() -> Dict[str, Any]:
    """Render the sidebar controls and return the selected settings.

    Returns:
        A dict with the keys ``do_ocr``, ``force_full_page_ocr``,
        ``ocr_engine``, ``ocr_languages``, ``do_table_structure``,
        ``table_mode``, ``do_cell_matching``, ``generate_page_images``,
        ``generate_picture_images``, ``images_scale``,
        ``generate_table_images``, ``selected_labels``, ``output_format``,
        ``abort_on_error`` and ``max_file_mb``. ``image_mode`` is present
        only for Markdown/HTML output; ``strict_text`` and ``indent`` only
        for Markdown output.
    """
    # Content elements whose checkbox starts unticked (rarely needed).
    default_off = frozenset(
        ("page_header", "page_footer", "document_index", "key_value_region", "form")
    )

    def section(title: str) -> None:
        # Small-caps section divider styled by the .section-header CSS class.
        st.markdown(f'<div class="section-header">{title}</div>', unsafe_allow_html=True)

    cfg: Dict[str, Any] = {}
    with st.sidebar:
        # NOTE: the emoji in the headings below had been mis-encoded
        # (mojibake) and are restored to the intended glyphs.
        st.markdown("## ⚙️ Configuration")

        # ── OCR ──────────────────────────────────────────────────────────
        section("🔍 OCR Settings")
        cfg["do_ocr"] = st.checkbox(
            "Enable OCR", value=True,
            help="Optical Character Recognition for scanned/image-based content.")
        cfg["force_full_page_ocr"] = st.checkbox(
            "Force full-page OCR", value=False,
            help="Run OCR on every page even if text layer exists.")
        cfg["ocr_engine"] = st.radio(
            "OCR Engine", ["EasyOCR", "Tesseract"],
            horizontal=True,
            help="EasyOCR is pure-Python; Tesseract requires system install.")
        cfg["ocr_languages"] = st.multiselect(
            "OCR Languages",
            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
            default=["en"],
            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")

        # ── Table extraction ─────────────────────────────────────────────
        section("📊 Table Extraction")
        cfg["do_table_structure"] = st.checkbox(
            "Extract table structure", value=True,
            help="Use TableFormer model to detect rows/columns/cells in tables.")
        cfg["table_mode"] = st.radio(
            "TableFormer mode",
            ["Accurate (slower)", "Fast (lighter)"],
            index=0, horizontal=True,
            help="Accurate uses the full model; Fast is a smaller/faster variant.")
        cfg["do_cell_matching"] = st.checkbox(
            "Cell text matching", value=True,
            help="Match detected cells back to underlying PDF text for accuracy.")

        # ── Image handling ───────────────────────────────────────────────
        section("🖼️ Image & Page Rendering")
        cfg["generate_page_images"] = st.checkbox(
            "Generate page images", value=False,
            help="Rasterise each page as an image (needed for embedded page images in output).")
        cfg["generate_picture_images"] = st.checkbox(
            "Generate picture crops", value=True,
            help="Extract figure/picture regions as cropped images.")
        cfg["images_scale"] = st.slider(
            "Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
            help="Higher = better quality but slower & more memory.")
        cfg["generate_table_images"] = st.checkbox(
            "Generate table images", value=False,
            help="Also rasterise table regions as images.")

        # ── Content elements ─────────────────────────────────────────────
        section("📋 Content Elements to Include")
        st.caption("Uncheck elements you want to exclude from the output.")
        selected_labels = []
        for label_name, label_val in ELEMENT_LABELS.items():
            ticked = st.checkbox(
                label_name,
                value=label_val not in default_off,
                key=f"lbl_{label_val}",
            )
            if ticked:
                selected_labels.append(label_val)
        cfg["selected_labels"] = selected_labels

        # ── Output format ────────────────────────────────────────────────
        section("📤 Output Format")
        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))

        # Format-specific options: only shown (and only stored in cfg) for
        # the formats they apply to, so readers must use cfg.get(...).
        fmt = OUTPUT_FORMATS[cfg["output_format"]]
        if fmt in ("md", "html"):
            cfg["image_mode"] = st.selectbox(
                "Image handling in output",
                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
                help="How images appear in Markdown / HTML output.")
        if fmt == "md":
            cfg["strict_text"] = st.checkbox(
                "Strict text mode", value=False,
                help="Disable Markdown enrichment; output pure text lines.")
            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)

        # ── Batch / size limits (shown under the PDF heading) ────────────
        section("📕 PDF-Specific Options")
        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
        cfg["max_file_mb"] = st.slider(
            "Max file size (MB)", 5, 200, 50,
            help="Files larger than this will be skipped with a warning.")

        st.markdown("---")
        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) · IBM Research")
    return cfg
# ── Converter logic ───────────────────────────────────────────────────────────
# Sidebar language codes (ISO 639-1) -> Tesseract traineddata names.
_TESSERACT_LANG_CODES = {
    "en": "eng", "de": "deu", "fr": "fra", "es": "spa", "it": "ita",
    "pt": "por", "nl": "nld", "ru": "rus", "zh": "chi_sim", "ja": "jpn",
    "ko": "kor", "ar": "ara", "hi": "hin", "pl": "pol", "cs": "ces",
    "ro": "ron", "sv": "swe", "da": "dan", "fi": "fin", "no": "nor",
    "hu": "hun", "tr": "tur",
}
# EasyOCR mostly uses the same two-letter codes; Chinese is the exception.
# NOTE(review): confirm every sidebar language is available in EasyOCR.
_EASYOCR_LANG_CODES = {"zh": "ch_sim"}


def build_converter(cfg: Dict[str, Any], dl) -> Any:
    """Construct a DocumentConverter from sidebar config.

    Args:
        cfg: Settings dict as produced by the sidebar (``do_ocr``,
            ``ocr_engine``, ``ocr_languages``, ``table_mode``, ...).
        dl: Mapping of Docling class names to classes.

    Returns:
        A ``DocumentConverter`` whose PDF pipeline reflects *cfg*.

    Fixes relative to the previous version:
      * Tesseract languages are passed as a list of Tesseract codes rather
        than a "+"-joined string of two-letter codes.
      * ``force_full_page_ocr`` is set on the OCR options object (where
        Docling defines it) instead of on the PDF pipeline options.
      * Support for ``generate_table_images`` is detected through the
        pydantic field table; ``hasattr`` on the class is false for
        pydantic v2 fields, so the option was never forwarded.
      * An empty language selection falls back to the backend's defaults.
    """
    PdfPipelineOptions = dl["PdfPipelineOptions"]
    TableStructureOptions = dl["TableStructureOptions"]
    EasyOcrOptions = dl["EasyOcrOptions"]
    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
    PdfFormatOption = dl["PdfFormatOption"]
    DocumentConverter = dl["DocumentConverter"]
    TableFormerMode = dl["TableFormerMode"]
    InputFormat = dl["InputFormat"]

    # OCR backend
    ocr_options = None
    if cfg["do_ocr"]:
        if cfg["ocr_engine"] == "EasyOCR":
            ocr_cls, code_map = EasyOcrOptions, _EASYOCR_LANG_CODES
        else:
            ocr_cls, code_map = TesseractCliOcrOptions, _TESSERACT_LANG_CODES
        ocr_kwargs: Dict[str, Any] = {}
        langs = [code_map.get(code, code) for code in cfg["ocr_languages"]]
        if langs:
            ocr_kwargs["lang"] = langs
        # Only passed when requested so option classes that predate this
        # field keep working in the default configuration.
        if cfg.get("force_full_page_ocr"):
            ocr_kwargs["force_full_page_ocr"] = True
        ocr_options = ocr_cls(**ocr_kwargs)

    # Table structure
    tbl_kwargs = {"do_cell_matching": cfg["do_cell_matching"]}
    if TableFormerMode is not None:
        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
                              if "Accurate" in cfg["table_mode"]
                              else TableFormerMode.FAST)
    tbl_opts = TableStructureOptions(**tbl_kwargs)

    # PDF pipeline
    pdf_opts_kwargs = dict(
        do_ocr=cfg["do_ocr"],
        do_table_structure=cfg["do_table_structure"],
        table_structure_options=tbl_opts,
        generate_page_images=cfg["generate_page_images"],
        generate_picture_images=cfg["generate_picture_images"],
        images_scale=cfg["images_scale"],
    )
    if ocr_options is not None:
        pdf_opts_kwargs["ocr_options"] = ocr_options

    declared_fields = getattr(PdfPipelineOptions, "model_fields", None) or {}
    if ("generate_table_images" in declared_fields
            or hasattr(PdfPipelineOptions, "generate_table_images")):
        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)

    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)
    # Only the PDF format is customised; other inputs use Docling defaults.
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
    }
    return DocumentConverter(format_options=format_options)
def image_ref_mode(cfg, dl):
    """Translate the sidebar's image-handling choice into an ImageRefMode.

    A missing or unrecognised choice yields ``PLACEHOLDER``. "Omit images"
    also yields ``PLACEHOLDER``: this function does not itself remove
    pictures from the output.
    """
    modes = dl["ImageRefMode"]
    selection = cfg.get("image_mode", "Placeholder comment")
    if selection == "Embedded (base64)":
        return modes.EMBEDDED
    if selection == "Referenced path":
        return modes.REFERENCED
    # "Placeholder comment", "Omit images" and anything unexpected.
    return modes.PLACEHOLDER
def resolve_labels(cfg, dl):
    """Turn the selected element names into ``DocItemLabel`` members.

    Args:
        cfg: Settings dict; reads ``selected_labels`` and, if present,
            ``image_mode``.
        dl: Mapping that provides the ``DocItemLabel`` enum class.

    Returns:
        A list of label members, or ``None`` when the list would be empty
        (callers then export with the library's default label set).

    Behaviour notes:
      * "Omit images" is honoured here by dropping the ``picture`` label;
        the image-mode mapping alone does not remove pictures.
      * Selecting "Paragraphs / Text" (``paragraph``) also includes the
        ``text`` label, so body text labelled ``text`` is not filtered out.
        NOTE(review): assumes the installed DocItemLabel defines "text";
        it is skipped like any other unknown value when absent.
      * Values the enum does not recognise are skipped.
    """
    DocItemLabel = dl["DocItemLabel"]
    omit_pictures = cfg.get("image_mode") == "Omit images"

    labels = []
    for value in cfg["selected_labels"]:
        if omit_pictures and value == "picture":
            continue
        candidates = ("paragraph", "text") if value == "paragraph" else (value,)
        for candidate in candidates:
            try:
                labels.append(DocItemLabel(candidate))
            except ValueError:
                # Label not defined by this docling-core version.
                continue
    return labels or None
def do_export(doc, cfg, dl) -> str:
    """Export converted document to the chosen format.

    Args:
        doc: Converted document exposing the ``export_to_*`` methods.
        cfg: Settings dict; ``output_format`` selects the branch, the other
            keys tune Markdown/HTML/text export.
        dl: Mapping of Docling class names to classes.

    Returns:
        The exported document as a string, or ``""`` for an unknown format
        code.

    Markdown, HTML and text exports are first attempted with the configured
    keyword arguments; if the installed library rejects them with a
    ``TypeError`` the export is retried with no arguments, so older library
    versions still produce output (without the custom options).
    """
    fmt = OUTPUT_FORMATS[cfg["output_format"]]

    if fmt == "json":
        return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)

    if fmt == "doctags":
        # Prefer the current method name and fall back to the older one;
        # documents exposing neither are exported as Markdown instead.
        for method_name in ("export_to_doctags", "export_to_document_tokens"):
            exporter = getattr(doc, method_name, None)
            if exporter is not None:
                return exporter()
        return doc.export_to_markdown()

    if fmt == "md":
        exporter = doc.export_to_markdown
        kwargs = dict(
            image_mode=image_ref_mode(cfg, dl),
            strict_text=cfg.get("strict_text", False),
            indent=cfg.get("indent", 4),
        )
    elif fmt == "html":
        exporter = doc.export_to_html
        kwargs = dict(image_mode=image_ref_mode(cfg, dl))
    elif fmt == "txt":
        exporter = doc.export_to_text
        kwargs = {}
    else:
        return ""

    labels = resolve_labels(cfg, dl)
    if labels is not None:
        kwargs["labels"] = labels

    try:
        return exporter(**kwargs)
    except TypeError:
        # Fallback: export without unsupported kwargs.
        return exporter()
def _count_tables(doc) -> int:
    """Count the tables in *doc*.

    Uses the document's ``tables`` collection when it has one; otherwise
    walks ``iterate_items()``. That iterator yields ``(item, level)`` pairs,
    so the label must be read from the first element. The previous code
    inspected the second element (the nesting level) and always counted 0.
    """
    tables = getattr(doc, "tables", None)
    if tables is not None:
        return len(tables)
    return sum(
        1
        for item, _level in doc.iterate_items()
        if str(getattr(item, "label", "")).endswith("table")
    )


def _collect_meta(doc) -> Dict[str, Any]:
    """Gather page/table/figure counts; any that fail become ``"N/A"``."""
    probes = {
        "pages": lambda: len(doc.pages) if hasattr(doc, "pages") else "N/A",
        "tables": lambda: _count_tables(doc),
        "figures": lambda: len(doc.pictures) if hasattr(doc, "pictures") else "N/A",
    }
    meta: Dict[str, Any] = {}
    for key, probe in probes.items():
        try:
            meta[key] = probe()
        except Exception:
            # Metadata is best-effort and must never fail a conversion.
            meta[key] = "N/A"
    return meta


def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
    """Run docling on a single file. Returns result dict.

    Args:
        path: Path of the file to convert.
        cfg: Settings dict forwarded to the exporter.
        converter: Object with a ``convert(path)`` method.
        dl: Mapping that provides ``ConversionStatus``.

    Returns:
        On success: ``{"ok": True, "content", "elapsed", "status", "meta"}``.
        On failure: ``{"ok": False, "error", "elapsed"}`` plus ``"traceback"``
        when an exception was raised. ``elapsed`` measures the conversion
        call only (not the export) on the non-exception paths.
    """
    ConversionStatus = dl["ConversionStatus"]
    t0 = time.time()
    try:
        result = converter.convert(path)
        elapsed = time.time() - t0
        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}

        doc = result.document
        exported = do_export(doc, cfg, dl)
        return {
            "ok": True,
            "content": exported,
            "elapsed": elapsed,
            "status": str(result.status),
            "meta": _collect_meta(doc),
        }
    except Exception as e:
        # Boundary handler: one bad file is reported, it does not crash the batch.
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "traceback": traceback.format_exc(),
            "elapsed": time.time() - t0,
        }
# ── Main ──────────────────────────────────────────────────────────────────────
def _unique_output_names(filenames: List[str], ext: str) -> List[str]:
    """Return one distinct ``<stem>.<ext>`` name per input filename.

    Inputs that share a stem (``report.pdf`` / ``report.docx``, or the same
    name uploaded twice) get ``_2``, ``_3``, ... suffixes so ZIP entries and
    download names never collide.
    """
    taken = set()
    names = []
    for fname in filenames:
        stem = Path(fname).stem
        candidate = f"{stem}.{ext}"
        counter = 1
        while candidate in taken:
            counter += 1
            candidate = f"{stem}_{counter}.{ext}"
        taken.add(candidate)
        names.append(candidate)
    return names


def _render_file_list(uploaded, max_mb) -> list:
    """Show the uploaded files and return those within the size limit."""
    limit_bytes = max_mb * 1024 * 1024
    oversized = [f for f in uploaded if f.size > limit_bytes]
    valid = [f for f in uploaded if f.size <= limit_bytes]

    oversized_tag = (
        f" <span class='tag tag-red'>{len(oversized)} oversized</span>" if oversized else ""
    )
    st.markdown(
        f"**{len(uploaded)} file(s) selected** · "
        f"<span class='tag tag-green'>{len(valid)} ready</span>" + oversized_tag,
        unsafe_allow_html=True,
    )
    for f in valid[:8]:  # show preview of first 8
        # File names are user-controlled and land in raw HTML: escape them.
        st.markdown(
            f"<div class='file-card'>"
            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{html_lib.escape(f.name)}</span>"
            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
            f"</div>",
            unsafe_allow_html=True,
        )
    if len(valid) > 8:
        st.caption(f"…and {len(valid)-8} more files")
    for f in oversized:
        st.warning(f"⚠️ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")
    return valid


def _process_files(valid, cfg, converter, dl) -> list:
    """Convert each upload and return ``(filename, result)`` pairs in order.

    A list of pairs is used instead of a dict keyed by filename so two
    uploads with the same name both keep their result.
    """
    results = []
    overall_bar = st.progress(0)
    status_area = st.empty()
    with tempfile.TemporaryDirectory() as tmpdir:
        for idx, uf in enumerate(valid):
            fname = uf.name
            status_area.markdown(
                f"<div class='file-card'>"
                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
                f"<span style='flex:1;color:#e2e8f0'>{html_lib.escape(fname)}</span>"
                f"<span class='status-wait'>⏳ converting…</span>"
                f"</div>",
                unsafe_allow_html=True,
            )
            # Each file gets its own subdirectory so uploads that share a
            # basename never overwrite each other inside the temp dir.
            file_subdir = os.path.join(tmpdir, str(idx))
            os.makedirs(file_subdir, exist_ok=True)
            # Keep only the final path component so an unusual upload name
            # cannot point outside the per-file directory.
            safe_name = Path(fname).name or f"upload_{idx}"
            tmp_path = os.path.join(file_subdir, safe_name)
            # Rewind first: on a Streamlit re-run the upload's cursor may be
            # at EOF, and read() would then produce a zero-byte file.
            uf.seek(0)
            with open(tmp_path, "wb") as fh:
                fh.write(uf.read())

            result = convert_file(tmp_path, cfg, converter, dl)
            results.append((fname, result))
            overall_bar.progress((idx + 1) / len(valid))
            if not result["ok"] and cfg.get("abort_on_error"):
                st.error(f"❌ Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
                break
    status_area.empty()
    overall_bar.empty()
    return results


def _render_preview(content: str, ext: str) -> None:
    """Render a (possibly truncated) preview of one converted document."""
    if ext == "md":
        tab1, tab2 = st.tabs(["📖 Rendered", "📝 Raw Markdown"])
        with tab1:
            # NOTE(security): unsafe_allow_html lets markup embedded in the
            # converted document reach the page. Kept as-is; reconsider if
            # untrusted documents are processed.
            st.markdown(content[:15000] + ("\n\n*[truncated for preview…]*"
                                           if len(content) > 15000 else ""),
                        unsafe_allow_html=True)
        with tab2:
            # Escape before injecting into HTML: document text can contain
            # <, > and & which would break the div or be interpreted.
            preview_raw = html_lib.escape(content[:12000])
            suffix = "…[truncated]" if len(content) > 12000 else ""
            st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
                        unsafe_allow_html=True)
    elif ext == "html":
        tab1, tab2 = st.tabs(["🌐 Rendered", "📝 HTML Source"])
        with tab1:
            st.components.v1.html(content, height=600, scrolling=True)
        with tab2:
            st.code(content[:10000], language="html")
    elif ext == "json":
        try:
            st.json(json.loads(content), expanded=False)
        except Exception:
            st.code(content[:10000], language="json")
    else:
        preview_plain = html_lib.escape(content[:12000])
        suffix = "…[truncated]" if len(content) > 12000 else ""
        st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
                    unsafe_allow_html=True)


def _render_results(results: list, ext: str) -> None:
    """Show summary metrics, the ZIP download and one expander per file."""
    ok_count = sum(1 for _, r in results if r["ok"])
    err_count = len(results) - ok_count
    total_time = sum(r["elapsed"] for _, r in results)
    st.markdown(
        f"<div class='metric-row'>"
        f"<div class='metric-box'><div class='val'>{len(results)}</div><div class='lbl'>Files processed</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
        f"</div>",
        unsafe_allow_html=True,
    )

    st.markdown("### 📂 Results")
    # Distinct output names for the successful files only, in result order.
    ok_names = iter(_unique_output_names([n for n, r in results if r["ok"]], ext))
    rows = [(fname, res, next(ok_names) if res["ok"] else None) for fname, res in results]

    # Build ZIP in memory
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for _, res, out_name in rows:
            if res["ok"]:
                zf.writestr(out_name, res["content"])
    dl_col1, _ = st.columns([2, 4])
    with dl_col1:
        st.download_button(
            "⬇️ Download All as ZIP",
            data=zip_buf.getvalue(),
            file_name="docling_output.zip",
            mime="application/zip",
            use_container_width=True,
            disabled=ok_count == 0,  # nothing to download if every file failed
        )
    st.markdown("---")

    mime_map = {
        "md": "text/markdown",
        "html": "text/html",
        "json": "application/json",
        "txt": "text/plain",
        "doctags": "text/plain",
    }
    for idx, (fname, res, out_name) in enumerate(rows):
        with st.expander(
            f"{file_icon(fname)} **{fname}** "
            + ("✅" if res["ok"] else "❌")
            + f" · {fmt_time(res['elapsed'])}",
            expanded=ok_count == 1,
        ):
            if not res["ok"]:
                st.error(f"**Error:** {res['error']}")
                if "traceback" in res:
                    # Shown inline: nesting an expander inside this expander
                    # is rejected by some Streamlit releases.
                    st.caption("📋 Full traceback")
                    st.code(res["traceback"], language="python")
                continue

            # Metadata strip
            meta = res.get("meta", {})
            m_cols = st.columns(4)
            m_cols[0].metric("Pages", meta.get("pages", "—"))
            m_cols[1].metric("Tables", meta.get("tables", "—"))
            m_cols[2].metric("Figures", meta.get("figures", "—"))
            m_cols[3].metric("Time", fmt_time(res["elapsed"]))

            content = res["content"]
            st.download_button(
                f"⬇️ Download {out_name}",
                data=content.encode("utf-8"),
                file_name=out_name,
                mime=mime_map.get(ext, "text/plain"),
                key=f"dl_{idx}",  # index-based: file names may repeat
            )
            char_count = len(content)
            word_count = len(content.split())
            st.caption(f"Output: **{char_count:,} chars** · **{word_count:,} words**")
            _render_preview(content, ext)


def main():
    """Build the page: sidebar, upload area, conversion run and results.

    Changes relative to the previous version:
      * Results, widget keys and ZIP entries no longer collide when two
        uploads share a file name or stem.
      * File names are HTML-escaped before being placed in raw HTML.
      * The traceback is shown inline instead of in a nested expander.
      * Mis-encoded (mojibake) emoji and punctuation in UI strings are
        restored; a few glyphs are inferred from context.
    """
    cfg = sidebar()

    # Hero
    st.markdown("""
<div class="hero">
<span class="badge">⚡ Powered by Docling</span>
<span class="badge">🤗 HuggingFace Spaces</span>
<h1>📄 Docling AIO Converter</h1>
<p>Parse &amp; convert any document — PDF, DOCX, PPTX, XLSX, images, HTML and more —
with full control over OCR, tables, figures, and output formatting.</p>
</div>
""", unsafe_allow_html=True)

    # ── Upload area ──────────────────────────────────────────────────────
    st.markdown("### 📁 Upload Documents")
    max_mb = cfg.get("max_file_mb", 50)
    uploaded = st.file_uploader(
        f"Drag & drop files here · Max {max_mb} MB per file",
        accept_multiple_files=True,
        type=list(SUPPORTED_EXTENSIONS.keys()),
        help="You can upload multiple files at once for batch conversion.",
    )
    if not uploaded:
        st.info("👆 Upload one or more files to get started. "
                "Adjust all settings in the **sidebar** before converting.", icon="ℹ️")
        # Supported formats table
        with st.expander("📋 Supported Input Formats"):
            cols = st.columns(4)
            for i, (ext, icon) in enumerate(SUPPORTED_EXTENSIONS.items()):
                cols[i % 4].markdown(f"{icon} `.{ext}`")
        return

    # ── File list ────────────────────────────────────────────────────────
    valid = _render_file_list(uploaded, max_mb)
    if not valid:
        return

    # ── Convert button ───────────────────────────────────────────────────
    col_btn, col_fmt, _ = st.columns([2, 2, 4])
    with col_btn:
        run = st.button("🚀 Convert All", use_container_width=True)
    with col_fmt:
        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)
    if not run:
        return

    # ── Load Docling ─────────────────────────────────────────────────────
    with st.spinner("Loading Docling models (first run downloads ~1 GB of models)…"):
        try:
            dl = _load_docling()
        except Exception as e:
            st.error(f"Failed to import Docling: {e}\n\n"
                     "Make sure `docling` is installed (`pip install docling`).")
            return
    with st.spinner("Building converter pipeline…"):
        try:
            converter = build_converter(cfg, dl)
        except Exception as e:
            st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
            return

    # ── Process files ────────────────────────────────────────────────────
    st.markdown("---")
    st.markdown("### ⚙️ Processing")
    results = _process_files(valid, cfg, converter, dl)

    # ── Summary + per-file results ───────────────────────────────────────
    _render_results(results, OUTPUT_FORMATS[cfg["output_format"]])
# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()