Spaces:

thethinkmachine
/

DoclingAIO

Sleeping

App Files Files Community

thethinkmachine committed on Mar 27

Commit

7df3afe

verified ·

1 Parent(s): 4d25869

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +60 -0
app.py +762 -0
requirements.txt +36 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,60 @@

+FROM python:3.10-slim
+# ── System deps ───────────────────────────────────────────────────────────────
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # OpenCV / image processing
+    # (libgl1 is the real package; libgl1-mesa-glx was only a transitional
+    #  alias and is not available on newer Debian bases, which breaks the build)
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    # Tesseract binary — required by the "Tesseract" OCR engine option in app.py
+    tesseract-ocr \
+    # Fonts for document rendering
+    fonts-liberation \
+    fonts-dejavu-core \
+    # General utilities
+    wget \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# ── Working directory ─────────────────────────────────────────────────────────
+WORKDIR /app
+# ── Python deps (cached layer — only re-runs when requirements.txt changes) ───
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+ && pip install --no-cache-dir -r requirements.txt
+# ── App source ────────────────────────────────────────────────────────────────
+COPY app.py .
+# ── HF Spaces runs as a non-root user; make cache dirs writable ───────────────
+RUN mkdir -p /app/.cache /app/tmp \
+ && chmod -R 777 /app/.cache /app/tmp
+# Tell HuggingFace / torch / transformers to use our writable cache dir
+ENV HF_HOME=/app/.cache/huggingface
+ENV TORCH_HOME=/app/.cache/torch
+ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
+ENV TMPDIR=/app/tmp
+# ── Port (HF Spaces expects 7860) ─────────────────────────────────────────────
+EXPOSE 7860
+# ── Launch — ALL server flags as explicit CLI args ────────────────────────────
+# This is the only approach that cannot be silently overridden by HF's runner.
+# config.toml is NOT used here so there is no ambiguity.
+CMD ["streamlit", "run", "app.py", \
+     "--server.headless=true", \
+     "--server.port=7860", \
+     "--server.address=0.0.0.0", \
+     "--server.enableCORS=false", \
+     "--server.enableXsrfProtection=false", \
+     "--server.maxUploadSize=200", \
+     "--server.fileWatcherType=none", \
+     "--browser.gatherUsageStats=false", \
+     "--theme.primaryColor=#6366f1", \
+     "--theme.backgroundColor=#0a0e1a", \
+     "--theme.secondaryBackgroundColor=#0f172a", \
+     "--theme.textColor=#e2e8f0"]

app.py ADDED Viewed

	@@ -0,0 +1,762 @@

+import streamlit as st
+import tempfile
+import os
+import json
+import zipfile
+import io
+import time
+import traceback
+import html as html_lib   # stdlib — used to escape doc content before unsafe_allow_html injection
+from pathlib import Path
+from typing import Optional, List, Dict, Any
+# ── Page config ──────────────────────────────────────────────────────────────
+st.set_page_config(
+    page_title="Docling AIO Converter",
+    page_icon="📄",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+# ── CSS ───────────────────────────────────────────────────────────────────────
+st.markdown("""
+<style>
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
+.hero {
+    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
+    border: 1px solid rgba(255,255,255,0.08);
+    border-radius: 16px;
+    padding: 2.5rem 2rem;
+    margin-bottom: 2rem;
+    position: relative;
+    overflow: hidden;
+}
+.hero::before {
+    content: '';
+    position: absolute;
+    top: -50%;
+    right: -20%;
+    width: 400px;
+    height: 400px;
+    background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
+    border-radius: 50%;
+}
+.hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
+.hero p  { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
+.hero .badge {
+    display: inline-flex; align-items: center; gap: 6px;
+    background: rgba(99,102,241,0.25);
+    border: 1px solid rgba(99,102,241,0.5);
+    color: #a5b4fc;
+    border-radius: 20px;
+    padding: 3px 12px;
+    font-size: 0.78rem;
+    font-weight: 600;
+    margin-right: 8px;
+    margin-bottom: 1rem;
+}
+.section-header {
+    color: #6366f1;
+    font-size: 0.7rem;
+    font-weight: 700;
+    letter-spacing: 0.12em;
+    text-transform: uppercase;
+    margin: 1.4rem 0 0.6rem;
+    padding-bottom: 4px;
+    border-bottom: 1px solid rgba(99,102,241,0.2);
+}
+.file-card {
+    background: #0f172a;
+    border: 1px solid rgba(255,255,255,0.07);
+    border-radius: 10px;
+    padding: 0.9rem 1.1rem;
+    margin-bottom: 0.5rem;
+    display: flex;
+    align-items: center;
+    gap: 12px;
+}
+.file-card .status-ok   { color: #4ade80; }
+.file-card .status-err  { color: #f87171; }
+.file-card .status-wait { color: #94a3b8; }
+.result-box {
+    background: #0d1117;
+    border: 1px solid rgba(255,255,255,0.07);
+    border-radius: 10px;
+    padding: 1.2rem;
+    font-size: 0.85rem;
+    color: #e2e8f0;
+    max-height: 520px;
+    overflow-y: auto;
+    white-space: pre-wrap;
+    font-family: 'JetBrains Mono', 'Fira Code', monospace;
+    line-height: 1.6;
+}
+.metric-row {
+    display: flex;
+    gap: 1rem;
+    margin-bottom: 1.2rem;
+    flex-wrap: wrap;
+}
+.metric-box {
+    flex: 1;
+    min-width: 100px;
+    background: #0f172a;
+    border: 1px solid rgba(255,255,255,0.07);
+    border-radius: 10px;
+    padding: 0.8rem 1rem;
+    text-align: center;
+}
+.metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
+.metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }
+.tag {
+    display: inline-block;
+    background: rgba(99,102,241,0.15);
+    color: #a5b4fc;
+    border-radius: 4px;
+    padding: 2px 8px;
+    font-size: 0.72rem;
+    font-weight: 600;
+    margin: 2px;
+}
+.tag-green  { background: rgba(74,222,128,0.12); color: #4ade80; }
+.tag-red    { background: rgba(248,113,113,0.12); color: #f87171; }
+.tag-yellow { background: rgba(251,191,36,0.12);  color: #fbbf24; }
+[data-testid="stSidebar"] { background: #0a0e1a; }
+[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
+.stButton>button {
+    background: linear-gradient(135deg, #6366f1, #8b5cf6);
+    color: white;
+    border: none;
+    border-radius: 8px;
+    font-weight: 600;
+    padding: 0.55rem 1.5rem;
+    transition: all 0.2s;
+}
+.stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }
+.stDownloadButton>button {
+    background: #1e293b;
+    color: #a5b4fc;
+    border: 1px solid rgba(99,102,241,0.35);
+    border-radius: 8px;
+    font-weight: 500;
+}
+</style>
+""", unsafe_allow_html=True)
+# ── Helpers ───────────────────────────────────────────────────────────────────
+SUPPORTED_EXTENSIONS = {
+    "pdf": "📕", "docx": "📘", "doc": "📘", "pptx": "📙", "ppt": "📙",
+    "xlsx": "📗", "xls": "📗", "csv": "📊", "html": "🌐", "htm": "🌐",
+    "md": "📝", "txt": "📄", "png": "🖼️", "jpg": "🖼️", "jpeg": "🖼️",
+    "tiff": "��️", "tif": "🖼️", "bmp": "🖼️", "webp": "🖼️",
+    "asciidoc": "📃", "adoc": "📃", "xml": "📑", "json": "📋",
+}
+# Maps the label shown in the "Convert to" select box to the short format code
+# used for export dispatch and as the output file extension.
+OUTPUT_FORMATS = {
+    "Markdown (.md)":    "md",
+    "HTML (.html)":      "html",
+    "JSON (.json)":      "json",
+    "Plain Text (.txt)": "txt",
+    "DocTags (.doctags)":"doctags",
+}
+# Maps the sidebar checkbox caption to the label string that is later turned
+# into a DocItemLabel by resolve_labels().
+ELEMENT_LABELS = {
+    "Paragraphs / Text":   "paragraph",
+    "Section Headers":     "section_header",
+    "Titles":              "title",
+    "Tables":              "table",
+    "Figures / Pictures":  "picture",
+    "Captions":            "caption",
+    "Footnotes":           "footnote",
+    "Formulas / Equations":"formula",
+    "List Items":          "list_item",
+    "Code Blocks":         "code",
+    "Page Headers":        "page_header",
+    "Page Footers":        "page_footer",
+    "Key-Value Regions":   "key_value_region",
+    "Form Elements":       "form",
+    "Document Index":      "document_index",
+}
+def file_icon(filename: str) -> str:
+    ext = Path(filename).suffix.lstrip(".").lower()
+    return SUPPORTED_EXTENSIONS.get(ext, "📄")
+def fmt_bytes(n: int) -> str:
+    for unit in ("B", "KB", "MB", "GB"):
+        if n < 1024:
+            return f"{n:.1f} {unit}"
+        n /= 1024
+    return f"{n:.1f} TB"
+def fmt_time(s: float) -> str:
+    return f"{s:.1f}s" if s < 60 else f"{int(s//60)}m {int(s%60)}s"
+# ── Lazy-load Docling (heavy) ─────────────────────────────────────────────────
+@st.cache_resource(show_spinner=False)
+def _load_docling():
+    """Import docling once and cache.
+
+    Returns a dict mapping class names to the imported docling classes so the
+    rest of the app can look them up without importing docling at module load.
+    ``TableFormerMode`` is ``None`` when the installed docling does not export it.
+    """
+    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
+    from docling.datamodel.pipeline_options import (
+        PdfPipelineOptions,
+        TableStructureOptions,
+        EasyOcrOptions,
+        TesseractCliOcrOptions,
+    )
+    from docling.datamodel.base_models import InputFormat, ConversionStatus
+    # TableFormerMode is optional: callers must handle the None fallback.
+    try:
+        from docling.datamodel.pipeline_options import TableFormerMode
+    except ImportError:
+        TableFormerMode = None
+    # Prefer docling_core for the document enums; fall back to docling's own
+    # base_models module when docling_core does not provide them.
+    try:
+        from docling_core.types.doc import ImageRefMode, DocItemLabel
+    except ImportError:
+        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore
+    return {
+        "DocumentConverter":       DocumentConverter,
+        "PdfFormatOption":         PdfFormatOption,
+        "WordFormatOption":        WordFormatOption,
+        "PdfPipelineOptions":      PdfPipelineOptions,
+        "TableStructureOptions":   TableStructureOptions,
+        "EasyOcrOptions":          EasyOcrOptions,
+        "TesseractCliOcrOptions":  TesseractCliOcrOptions,
+        "InputFormat":             InputFormat,
+        "ConversionStatus":        ConversionStatus,
+        "TableFormerMode":         TableFormerMode,
+        "ImageRefMode":            ImageRefMode,
+        "DocItemLabel":            DocItemLabel,
+    }
+# ── Sidebar Config ────────────────────────────────────────────────────────────
+def sidebar() -> Dict[str, Any]:
+    """Render the configuration sidebar and return the chosen settings.
+
+    Returns a dict with the OCR, table, image, label-selection, output-format
+    and batch options. ``image_mode`` is only present when the output format is
+    Markdown or HTML; ``strict_text`` and ``indent`` only for Markdown.
+    """
+    cfg: Dict[str, Any] = {}
+    with st.sidebar:
+        st.markdown("## ⚙️ Configuration")
+        # ── OCR ──────────────────────────────────────────────────────────────
+        st.markdown('<div class="section-header">🔍 OCR Settings</div>', unsafe_allow_html=True)
+        cfg["do_ocr"] = st.checkbox("Enable OCR", value=True,
+            help="Optical Character Recognition for scanned/image-based content.")
+        cfg["force_full_page_ocr"] = st.checkbox("Force full-page OCR", value=False,
+            help="Run OCR on every page even if text layer exists.")
+        cfg["ocr_engine"] = st.radio("OCR Engine", ["EasyOCR", "Tesseract"],
+            horizontal=True,
+            help="EasyOCR is pure-Python; Tesseract requires system install.")
+        cfg["ocr_languages"] = st.multiselect(
+            "OCR Languages",
+            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
+             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
+            default=["en"],
+            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")
+        # ── Table Extraction ──────────────────────────────────────────────────
+        st.markdown('<div class="section-header">📊 Table Extraction</div>', unsafe_allow_html=True)
+        cfg["do_table_structure"] = st.checkbox("Extract table structure", value=True,
+            help="Use TableFormer model to detect rows/columns/cells in tables.")
+        cfg["table_mode"] = st.radio("TableFormer mode",
+            ["Accurate (slower)", "Fast (lighter)"],
+            index=0, horizontal=True,
+            help="Accurate uses the full model; Fast is a smaller/faster variant.")
+        cfg["do_cell_matching"] = st.checkbox("Cell text matching", value=True,
+            help="Match detected cells back to underlying PDF text for accuracy.")
+        # ── Image Handling ────────────────────────────────────────────────────
+        st.markdown('<div class="section-header">🖼️ Image & Page Rendering</div>', unsafe_allow_html=True)
+        cfg["generate_page_images"] = st.checkbox("Generate page images", value=False,
+            help="Rasterise each page as an image (needed for embedded page images in output).")
+        cfg["generate_picture_images"] = st.checkbox("Generate picture crops", value=True,
+            help="Extract figure/picture regions as cropped images.")
+        cfg["images_scale"] = st.slider("Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
+            help="Higher = better quality but slower & more memory.")
+        cfg["generate_table_images"] = st.checkbox("Generate table images", value=False,
+            help="Also rasterise table regions as images.")
+        # ── Content Elements ──────────────────────────────────────────────────
+        st.markdown('<div class="section-header">📋 Content Elements to Include</div>', unsafe_allow_html=True)
+        st.caption("Uncheck elements you want to exclude from the output.")
+        # One checkbox per entry in ELEMENT_LABELS; checked ones are collected
+        # as their label strings.
+        selected_labels = []
+        for label_name, label_val in ELEMENT_LABELS.items():
+            default = True
+            # default off for things rarely needed
+            if label_val in ("page_header", "page_footer", "document_index",
+                             "key_value_region", "form"):
+                default = False
+            if st.checkbox(label_name, value=default, key=f"lbl_{label_val}"):
+                selected_labels.append(label_val)
+        cfg["selected_labels"] = selected_labels
+        # ── Output Format ─────────────────────────────────────────────────────
+        st.markdown('<div class="section-header">📤 Output Format</div>', unsafe_allow_html=True)
+        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))
+        # ── Format-specific options ───────────────────────────────────────────
+        # These widgets (and their cfg keys) only exist for the matching formats.
+        fmt = OUTPUT_FORMATS[cfg["output_format"]]
+        if fmt in ("md", "html"):
+            cfg["image_mode"] = st.selectbox(
+                "Image handling in output",
+                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
+                help="How images appear in Markdown / HTML output.")
+        if fmt == "md":
+            cfg["strict_text"] = st.checkbox("Strict text mode", value=False,
+                help="Disable Markdown enrichment; output pure text lines.")
+            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)
+        # ── PDF-specific ──────────────────────────────────────────────────────
+        # NOTE(review): the two options below are batch-level settings used by
+        # main() for every input type, despite the section title.
+        st.markdown('<div class="section-header">📕 PDF-Specific Options</div>', unsafe_allow_html=True)
+        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
+        cfg["max_file_mb"] = st.slider("Max file size (MB)", 5, 200, 50,
+            help="Files larger than this will be skipped with a warning.")
+        st.markdown("---")
+        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) · IBM Research")
+    return cfg
+# ── Converter logic ───────────────────────────────────────────────────────────
+def build_converter(cfg: Dict[str, Any], dl) -> Any:
+    """Construct a DocumentConverter from sidebar config."""
+    PdfPipelineOptions   = dl["PdfPipelineOptions"]
+    TableStructureOptions = dl["TableStructureOptions"]
+    EasyOcrOptions       = dl["EasyOcrOptions"]
+    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
+    PdfFormatOption      = dl["PdfFormatOption"]
+    DocumentConverter    = dl["DocumentConverter"]
+    TableFormerMode      = dl["TableFormerMode"]
+    InputFormat          = dl["InputFormat"]
+    # OCR backend
+    ocr_options = None
+    if cfg["do_ocr"]:
+        if cfg["ocr_engine"] == "EasyOCR":
+            ocr_options = EasyOcrOptions(lang=cfg["ocr_languages"])
+        else:
+            ocr_options = TesseractCliOcrOptions(lang="+".join(cfg["ocr_languages"]))
+    # Table structure
+    tbl_kwargs = {"do_cell_matching": cfg["do_cell_matching"]}
+    if TableFormerMode is not None:
+        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
+                              if "Accurate" in cfg["table_mode"]
+                              else TableFormerMode.FAST)
+    tbl_opts = TableStructureOptions(**tbl_kwargs)
+    # PDF pipeline
+    pdf_opts_kwargs = dict(
+        do_ocr=cfg["do_ocr"],
+        do_table_structure=cfg["do_table_structure"],
+        table_structure_options=tbl_opts,
+        generate_page_images=cfg["generate_page_images"],
+        generate_picture_images=cfg["generate_picture_images"],
+        images_scale=cfg["images_scale"],
+    )
+    if cfg["do_ocr"] and ocr_options is not None:
+        pdf_opts_kwargs["ocr_options"] = ocr_options
+    if cfg["force_full_page_ocr"] and cfg["do_ocr"]:
+        pdf_opts_kwargs["force_full_page_ocr"] = True
+    if hasattr(PdfPipelineOptions, "generate_table_images"):
+        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)
+    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)
+    format_options = {
+        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
+    }
+    converter = DocumentConverter(format_options=format_options)
+    return converter
+def image_ref_mode(cfg, dl):
+    ImageRefMode = dl["ImageRefMode"]
+    choice = cfg.get("image_mode", "Placeholder comment")
+    mapping = {
+        "Placeholder comment": ImageRefMode.PLACEHOLDER,
+        "Embedded (base64)":   ImageRefMode.EMBEDDED,
+        "Referenced path":     ImageRefMode.REFERENCED,
+        "Omit images":         ImageRefMode.PLACEHOLDER,  # handled via labels
+    }
+    return mapping.get(choice, ImageRefMode.PLACEHOLDER)
+def resolve_labels(cfg, dl):
+    DocItemLabel = dl["DocItemLabel"]
+    # build label objects from selected string values
+    labels = []
+    label_map = {v: v for v in ELEMENT_LABELS.values()}
+    for lv in cfg["selected_labels"]:
+        try:
+            labels.append(DocItemLabel(lv))
+        except Exception:
+            pass
+    return labels if labels else None
+def do_export(doc, cfg, dl) -> str:
+    """Export converted document to the chosen format."""
+    fmt = OUTPUT_FORMATS[cfg["output_format"]]
+    labels = resolve_labels(cfg, dl)
+    try:
+        if fmt == "md":
+            kwargs = dict(
+                image_mode=image_ref_mode(cfg, dl),
+                strict_text=cfg.get("strict_text", False),
+                indent=cfg.get("indent", 4),
+            )
+            if labels is not None:
+                kwargs["labels"] = labels
+            return doc.export_to_markdown(**kwargs)
+        elif fmt == "html":
+            kwargs = dict(image_mode=image_ref_mode(cfg, dl))
+            if labels is not None:
+                kwargs["labels"] = labels
+            return doc.export_to_html(**kwargs)
+        elif fmt == "json":
+            d = doc.export_to_dict()
+            return json.dumps(d, indent=2, ensure_ascii=False)
+        elif fmt == "txt":
+            kwargs = {}
+            if labels is not None:
+                kwargs["labels"] = labels
+            return doc.export_to_text(**kwargs)
+        elif fmt == "doctags":
+            try:
+                return doc.export_to_document_tokens()
+            except AttributeError:
+                return doc.export_to_markdown()
+    except TypeError:
+        # Fallback: export without unsupported kwargs
+        if fmt == "md":
+            return doc.export_to_markdown()
+        elif fmt == "html":
+            return doc.export_to_html()
+        elif fmt == "json":
+            return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)
+        else:
+            return doc.export_to_text()
+    return ""
+def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
+    """Run docling on a single file. Returns result dict."""
+    ConversionStatus = dl["ConversionStatus"]
+    t0 = time.time()
+    try:
+        result = converter.convert(path)
+        elapsed = time.time() - t0
+        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
+            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}
+        doc = result.document
+        exported = do_export(doc, cfg, dl)
+        # metadata
+        meta = {}
+        try:
+            meta["pages"] = len(result.document.pages) if hasattr(result.document, "pages") else "N/A"
+        except Exception:
+            meta["pages"] = "N/A"
+        try:
+            meta["tables"] = len([i for i in doc.iterate_items()
+                                   if hasattr(i[1], 'label') and
+                                   str(getattr(i[1], 'label', '')).endswith('table')])
+        except Exception:
+            meta["tables"] = "N/A"
+        try:
+            meta["figures"] = len(doc.pictures) if hasattr(doc, "pictures") else "N/A"
+        except Exception:
+            meta["figures"] = "N/A"
+        return {
+            "ok": True,
+            "content": exported,
+            "elapsed": elapsed,
+            "status": str(result.status),
+            "meta": meta,
+        }
+    except Exception as e:
+        return {
+            "ok": False,
+            "error": f"{type(e).__name__}: {e}",
+            "traceback": traceback.format_exc(),
+            "elapsed": time.time() - t0,
+        }
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    cfg = sidebar()
+    # Hero
+    st.markdown("""
+    <div class="hero">
+        <span class="badge">⚡ Powered by Docling</span>
+        <span class="badge">🤗 HuggingFace Spaces</span>
+        <h1>📄 Docling AIO Converter</h1>
+        <p>Parse &amp; convert any document — PDF, DOCX, PPTX, XLSX, images, HTML and more —
+           with full control over OCR, tables, figures, and output formatting.</p>
+    </div>
+    """, unsafe_allow_html=True)
+    # ── Upload area ───────────────────────────────────────────────────────────
+    st.markdown("### 📁 Upload Documents")
+    max_mb = cfg.get("max_file_mb", 50)
+    uploaded = st.file_uploader(
+        f"Drag & drop files here · Max {max_mb} MB per file",
+        accept_multiple_files=True,
+        type=list(SUPPORTED_EXTENSIONS.keys()),
+        help="You can upload multiple files at once for batch conversion.",
+    )
+    if not uploaded:
+        st.info("👆 Upload one or more files to get started. "
+                "Adjust all settings in the **sidebar** before converting.", icon="ℹ️")
+        # Supported formats table
+        with st.expander("📋 Supported Input Formats"):
+            cols = st.columns(4)
+            items = list(SUPPORTED_EXTENSIONS.items())
+            for i, (ext, icon) in enumerate(items):
+                cols[i % 4].markdown(f"{icon} `.{ext}`")
+        return
+    # ── File list ─────────────────────────────────────────────────────────────
+    oversized = [f for f in uploaded if f.size > max_mb * 1024 * 1024]
+    valid     = [f for f in uploaded if f.size <= max_mb * 1024 * 1024]
+    st.markdown(f"**{len(uploaded)} file(s) selected** · "
+                f"<span class='tag tag-green'>{len(valid)} ready</span>"
+                + (f" <span class='tag tag-red'>{len(oversized)} oversized</span>"
+                   if oversized else ""),
+                unsafe_allow_html=True)
+    for f in valid[:8]:  # show preview of first 8
+        st.markdown(
+            f"<div class='file-card'>"
+            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
+            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{f.name}</span>"
+            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
+            f"</div>",
+            unsafe_allow_html=True,
+        )
+    if len(valid) > 8:
+        st.caption(f"…and {len(valid)-8} more files")
+    for f in oversized:
+        st.warning(f"⚠️ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")
+    if not valid:
+        return
+    # ── Convert button ────────────────────────────────────────────────────────
+    col_btn, col_fmt, _ = st.columns([2, 2, 4])
+    with col_btn:
+        run = st.button("🚀 Convert All", use_container_width=True)
+    with col_fmt:
+        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)
+    if not run:
+        return
+    # ── Load Docling ──────────────────────────────────────────────────────────
+    with st.spinner("Loading Docling models (first run downloads ~1 GB of models)…"):
+        try:
+            dl = _load_docling()
+        except Exception as e:
+            st.error(f"Failed to import Docling: {e}\n\n"
+                     "Make sure `docling` is installed (`pip install docling`).")
+            return
+    with st.spinner("Building converter pipeline…"):
+        try:
+            converter = build_converter(cfg, dl)
+        except Exception as e:
+            st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
+            return
+    # ── Process files ─────────────────────────────────────────────────────────
+    st.markdown("---")
+    st.markdown("### ⚙️ Processing")
+    results: Dict[str, Dict] = {}
+    overall_bar = st.progress(0)
+    status_area = st.empty()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        for idx, uf in enumerate(valid):
+            fname = uf.name
+            status_area.markdown(
+                f"<div class='file-card'>"
+                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
+                f"<span style='flex:1;color:#e2e8f0'>{fname}</span>"
+                f"<span class='status-wait'>⏳ converting…</span>"
+                f"</div>",
+                unsafe_allow_html=True,
+            )
+            # Fix 4: each file gets its own subdirectory so two uploaded files
+            # with the same basename (e.g. "report.pdf" from different folders)
+            # never silently overwrite each other in the shared tmpdir.
+            file_subdir = os.path.join(tmpdir, str(idx))
+            os.makedirs(file_subdir, exist_ok=True)
+            tmp_path = os.path.join(file_subdir, fname)
+            # Fix 3: always seek(0) before reading — on Streamlit re-renders
+            # the BytesIO cursor is already at EOF and uf.read() returns b"",
+            # writing a zero-byte file that Docling then silently fails on.
+            uf.seek(0)
+            with open(tmp_path, "wb") as fh:
+                fh.write(uf.read())
+            result = convert_file(tmp_path, cfg, converter, dl)
+            results[fname] = result
+            overall_bar.progress((idx + 1) / len(valid))
+            if not result["ok"] and cfg.get("abort_on_error"):
+                st.error(f"❌ Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
+                break
+    status_area.empty()
+    overall_bar.empty()
+    # ── Summary metrics ───────────────────────────────────────────────────────
+    ok_count  = sum(1 for r in results.values() if r["ok"])
+    err_count = len(results) - ok_count
+    total_time = sum(r["elapsed"] for r in results.values())
+    st.markdown(
+        f"<div class='metric-row'>"
+        f"<div class='metric-box'><div class='val'>{len(results)}</div><div class='lbl'>Files processed</div></div>"
+        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
+        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
+        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
+        f"</div>",
+        unsafe_allow_html=True,
+    )
+    # ── Per-file results ──────────────────────────────────────────────────────
+    st.markdown("### 📂 Results")
+    ext = OUTPUT_FORMATS[cfg["output_format"]]
+    # Build ZIP in memory
+    zip_buf = io.BytesIO()
+    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
+        for fname, res in results.items():
+            if res["ok"]:
+                out_name = Path(fname).stem + f".{ext}"
+                zf.writestr(out_name, res["content"])
+    zip_buf.seek(0)
+    dl_col1, dl_col2 = st.columns([2, 4])
+    with dl_col1:
+        st.download_button(
+            "⬇️ Download All as ZIP",
+            data=zip_buf,
+            file_name="docling_output.zip",
+            mime="application/zip",
+            use_container_width=True,
+        )
+    st.markdown("---")
+    for fname, res in results.items():
+        icon = file_icon(fname)
+        with st.expander(
+            f"{icon}  **{fname}**  "
+            + ("✅" if res["ok"] else "❌")
+            + f"  ·  {fmt_time(res['elapsed'])}",
+            expanded=ok_count == 1,
+        ):
+            if not res["ok"]:
+                st.error(f"**Error:** {res['error']}")
+                if "traceback" in res:
+                    with st.expander("📋 Full traceback"):
+                        st.code(res["traceback"], language="python")
+            else:
+                # Metadata strip
+                meta = res.get("meta", {})
+                m_cols = st.columns(4)
+                m_cols[0].metric("Pages",   meta.get("pages",   "—"))
+                m_cols[1].metric("Tables",  meta.get("tables",  "—"))
+                m_cols[2].metric("Figures", meta.get("figures", "—"))
+                m_cols[3].metric("Time",    fmt_time(res["elapsed"]))
+                content = res["content"]
+                out_name = Path(fname).stem + f".{ext}"
+                # Download individual file
+                mime_map = {
+                    "md": "text/markdown",
+                    "html": "text/html",
+                    "json": "application/json",
+                    "txt": "text/plain",
+                    "doctags": "text/plain",
+                }
+                st.download_button(
+                    f"⬇️ Download {out_name}",
+                    data=content.encode("utf-8"),
+                    file_name=out_name,
+                    mime=mime_map.get(ext, "text/plain"),
+                    key=f"dl_{fname}",
+                )
+                # Preview
+                char_count = len(content)
+                word_count = len(content.split())
+                st.caption(f"Output: **{char_count:,} chars** · **{word_count:,} words**")
+                if ext == "md":
+                    tab1, tab2 = st.tabs(["📖 Rendered", "📝 Raw Markdown"])
+                    with tab1:
+                        st.markdown(content[:15000] + ("\n\n*[truncated for preview…]*"
+                                                        if len(content) > 15000 else ""),
+                                    unsafe_allow_html=True)
+                    with tab2:
+                        # Fix 5: escape before injecting into HTML — raw doc content
+                        # can contain <, >, & which would break the div or be executed.
+                        preview_raw = html_lib.escape(content[:12000])
+                        suffix = "…[truncated]" if len(content) > 12000 else ""
+                        st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
+                                    unsafe_allow_html=True)
+                elif ext == "html":
+                    tab1, tab2 = st.tabs(["🌐 Rendered", "📝 HTML Source"])
+                    with tab1:
+                        st.components.v1.html(content, height=600, scrolling=True)
+                    with tab2:
+                        st.code(content[:10000], language="html")
+                elif ext == "json":
+                    try:
+                        parsed = json.loads(content)
+                        st.json(parsed, expanded=False)
+                    except Exception:
+                        st.code(content[:10000], language="json")
+                else:
+                    # Fix 5: escape before injecting into HTML — same reason as above.
+                    preview_plain = html_lib.escape(content[:12000])
+                    suffix = "…[truncated]" if len(content) > 12000 else ""
+                    st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
+                                unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+# ── Core ─────────────────────────────────────────────────────────────────────
+docling>=2.5.0
+docling-core>=2.0.0
+# ── Streamlit ─────────────────────────────────────────────────────────────────
+streamlit>=1.35.0
+# ── OCR backends ─────────────────────────────────────────────────────────────
+easyocr>=1.7.0          # Pure-Python OCR (no system deps needed)
+# pytesseract            # Tesseract wrapper — uncomment if packages.txt has tesseract
+# ── Document format support ───────────────────────────────────────────────────
+python-docx>=1.1.0      # DOCX reading/writing
+python-pptx>=0.6.23     # PPTX support
+openpyxl>=3.1.2         # XLSX support
+pandas>=2.0.0           # CSV / tabular
+beautifulsoup4>=4.12.0  # HTML parsing
+lxml>=5.0.0             # XML/HTML backend
+# ── Image processing ──────────────────────────────────────────────────────────
+Pillow>=10.0.0
+opencv-python-headless>=4.9.0  # headless for server environments
+# ── PDF ───────────────────────────────────────────────────────────────────────
+pypdfium2>=4.0.0        # Fast PDF rendering backend used by Docling
+pdfminer.six>=20221105
+# ── ML / model support ────────────────────────────────────────────────────────
+torch>=2.1.0
+torchvision>=0.16.0
+transformers>=4.40.0
+huggingface-hub>=0.20.0
+# ── Misc utilities ────────────────────────────────────────────────────────────
+requests>=2.31.0
+tqdm>=4.66.0