# Docling AIO Converter — Streamlit app (originally hosted on HuggingFace Spaces).
import streamlit as st
import tempfile
import os
import json
import zipfile
import io
import time
import traceback
import html as html_lib
from pathlib import Path
from typing import Optional, List, Dict, Any

# ── Page config ───────────────────────────────────────────────────────────────
# Must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Docling AIO Converter",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)
# ── Global CSS ────────────────────────────────────────────────────────────────
# Injected once per render; styles the hero banner, file cards, result boxes,
# metric boxes, tags, sidebar, and buttons.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
.hero {
  background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
  border: 1px solid rgba(255,255,255,0.08);
  border-radius: 16px;
  padding: 2.5rem 2rem;
  margin-bottom: 2rem;
  position: relative;
  overflow: hidden;
}
.hero::before {
  content: '';
  position: absolute;
  top: -50%;
  right: -20%;
  width: 400px;
  height: 400px;
  background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
  border-radius: 50%;
}
.hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
.hero p { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
.hero .badge {
  display: inline-flex; align-items: center; gap: 6px;
  background: rgba(99,102,241,0.25);
  border: 1px solid rgba(99,102,241,0.5);
  color: #a5b4fc;
  border-radius: 20px;
  padding: 3px 12px;
  font-size: 0.78rem;
  font-weight: 600;
  margin-right: 8px;
  margin-bottom: 1rem;
}
.section-header {
  color: #6366f1;
  font-size: 0.7rem;
  font-weight: 700;
  letter-spacing: 0.12em;
  text-transform: uppercase;
  margin: 1.4rem 0 0.6rem;
  padding-bottom: 4px;
  border-bottom: 1px solid rgba(99,102,241,0.2);
}
.file-card {
  background: #0f172a;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 0.9rem 1.1rem;
  margin-bottom: 0.5rem;
  display: flex;
  align-items: center;
  gap: 12px;
}
.file-card .status-ok { color: #4ade80; }
.file-card .status-err { color: #f87171; }
.file-card .status-wait { color: #94a3b8; }
.result-box {
  background: #0d1117;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 1.2rem;
  font-size: 0.85rem;
  color: #e2e8f0;
  max-height: 520px;
  overflow-y: auto;
  white-space: pre-wrap;
  font-family: 'JetBrains Mono', 'Fira Code', monospace;
  line-height: 1.6;
}
.metric-row {
  display: flex;
  gap: 1rem;
  margin-bottom: 1.2rem;
  flex-wrap: wrap;
}
.metric-box {
  flex: 1;
  min-width: 100px;
  background: #0f172a;
  border: 1px solid rgba(255,255,255,0.07);
  border-radius: 10px;
  padding: 0.8rem 1rem;
  text-align: center;
}
.metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
.metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }
.tag {
  display: inline-block;
  background: rgba(99,102,241,0.15);
  color: #a5b4fc;
  border-radius: 4px;
  padding: 2px 8px;
  font-size: 0.72rem;
  font-weight: 600;
  margin: 2px;
}
.tag-green { background: rgba(74,222,128,0.12); color: #4ade80; }
.tag-red { background: rgba(248,113,113,0.12); color: #f87171; }
.tag-yellow { background: rgba(251,191,36,0.12); color: #fbbf24; }
[data-testid="stSidebar"] { background: #0a0e1a; }
[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
.stButton>button {
  background: linear-gradient(135deg, #6366f1, #8b5cf6);
  color: white;
  border: none;
  border-radius: 8px;
  font-weight: 600;
  padding: 0.55rem 1.5rem;
  transition: all 0.2s;
}
.stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }
.stDownloadButton>button {
  background: #1e293b;
  color: #a5b4fc;
  border: 1px solid rgba(99,102,241,0.35);
  border-radius: 8px;
  font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
# ── Helpers ───────────────────────────────────────────────────────────────────

# Lowercase file extension -> icon shown next to the uploaded file.
# NOTE(review): icons appear mojibake-damaged from an earlier encoding issue
# ("π" etc.); preserved verbatim to avoid guessing the intended emoji.
SUPPORTED_EXTENSIONS = {
    "pdf": "π", "docx": "π", "doc": "π", "pptx": "π", "ppt": "π",
    "xlsx": "π", "xls": "π", "csv": "π", "html": "π", "htm": "π",
    "md": "π", "txt": "π", "png": "πΌοΈ", "jpg": "πΌοΈ", "jpeg": "πΌοΈ",
    "tiff": "πΌοΈ", "tif": "πΌοΈ", "bmp": "πΌοΈ", "webp": "πΌοΈ",
    "asciidoc": "π", "adoc": "π", "xml": "π", "json": "π",
}

# UI label -> output file extension used by do_export()/downloads.
OUTPUT_FORMATS = {
    "Markdown (.md)": "md",
    "HTML (.html)": "html",
    "JSON (.json)": "json",
    "Plain Text (.txt)": "txt",
    "DocTags (.doctags)": "doctags",
}

# UI label -> docling DocItemLabel value string (see resolve_labels()).
ELEMENT_LABELS = {
    "Paragraphs / Text": "paragraph",
    "Section Headers": "section_header",
    "Titles": "title",
    "Tables": "table",
    "Figures / Pictures": "picture",
    "Captions": "caption",
    "Footnotes": "footnote",
    "Formulas / Equations": "formula",
    "List Items": "list_item",
    "Code Blocks": "code",
    "Page Headers": "page_header",
    "Page Footers": "page_footer",
    "Key-Value Regions": "key_value_region",
    "Form Elements": "form",
    "Document Index": "document_index",
}
def file_icon(filename: str) -> str:
    """Return the display icon for *filename* based on its extension.

    Unknown extensions fall back to the generic document icon.
    """
    ext = Path(filename).suffix.lstrip(".").lower()
    return SUPPORTED_EXTENSIONS.get(ext, "π")
def fmt_bytes(n: int) -> str:
    """Format a byte count as a human-readable string, e.g. ``'1.5 MB'``.

    Uses 1024-based units; anything >= 1024 GB is reported in TB.
    """
    size = float(n)  # work on a copy instead of mutating the parameter
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"
def fmt_time(s: float) -> str:
    """Format a duration in seconds as ``'12.3s'`` or ``'2m 5s'``."""
    return f"{s:.1f}s" if s < 60 else f"{int(s // 60)}m {int(s % 60)}s"
# ── Lazy-load Docling (heavy) ─────────────────────────────────────────────────
def _load_docling():
    """Import docling lazily and return its relevant symbols in a dict.

    Deferring the import keeps app start-up fast; Python's module cache makes
    repeat calls cheap, so no explicit caching is needed.  Symbols that moved
    between docling versions (TableFormerMode, ImageRefMode, DocItemLabel) are
    resolved with fallbacks so the app works across releases.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        TableStructureOptions,
        EasyOcrOptions,
        TesseractCliOcrOptions,
    )
    from docling.datamodel.base_models import InputFormat, ConversionStatus
    # TableFormerMode is absent in older docling releases.
    try:
        from docling.datamodel.pipeline_options import TableFormerMode
    except ImportError:
        TableFormerMode = None
    # These types moved from docling to docling_core at some point.
    try:
        from docling_core.types.doc import ImageRefMode, DocItemLabel
    except ImportError:
        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore
    return {
        "DocumentConverter": DocumentConverter,
        "PdfFormatOption": PdfFormatOption,
        "WordFormatOption": WordFormatOption,
        "PdfPipelineOptions": PdfPipelineOptions,
        "TableStructureOptions": TableStructureOptions,
        "EasyOcrOptions": EasyOcrOptions,
        "TesseractCliOcrOptions": TesseractCliOcrOptions,
        "InputFormat": InputFormat,
        "ConversionStatus": ConversionStatus,
        "TableFormerMode": TableFormerMode,
        "ImageRefMode": ImageRefMode,
        "DocItemLabel": DocItemLabel,
    }
# ── Sidebar Config ────────────────────────────────────────────────────────────
def sidebar() -> Dict[str, Any]:
    """Render all sidebar controls and return the chosen settings as a dict.

    Keys produced: do_ocr, force_full_page_ocr, ocr_engine, ocr_languages,
    do_table_structure, table_mode, do_cell_matching, generate_page_images,
    generate_picture_images, images_scale, generate_table_images,
    selected_labels, output_format, abort_on_error, max_file_mb, and
    (format-dependent) image_mode, strict_text, indent.
    """
    cfg: Dict[str, Any] = {}
    with st.sidebar:
        st.markdown("## βοΈ Configuration")

        # OCR settings --------------------------------------------------------
        st.markdown('<div class="section-header">π OCR Settings</div>', unsafe_allow_html=True)
        cfg["do_ocr"] = st.checkbox("Enable OCR", value=True,
                                    help="Optical Character Recognition for scanned/image-based content.")
        cfg["force_full_page_ocr"] = st.checkbox("Force full-page OCR", value=False,
                                                 help="Run OCR on every page even if text layer exists.")
        cfg["ocr_engine"] = st.radio("OCR Engine", ["EasyOCR", "Tesseract"],
                                     horizontal=True,
                                     help="EasyOCR is pure-Python; Tesseract requires system install.")
        cfg["ocr_languages"] = st.multiselect(
            "OCR Languages",
            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
            default=["en"],
            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")

        # Table extraction ----------------------------------------------------
        st.markdown('<div class="section-header">π Table Extraction</div>', unsafe_allow_html=True)
        cfg["do_table_structure"] = st.checkbox("Extract table structure", value=True,
                                                help="Use TableFormer model to detect rows/columns/cells in tables.")
        cfg["table_mode"] = st.radio("TableFormer mode",
                                     ["Accurate (slower)", "Fast (lighter)"],
                                     index=0, horizontal=True,
                                     help="Accurate uses the full model; Fast is a smaller/faster variant.")
        cfg["do_cell_matching"] = st.checkbox("Cell text matching", value=True,
                                              help="Match detected cells back to underlying PDF text for accuracy.")

        # Image handling ------------------------------------------------------
        st.markdown('<div class="section-header">πΌοΈ Image & Page Rendering</div>', unsafe_allow_html=True)
        cfg["generate_page_images"] = st.checkbox("Generate page images", value=False,
                                                  help="Rasterise each page as an image (needed for embedded page images in output).")
        cfg["generate_picture_images"] = st.checkbox("Generate picture crops", value=True,
                                                     help="Extract figure/picture regions as cropped images.")
        cfg["images_scale"] = st.slider("Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
                                        help="Higher = better quality but slower & more memory.")
        cfg["generate_table_images"] = st.checkbox("Generate table images", value=False,
                                                   help="Also rasterise table regions as images.")

        # Content element filter ---------------------------------------------
        st.markdown('<div class="section-header">π Content Elements to Include</div>', unsafe_allow_html=True)
        st.caption("Uncheck elements you want to exclude from the output.")
        selected_labels = []
        for label_name, label_val in ELEMENT_LABELS.items():
            # Rarely-needed elements default to off.
            default = label_val not in ("page_header", "page_footer", "document_index",
                                        "key_value_region", "form")
            if st.checkbox(label_name, value=default, key=f"lbl_{label_val}"):
                selected_labels.append(label_val)
        cfg["selected_labels"] = selected_labels

        # Output format -------------------------------------------------------
        st.markdown('<div class="section-header">π€ Output Format</div>', unsafe_allow_html=True)
        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))

        # Format-specific options --------------------------------------------
        fmt = OUTPUT_FORMATS[cfg["output_format"]]
        if fmt in ("md", "html"):
            cfg["image_mode"] = st.selectbox(
                "Image handling in output",
                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
                help="How images appear in Markdown / HTML output.")
        if fmt == "md":
            cfg["strict_text"] = st.checkbox("Strict text mode", value=False,
                                             help="Disable Markdown enrichment; output pure text lines.")
            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)

        # Batch behaviour -----------------------------------------------------
        st.markdown('<div class="section-header">π PDF-Specific Options</div>', unsafe_allow_html=True)
        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
        cfg["max_file_mb"] = st.slider("Max file size (MB)", 5, 200, 50,
                                       help="Files larger than this will be skipped with a warning.")

        st.markdown("---")
        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) Β· IBM Research")
    return cfg
# ── Converter logic ───────────────────────────────────────────────────────────
def build_converter(cfg: Dict[str, Any], dl) -> Any:
    """Construct a ``DocumentConverter`` from the sidebar config.

    *dl* is the symbol dict returned by ``_load_docling()``.  Options that do
    not exist in the installed docling version (TableFormerMode,
    force_full_page_ocr, generate_table_images) are applied only when present.
    """
    PdfPipelineOptions = dl["PdfPipelineOptions"]
    TableStructureOptions = dl["TableStructureOptions"]
    EasyOcrOptions = dl["EasyOcrOptions"]
    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
    PdfFormatOption = dl["PdfFormatOption"]
    DocumentConverter = dl["DocumentConverter"]
    TableFormerMode = dl["TableFormerMode"]
    InputFormat = dl["InputFormat"]

    # OCR backend: EasyOCR takes a language list, Tesseract a "+"-joined string.
    ocr_options = None
    if cfg["do_ocr"]:
        if cfg["ocr_engine"] == "EasyOCR":
            ocr_options = EasyOcrOptions(lang=cfg["ocr_languages"])
        else:
            ocr_options = TesseractCliOcrOptions(lang="+".join(cfg["ocr_languages"]))

    # Table structure options.
    tbl_kwargs = {"do_cell_matching": cfg["do_cell_matching"]}
    if TableFormerMode is not None:
        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
                              if "Accurate" in cfg["table_mode"]
                              else TableFormerMode.FAST)
    tbl_opts = TableStructureOptions(**tbl_kwargs)

    # PDF pipeline options.
    pdf_opts_kwargs = dict(
        do_ocr=cfg["do_ocr"],
        do_table_structure=cfg["do_table_structure"],
        table_structure_options=tbl_opts,
        generate_page_images=cfg["generate_page_images"],
        generate_picture_images=cfg["generate_picture_images"],
        images_scale=cfg["images_scale"],
    )
    if cfg["do_ocr"] and ocr_options is not None:
        pdf_opts_kwargs["ocr_options"] = ocr_options
    if cfg["force_full_page_ocr"] and cfg["do_ocr"]:
        pdf_opts_kwargs["force_full_page_ocr"] = True
    if hasattr(PdfPipelineOptions, "generate_table_images"):
        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)
    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)

    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
    }
    return DocumentConverter(format_options=format_options)
def image_ref_mode(cfg, dl):
    """Map the sidebar's image-handling choice to a docling ``ImageRefMode``.

    "Omit images" also maps to PLACEHOLDER; actual omission is handled by the
    element-label filter, not by the image mode.  Unknown/missing choices fall
    back to PLACEHOLDER.
    """
    ImageRefMode = dl["ImageRefMode"]
    choice = cfg.get("image_mode", "Placeholder comment")
    mapping = {
        "Placeholder comment": ImageRefMode.PLACEHOLDER,
        "Embedded (base64)": ImageRefMode.EMBEDDED,
        "Referenced path": ImageRefMode.REFERENCED,
        "Omit images": ImageRefMode.PLACEHOLDER,
    }
    return mapping.get(choice, ImageRefMode.PLACEHOLDER)
def resolve_labels(cfg, dl):
    """Convert selected label strings to ``DocItemLabel`` objects.

    Values the installed docling version does not recognise are skipped.
    Returns ``None`` (meaning "use the exporter's defaults") when nothing
    resolves, rather than an empty list that would filter everything out.
    """
    DocItemLabel = dl["DocItemLabel"]
    labels = []
    for lv in cfg["selected_labels"]:
        try:
            labels.append(DocItemLabel(lv))
        except Exception:
            pass  # label value unknown to this docling version
    return labels if labels else None
def do_export(doc, cfg, dl) -> str:
    """Export a converted docling document to the user-selected format.

    Returns the serialized document as a string.  If the installed docling
    version rejects the optional kwargs (TypeError), retries with a plain
    export.  An unknown format yields "" (should not happen given
    OUTPUT_FORMATS drives the UI).
    """
    fmt = OUTPUT_FORMATS[cfg["output_format"]]
    labels = resolve_labels(cfg, dl)
    try:
        if fmt == "md":
            kwargs = dict(
                image_mode=image_ref_mode(cfg, dl),
                strict_text=cfg.get("strict_text", False),
                indent=cfg.get("indent", 4),
            )
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_markdown(**kwargs)
        elif fmt == "html":
            kwargs = dict(image_mode=image_ref_mode(cfg, dl))
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_html(**kwargs)
        elif fmt == "json":
            return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)
        elif fmt == "txt":
            kwargs = {}
            if labels is not None:
                kwargs["labels"] = labels
            return doc.export_to_text(**kwargs)
        elif fmt == "doctags":
            try:
                return doc.export_to_document_tokens()
            except AttributeError:
                # Older docling versions have no doctags export; degrade to md.
                return doc.export_to_markdown()
    except TypeError:
        # Fallback: this docling version does not accept the kwargs above.
        if fmt == "md":
            return doc.export_to_markdown()
        elif fmt == "html":
            return doc.export_to_html()
        elif fmt == "json":
            return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)
        else:
            return doc.export_to_text()
    return ""
def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
    """Run docling on a single file and export it.

    Returns, on success: ``{"ok": True, "content", "elapsed", "status",
    "meta"}``; on failure: ``{"ok": False, "error", "elapsed"[, "traceback"]}``.
    Never raises — all exceptions are captured into the result dict.
    """
    ConversionStatus = dl["ConversionStatus"]
    t0 = time.time()
    try:
        result = converter.convert(path)
        elapsed = time.time() - t0
        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}
        doc = result.document
        exported = do_export(doc, cfg, dl)

        # Best-effort metadata; each probe tolerates API differences between
        # docling versions by degrading to "N/A".
        meta: Dict[str, Any] = {}
        try:
            meta["pages"] = len(result.document.pages) if hasattr(result.document, "pages") else "N/A"
        except Exception:
            meta["pages"] = "N/A"
        try:
            # iterate_items() yields (item, level) tuples; the original code
            # inspected element [1] (the int level), which never has a label
            # and made the table count always 0.  Inspect the item instead.
            def _label(entry):
                item = entry[0] if isinstance(entry, tuple) else entry
                return str(getattr(item, "label", ""))
            meta["tables"] = sum(1 for entry in doc.iterate_items()
                                 if _label(entry).endswith("table"))
        except Exception:
            meta["tables"] = "N/A"
        try:
            meta["figures"] = len(doc.pictures) if hasattr(doc, "pictures") else "N/A"
        except Exception:
            meta["figures"] = "N/A"

        return {
            "ok": True,
            "content": exported,
            "elapsed": elapsed,
            "status": str(result.status),
            "meta": meta,
        }
    except Exception as e:
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "traceback": traceback.format_exc(),
            "elapsed": time.time() - t0,
        }
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Top-level Streamlit page: upload, convert, and present results."""
    cfg = sidebar()

    # Hero banner.
    st.markdown("""
    <div class="hero">
    <span class="badge">β‘ Powered by Docling</span>
    <span class="badge">π€ HuggingFace Spaces</span>
    <h1>π Docling AIO Converter</h1>
    <p>Parse & convert any document β PDF, DOCX, PPTX, XLSX, images, HTML and more β
    with full control over OCR, tables, figures, and output formatting.</p>
    </div>
    """, unsafe_allow_html=True)

    # Upload area --------------------------------------------------------------
    st.markdown("### π Upload Documents")
    max_mb = cfg.get("max_file_mb", 50)
    uploaded = st.file_uploader(
        f"Drag & drop files here Β· Max {max_mb} MB per file",
        accept_multiple_files=True,
        type=list(SUPPORTED_EXTENSIONS.keys()),
        help="You can upload multiple files at once for batch conversion.",
    )
    if not uploaded:
        st.info("π Upload one or more files to get started. "
                "Adjust all settings in the **sidebar** before converting.", icon="βΉοΈ")
        # Supported formats table.
        with st.expander("π Supported Input Formats"):
            cols = st.columns(4)
            for i, (file_ext, icon) in enumerate(SUPPORTED_EXTENSIONS.items()):
                cols[i % 4].markdown(f"{icon} `.{file_ext}`")
        return

    # File list ----------------------------------------------------------------
    oversized = [f for f in uploaded if f.size > max_mb * 1024 * 1024]
    valid = [f for f in uploaded if f.size <= max_mb * 1024 * 1024]
    st.markdown(f"**{len(uploaded)} file(s) selected** Β· "
                f"<span class='tag tag-green'>{len(valid)} ready</span>"
                + (f" <span class='tag tag-red'>{len(oversized)} oversized</span>"
                   if oversized else ""),
                unsafe_allow_html=True)
    for f in valid[:8]:  # preview only the first 8 files
        st.markdown(
            f"<div class='file-card'>"
            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{f.name}</span>"
            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
            f"</div>",
            unsafe_allow_html=True,
        )
    if len(valid) > 8:
        st.caption(f"β¦and {len(valid)-8} more files")
    for f in oversized:
        st.warning(f"β οΈ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")
    if not valid:
        return

    # Convert button -----------------------------------------------------------
    col_btn, col_fmt, _ = st.columns([2, 2, 4])
    with col_btn:
        run = st.button("π Convert All", use_container_width=True)
    with col_fmt:
        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)
    if not run:
        return

    # Load Docling -------------------------------------------------------------
    with st.spinner("Loading Docling models (first run downloads ~1 GB of models)β¦"):
        try:
            dl = _load_docling()
        except Exception as e:
            st.error(f"Failed to import Docling: {e}\n\n"
                     "Make sure `docling` is installed (`pip install docling`).")
            return
    with st.spinner("Building converter pipelineβ¦"):
        try:
            converter = build_converter(cfg, dl)
        except Exception as e:
            st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
            return

    # Process files ------------------------------------------------------------
    st.markdown("---")
    st.markdown("### βοΈ Processing")
    results: Dict[str, Dict] = {}
    overall_bar = st.progress(0)
    status_area = st.empty()
    with tempfile.TemporaryDirectory() as tmpdir:
        for idx, uf in enumerate(valid):
            fname = uf.name
            status_area.markdown(
                f"<div class='file-card'>"
                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
                f"<span style='flex:1;color:#e2e8f0'>{fname}</span>"
                f"<span class='status-wait'>β³ convertingβ¦</span>"
                f"</div>",
                unsafe_allow_html=True,
            )
            # Each file gets its own subdirectory so two uploads sharing a
            # basename never overwrite each other in the shared tmpdir.
            file_subdir = os.path.join(tmpdir, str(idx))
            os.makedirs(file_subdir, exist_ok=True)
            tmp_path = os.path.join(file_subdir, fname)
            # Always rewind before reading: on Streamlit re-renders the
            # UploadedFile cursor may sit at EOF, and read() would return b"".
            uf.seek(0)
            with open(tmp_path, "wb") as fh:
                fh.write(uf.read())
            result = convert_file(tmp_path, cfg, converter, dl)
            results[fname] = result
            overall_bar.progress((idx + 1) / len(valid))
            if not result["ok"] and cfg.get("abort_on_error"):
                st.error(f"β Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
                break
    status_area.empty()
    overall_bar.empty()

    # Summary metrics ----------------------------------------------------------
    ok_count = sum(1 for r in results.values() if r["ok"])
    err_count = len(results) - ok_count
    total_time = sum(r["elapsed"] for r in results.values())
    st.markdown(
        f"<div class='metric-row'>"
        f"<div class='metric-box'><div class='val'>{len(results)}</div><div class='lbl'>Files processed</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
        f"</div>",
        unsafe_allow_html=True,
    )

    # Per-file results ---------------------------------------------------------
    st.markdown("### π Results")
    ext = OUTPUT_FORMATS[cfg["output_format"]]

    # Build a ZIP of all successful outputs in memory.
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, res in results.items():
            if res["ok"]:
                zf.writestr(Path(fname).stem + f".{ext}", res["content"])
    zip_buf.seek(0)
    dl_col1, _ = st.columns([2, 4])
    with dl_col1:
        st.download_button(
            "β¬οΈ Download All as ZIP",
            data=zip_buf,
            file_name="docling_output.zip",
            mime="application/zip",
            use_container_width=True,
        )
    st.markdown("---")

    # Hoisted out of the loop: constant per run.
    mime_map = {
        "md": "text/markdown",
        "html": "text/html",
        "json": "application/json",
        "txt": "text/plain",
        "doctags": "text/plain",
    }
    for fname, res in results.items():
        icon = file_icon(fname)
        with st.expander(
            f"{icon} **{fname}** "
            + ("β " if res["ok"] else "β")
            + f" Β· {fmt_time(res['elapsed'])}",
            expanded=ok_count == 1,
        ):
            if not res["ok"]:
                st.error(f"**Error:** {res['error']}")
                if "traceback" in res:
                    # st.expander cannot be nested inside another expander
                    # (raises StreamlitAPIException), so show the traceback
                    # directly instead of wrapping it in a second expander.
                    st.code(res["traceback"], language="python")
            else:
                # Metadata strip.
                meta = res.get("meta", {})
                m_cols = st.columns(4)
                m_cols[0].metric("Pages", meta.get("pages", "β"))
                m_cols[1].metric("Tables", meta.get("tables", "β"))
                m_cols[2].metric("Figures", meta.get("figures", "β"))
                m_cols[3].metric("Time", fmt_time(res["elapsed"]))
                content = res["content"]
                out_name = Path(fname).stem + f".{ext}"
                st.download_button(
                    f"β¬οΈ Download {out_name}",
                    data=content.encode("utf-8"),
                    file_name=out_name,
                    mime=mime_map.get(ext, "text/plain"),
                    key=f"dl_{fname}",
                )
                # Preview.
                char_count = len(content)
                word_count = len(content.split())
                st.caption(f"Output: **{char_count:,} chars** Β· **{word_count:,} words**")
                if ext == "md":
                    tab1, tab2 = st.tabs(["π Rendered", "π Raw Markdown"])
                    with tab1:
                        st.markdown(content[:15000] + ("\n\n*[truncated for previewβ¦]*"
                                                       if len(content) > 15000 else ""),
                                    unsafe_allow_html=True)
                    with tab2:
                        # Escape before injecting into HTML: raw doc content
                        # may contain <, >, & that would break the div.
                        preview_raw = html_lib.escape(content[:12000])
                        suffix = "β¦[truncated]" if len(content) > 12000 else ""
                        st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
                                    unsafe_allow_html=True)
                elif ext == "html":
                    tab1, tab2 = st.tabs(["π Rendered", "π HTML Source"])
                    with tab1:
                        st.components.v1.html(content, height=600, scrolling=True)
                    with tab2:
                        st.code(content[:10000], language="html")
                elif ext == "json":
                    try:
                        st.json(json.loads(content), expanded=False)
                    except Exception:
                        st.code(content[:10000], language="json")
                else:
                    # Escape before injecting into HTML — same reason as above.
                    preview_plain = html_lib.escape(content[:12000])
                    suffix = "β¦[truncated]" if len(content) > 12000 else ""
                    st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
                                unsafe_allow_html=True)


if __name__ == "__main__":
    main()