thethinkmachine commited on
Commit
7df3afe
·
verified ·
1 Parent(s): 4d25869

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +60 -0
  2. app.py +762 -0
  3. requirements.txt +36 -0
Dockerfile ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# ── System deps ───────────────────────────────────────────────────────────────
# NOTE: `libgl1-mesa-glx` was a transitional package and is no longer shipped by
# current Debian releases (the base of python:3.10-slim); `libgl1` is the real
# package and exists on both old and new bases.
RUN apt-get update && apt-get install -y --no-install-recommends \
    # OpenCV / image processing
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
    # Tesseract binary for the "Tesseract" OCR engine offered in the UI
    # (English data only; add tesseract-ocr-<lang> packages for more languages)
    tesseract-ocr \
    # Fonts for document rendering
    fonts-liberation \
    fonts-dejavu-core \
    # General utilities
    wget \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# ── Working directory ─────────────────────────────────────────────────────────
WORKDIR /app

# ── Python deps (cached layer — only re-runs when requirements.txt changes) ───
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# ── App source ────────────────────────────────────────────────────────────────
COPY app.py .

# ── HF Spaces runs as a non-root user; make cache dirs writable ───────────────
RUN mkdir -p /app/.cache /app/tmp \
    && chmod -R 777 /app/.cache /app/tmp

# Tell HuggingFace / torch / transformers to use our writable cache dir
ENV HF_HOME=/app/.cache/huggingface
ENV TORCH_HOME=/app/.cache/torch
ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
# EasyOCR defaults to ~/.EasyOCR, which is not writable for the Spaces user
ENV EASYOCR_MODULE_PATH=/app/.cache/easyocr
ENV TMPDIR=/app/tmp

# ── Port (HF Spaces expects 7860) ─────────────────────────────────────────────
EXPOSE 7860

# ── Launch — ALL server flags as explicit CLI args ────────────────────────────
# This is the only approach that cannot be silently overridden by HF's runner.
# config.toml is NOT used here so there is no ambiguity.
CMD ["streamlit", "run", "app.py", \
     "--server.headless=true", \
     "--server.port=7860", \
     "--server.address=0.0.0.0", \
     "--server.enableCORS=false", \
     "--server.enableXsrfProtection=false", \
     "--server.maxUploadSize=200", \
     "--server.fileWatcherType=none", \
     "--browser.gatherUsageStats=false", \
     "--theme.primaryColor=#6366f1", \
     "--theme.backgroundColor=#0a0e1a", \
     "--theme.secondaryBackgroundColor=#0f172a", \
     "--theme.textColor=#e2e8f0"]
app.py ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import os
4
+ import json
5
+ import zipfile
6
+ import io
7
+ import time
8
+ import traceback
9
+ import html as html_lib  # stdlib — used to escape doc content before unsafe_allow_html injection
10
+ from pathlib import Path
11
+ from typing import Optional, List, Dict, Any
12
+
13
+ # ── Page config ──────────────────────────────────────────────────────────────
14
# Must be the first Streamlit call in the script. The page icon is the
# "page facing up" emoji; the scraped source carried it as mis-decoded bytes.
st.set_page_config(
    page_title="Docling AIO Converter",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
20
+
21
+ # ── CSS ───────────────────────────────────────────────────────────────────────
22
+ st.markdown("""
23
+ <style>
24
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
25
+
26
+ html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
27
+
28
+ .hero {
29
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
30
+ border: 1px solid rgba(255,255,255,0.08);
31
+ border-radius: 16px;
32
+ padding: 2.5rem 2rem;
33
+ margin-bottom: 2rem;
34
+ position: relative;
35
+ overflow: hidden;
36
+ }
37
+ .hero::before {
38
+ content: '';
39
+ position: absolute;
40
+ top: -50%;
41
+ right: -20%;
42
+ width: 400px;
43
+ height: 400px;
44
+ background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
45
+ border-radius: 50%;
46
+ }
47
+ .hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
48
+ .hero p { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
49
+ .hero .badge {
50
+ display: inline-flex; align-items: center; gap: 6px;
51
+ background: rgba(99,102,241,0.25);
52
+ border: 1px solid rgba(99,102,241,0.5);
53
+ color: #a5b4fc;
54
+ border-radius: 20px;
55
+ padding: 3px 12px;
56
+ font-size: 0.78rem;
57
+ font-weight: 600;
58
+ margin-right: 8px;
59
+ margin-bottom: 1rem;
60
+ }
61
+
62
+ .section-header {
63
+ color: #6366f1;
64
+ font-size: 0.7rem;
65
+ font-weight: 700;
66
+ letter-spacing: 0.12em;
67
+ text-transform: uppercase;
68
+ margin: 1.4rem 0 0.6rem;
69
+ padding-bottom: 4px;
70
+ border-bottom: 1px solid rgba(99,102,241,0.2);
71
+ }
72
+
73
+ .file-card {
74
+ background: #0f172a;
75
+ border: 1px solid rgba(255,255,255,0.07);
76
+ border-radius: 10px;
77
+ padding: 0.9rem 1.1rem;
78
+ margin-bottom: 0.5rem;
79
+ display: flex;
80
+ align-items: center;
81
+ gap: 12px;
82
+ }
83
+ .file-card .status-ok { color: #4ade80; }
84
+ .file-card .status-err { color: #f87171; }
85
+ .file-card .status-wait { color: #94a3b8; }
86
+
87
+ .result-box {
88
+ background: #0d1117;
89
+ border: 1px solid rgba(255,255,255,0.07);
90
+ border-radius: 10px;
91
+ padding: 1.2rem;
92
+ font-size: 0.85rem;
93
+ color: #e2e8f0;
94
+ max-height: 520px;
95
+ overflow-y: auto;
96
+ white-space: pre-wrap;
97
+ font-family: 'JetBrains Mono', 'Fira Code', monospace;
98
+ line-height: 1.6;
99
+ }
100
+
101
+ .metric-row {
102
+ display: flex;
103
+ gap: 1rem;
104
+ margin-bottom: 1.2rem;
105
+ flex-wrap: wrap;
106
+ }
107
+ .metric-box {
108
+ flex: 1;
109
+ min-width: 100px;
110
+ background: #0f172a;
111
+ border: 1px solid rgba(255,255,255,0.07);
112
+ border-radius: 10px;
113
+ padding: 0.8rem 1rem;
114
+ text-align: center;
115
+ }
116
+ .metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
117
+ .metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }
118
+
119
+ .tag {
120
+ display: inline-block;
121
+ background: rgba(99,102,241,0.15);
122
+ color: #a5b4fc;
123
+ border-radius: 4px;
124
+ padding: 2px 8px;
125
+ font-size: 0.72rem;
126
+ font-weight: 600;
127
+ margin: 2px;
128
+ }
129
+ .tag-green { background: rgba(74,222,128,0.12); color: #4ade80; }
130
+ .tag-red { background: rgba(248,113,113,0.12); color: #f87171; }
131
+ .tag-yellow { background: rgba(251,191,36,0.12); color: #fbbf24; }
132
+
133
+ [data-testid="stSidebar"] { background: #0a0e1a; }
134
+ [data-testid="stSidebar"] .block-container { padding-top: 1rem; }
135
+
136
+ .stButton>button {
137
+ background: linear-gradient(135deg, #6366f1, #8b5cf6);
138
+ color: white;
139
+ border: none;
140
+ border-radius: 8px;
141
+ font-weight: 600;
142
+ padding: 0.55rem 1.5rem;
143
+ transition: all 0.2s;
144
+ }
145
+ .stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }
146
+
147
+ .stDownloadButton>button {
148
+ background: #1e293b;
149
+ color: #a5b4fc;
150
+ border: 1px solid rgba(99,102,241,0.35);
151
+ border-radius: 8px;
152
+ font-weight: 500;
153
+ }
154
+ </style>
155
+ """, unsafe_allow_html=True)
156
+
157
+ # ── Helpers ───────────────────────────────────────────────────────────────────
158
# Upload extensions accepted by the file uploader, mapped to the emoji shown
# next to each file. The icons were restored from mis-decoded UTF-8 bytes.
SUPPORTED_EXTENSIONS = {
    "pdf": "📕", "docx": "📘", "doc": "📘", "pptx": "📙", "ppt": "📙",
    "xlsx": "📗", "xls": "📗", "csv": "📊", "html": "🌐", "htm": "🌐",
    "md": "📝", "txt": "📄", "png": "🖼️", "jpg": "🖼️", "jpeg": "🖼️",
    "tiff": "🖼️", "tif": "🖼️", "bmp": "🖼️", "webp": "🖼️",
    "asciidoc": "📃", "adoc": "📃", "xml": "📑", "json": "📋",
}

# Sidebar display name -> export format key (also used as the output extension).
OUTPUT_FORMATS = {
    "Markdown (.md)": "md",
    "HTML (.html)": "html",
    "JSON (.json)": "json",
    "Plain Text (.txt)": "txt",
    "DocTags (.doctags)": "doctags",
}

# Sidebar checkbox caption -> label string handed to DocItemLabel(...).
ELEMENT_LABELS = {
    "Paragraphs / Text": "paragraph",
    "Section Headers": "section_header",
    "Titles": "title",
    "Tables": "table",
    "Figures / Pictures": "picture",
    "Captions": "caption",
    "Footnotes": "footnote",
    "Formulas / Equations": "formula",
    "List Items": "list_item",
    "Code Blocks": "code",
    "Page Headers": "page_header",
    "Page Footers": "page_footer",
    "Key-Value Regions": "key_value_region",
    "Form Elements": "form",
    "Document Index": "document_index",
}
191
+
192
def file_icon(filename: str) -> str:
    """Return the emoji for *filename*'s extension.

    The extension is matched case-insensitively against SUPPORTED_EXTENSIONS;
    unknown or missing extensions get the generic "page" icon (restored here
    from mis-decoded bytes).
    """
    ext = Path(filename).suffix.lstrip(".").lower()
    return SUPPORTED_EXTENSIONS.get(ext, "📄")
195
+
196
def fmt_bytes(n: int) -> str:
    """Format a byte count with one decimal and a binary-scaled unit (B..TB)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = n
    position = 0
    # Step up one unit per factor of 1024; TB is the ceiling and is never divided.
    while position < len(units) - 1 and not value < 1024:
        value = value / 1024
        position += 1
    return f"{value:.1f} {units[position]}"
202
+
203
def fmt_time(s: float) -> str:
    """Format a duration in seconds as ``12.3s`` or, from one minute up, ``2m 5s``."""
    if s < 60:
        return f"{s:.1f}s"
    minutes, seconds = divmod(s, 60)
    return f"{int(minutes)}m {int(seconds)}s"
205
+
206
+ # ── Lazy-load Docling (heavy) ─────────────────────────────────────────────────
207
@st.cache_resource(show_spinner=False)
def _load_docling():
    """Import the Docling symbols the app needs and return them keyed by name.

    The imports live inside the function so they only run when a conversion is
    requested; ``st.cache_resource`` keeps the returned mapping for later calls.
    ``TableFormerMode`` is ``None`` when the installed Docling does not export it.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        TableStructureOptions,
        EasyOcrOptions,
        TesseractCliOcrOptions,
    )
    from docling.datamodel.base_models import InputFormat, ConversionStatus

    # Optional symbol: absent in some Docling releases.
    try:
        from docling.datamodel.pipeline_options import TableFormerMode
    except ImportError:
        TableFormerMode = None

    # Preferred location first, legacy location as fallback.
    try:
        from docling_core.types.doc import ImageRefMode, DocItemLabel
    except ImportError:
        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore

    symbols = dict(
        DocumentConverter=DocumentConverter,
        PdfFormatOption=PdfFormatOption,
        WordFormatOption=WordFormatOption,
        PdfPipelineOptions=PdfPipelineOptions,
        TableStructureOptions=TableStructureOptions,
        EasyOcrOptions=EasyOcrOptions,
        TesseractCliOcrOptions=TesseractCliOcrOptions,
        InputFormat=InputFormat,
        ConversionStatus=ConversionStatus,
        TableFormerMode=TableFormerMode,
        ImageRefMode=ImageRefMode,
        DocItemLabel=DocItemLabel,
    )
    return symbols
240
+
241
+ # ── Sidebar Config ────────────────────────────────────────────────────────────
242
def sidebar() -> Dict[str, Any]:
    """Render the configuration sidebar and return the chosen settings.

    Returns:
        A dict of option values. ``image_mode`` is only present for Markdown
        and HTML output; ``strict_text`` and ``indent`` only for Markdown, so
        consumers should read those keys with ``.get``.

    The widget creation order is significant (it is the on-screen order) and
    matches the original layout; only the mis-decoded emoji/punctuation in the
    labels were repaired and the repeated header markup factored out.
    """
    cfg: Dict[str, Any] = {}

    def section(title: str) -> None:
        # Small-caps divider styled by the .section-header CSS class.
        st.markdown(f'<div class="section-header">{title}</div>', unsafe_allow_html=True)

    # Element types that are rarely wanted start unchecked.
    default_off = ("page_header", "page_footer", "document_index",
                   "key_value_region", "form")

    with st.sidebar:
        st.markdown("## ⚙️ Configuration")

        # ── OCR ──────────────────────────────────────────────────────────────
        section("🔍 OCR Settings")
        cfg["do_ocr"] = st.checkbox(
            "Enable OCR", value=True,
            help="Optical Character Recognition for scanned/image-based content.")
        cfg["force_full_page_ocr"] = st.checkbox(
            "Force full-page OCR", value=False,
            help="Run OCR on every page even if text layer exists.")
        cfg["ocr_engine"] = st.radio(
            "OCR Engine", ["EasyOCR", "Tesseract"],
            horizontal=True,
            help="EasyOCR is pure-Python; Tesseract requires system install.")
        cfg["ocr_languages"] = st.multiselect(
            "OCR Languages",
            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
            default=["en"],
            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")

        # ── Table Extraction ──────────────────────────────────────────────────
        section("📊 Table Extraction")
        cfg["do_table_structure"] = st.checkbox(
            "Extract table structure", value=True,
            help="Use TableFormer model to detect rows/columns/cells in tables.")
        cfg["table_mode"] = st.radio(
            "TableFormer mode",
            ["Accurate (slower)", "Fast (lighter)"],
            index=0, horizontal=True,
            help="Accurate uses the full model; Fast is a smaller/faster variant.")
        cfg["do_cell_matching"] = st.checkbox(
            "Cell text matching", value=True,
            help="Match detected cells back to underlying PDF text for accuracy.")

        # ── Image Handling ────────────────────────────────────────────────────
        section("🖼️ Image & Page Rendering")
        cfg["generate_page_images"] = st.checkbox(
            "Generate page images", value=False,
            help="Rasterise each page as an image (needed for embedded page images in output).")
        cfg["generate_picture_images"] = st.checkbox(
            "Generate picture crops", value=True,
            help="Extract figure/picture regions as cropped images.")
        cfg["images_scale"] = st.slider(
            "Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
            help="Higher = better quality but slower & more memory.")
        cfg["generate_table_images"] = st.checkbox(
            "Generate table images", value=False,
            help="Also rasterise table regions as images.")

        # ── Content Elements ──────────────────────────────────────────────────
        section("📋 Content Elements to Include")
        st.caption("Uncheck elements you want to exclude from the output.")
        selected_labels = []
        for label_name, label_val in ELEMENT_LABELS.items():
            checked = st.checkbox(label_name,
                                  value=label_val not in default_off,
                                  key=f"lbl_{label_val}")
            if checked:
                selected_labels.append(label_val)
        cfg["selected_labels"] = selected_labels

        # ── Output Format ─────────────────────────────────────────────────────
        section("📤 Output Format")
        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))

        # ── Format-specific options ───────────────────────────────────────────
        fmt = OUTPUT_FORMATS[cfg["output_format"]]
        if fmt in ("md", "html"):
            cfg["image_mode"] = st.selectbox(
                "Image handling in output",
                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
                help="How images appear in Markdown / HTML output.")
        if fmt == "md":
            cfg["strict_text"] = st.checkbox(
                "Strict text mode", value=False,
                help="Disable Markdown enrichment; output pure text lines.")
            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)

        # ── PDF-specific ──────────────────────────────────────────────────────
        section("📕 PDF-Specific Options")
        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
        cfg["max_file_mb"] = st.slider(
            "Max file size (MB)", 5, 200, 50,
            help="Files larger than this will be skipped with a warning.")

        st.markdown("---")
        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) · IBM Research")

    return cfg
326
+
327
+ # ── Converter logic ───────────────────────────────────────────────────────────
328
# Tesseract selects languages by traineddata name (e.g. "eng"), not by the
# two-letter codes offered in the sidebar. Unknown codes are passed through.
_TESSERACT_LANG_CODES = {
    "en": "eng", "de": "deu", "fr": "fra", "es": "spa", "it": "ita",
    "pt": "por", "nl": "nld", "ru": "rus", "zh": "chi_sim", "ja": "jpn",
    "ko": "kor", "ar": "ara", "hi": "hin", "pl": "pol", "cs": "ces",
    "ro": "ron", "sv": "swe", "da": "dan", "fi": "fin", "no": "nor",
    "hu": "hun", "tr": "tur",
}


def _model_has_field(model_cls, name: str) -> bool:
    """Tell whether an options class declares *name*.

    Pydantic models do not expose fields as class attributes, so a plain
    ``hasattr`` on the class reports False; ``model_fields`` is checked first.
    """
    declared = getattr(model_cls, "model_fields", None)
    if declared is not None:
        return name in declared
    return hasattr(model_cls, name)


def build_converter(cfg: Dict[str, Any], dl) -> Any:
    """Construct a DocumentConverter from sidebar config.

    Args:
        cfg: Settings dict produced by ``sidebar()``.
        dl: Mapping of Docling class names to classes (see ``_load_docling``).

    Returns:
        A ``DocumentConverter`` with PDF pipeline options applied.

    Fixes relative to the previous version:
      * Tesseract receives a list of traineddata codes instead of a single
        "+"-joined string of two-letter codes.
      * ``force_full_page_ocr`` is set on the OCR options, where Docling
        defines it, not on the PDF pipeline options.
      * The "generate table images" toggle is honoured (the old ``hasattr``
        probe never matched a pydantic field).
      * An empty language selection falls back to English.
    """
    PdfPipelineOptions = dl["PdfPipelineOptions"]
    TableStructureOptions = dl["TableStructureOptions"]
    EasyOcrOptions = dl["EasyOcrOptions"]
    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
    PdfFormatOption = dl["PdfFormatOption"]
    DocumentConverter = dl["DocumentConverter"]
    TableFormerMode = dl["TableFormerMode"]
    InputFormat = dl["InputFormat"]

    # OCR backend
    ocr_options = None
    if cfg["do_ocr"]:
        languages = list(cfg["ocr_languages"]) or ["en"]
        if cfg["ocr_engine"] == "EasyOCR":
            ocr_cls = EasyOcrOptions
        else:
            ocr_cls = TesseractCliOcrOptions
            languages = [_TESSERACT_LANG_CODES.get(code, code) for code in languages]
        ocr_kwargs: Dict[str, Any] = {"lang": languages}
        if cfg["force_full_page_ocr"]:
            ocr_kwargs["force_full_page_ocr"] = True
        ocr_options = ocr_cls(**ocr_kwargs)

    # Table structure
    tbl_kwargs: Dict[str, Any] = {"do_cell_matching": cfg["do_cell_matching"]}
    if TableFormerMode is not None:
        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
                              if "Accurate" in cfg["table_mode"]
                              else TableFormerMode.FAST)
    tbl_opts = TableStructureOptions(**tbl_kwargs)

    # PDF pipeline
    pdf_opts_kwargs: Dict[str, Any] = dict(
        do_ocr=cfg["do_ocr"],
        do_table_structure=cfg["do_table_structure"],
        table_structure_options=tbl_opts,
        generate_page_images=cfg["generate_page_images"],
        generate_picture_images=cfg["generate_picture_images"],
        images_scale=cfg["images_scale"],
    )
    if ocr_options is not None:
        pdf_opts_kwargs["ocr_options"] = ocr_options
    if _model_has_field(PdfPipelineOptions, "generate_table_images"):
        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)

    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)

    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
    }
    return DocumentConverter(format_options=format_options)
379
+
380
+
381
def image_ref_mode(cfg, dl):
    """Translate the sidebar's image-handling choice into an ImageRefMode member.

    Only the "Embedded" and "Referenced" choices map to their own mode; a
    missing or unrecognised choice, "Placeholder comment" and "Omit images" all
    resolve to PLACEHOLDER.
    """
    modes = dl["ImageRefMode"]
    selected = cfg.get("image_mode", "Placeholder comment")
    if selected == "Embedded (base64)":
        return modes.EMBEDDED
    if selected == "Referenced path":
        return modes.REFERENCED
    return modes.PLACEHOLDER
391
+
392
+
393
def resolve_labels(cfg, dl):
    """Turn the selected element-type strings into DocItemLabel members.

    Args:
        cfg: Settings dict; reads ``selected_labels`` and, if present,
            ``image_mode``.
        dl: Mapping that provides the ``DocItemLabel`` enum.

    Returns:
        A list of label members in selection order, or ``None`` when nothing
        usable is selected (callers then export with the library defaults).

    Notes:
        * Selecting "paragraph" also selects "text": Docling labels most body
          text as ``text``, so filtering on ``paragraph`` alone dropped it.
        * The "Omit images" image mode removes the ``picture`` label, which is
          how that option is meant to take effect.
        * Values the installed enum does not know are skipped.
    """
    DocItemLabel = dl["DocItemLabel"]

    wanted = []
    for value in cfg["selected_labels"]:
        wanted.append(value)
        if value == "paragraph":
            wanted.append("text")

    if cfg.get("image_mode") == "Omit images":
        wanted = [value for value in wanted if value != "picture"]

    labels = []
    seen = set()
    for value in wanted:
        if value in seen:
            continue
        seen.add(value)
        try:
            labels.append(DocItemLabel(value))
        except ValueError:
            # Label not defined by this docling-core version; ignore it.
            continue
    return labels if labels else None
404
+
405
+
406
def do_export(doc, cfg, dl) -> str:
    """Export converted document to the chosen format.

    Args:
        doc: The converted Docling document.
        cfg: Settings dict; ``output_format`` selects the exporter and the
            optional ``image_mode`` / ``strict_text`` / ``indent`` keys tune it.
        dl: Mapping of Docling symbols (used to resolve labels and image mode).

    Returns:
        The exported text, or ``""`` for an unknown format key.

    For Markdown, HTML and text the export is first attempted with the
    configured keyword arguments; if the installed Docling rejects one of them
    (``TypeError``) it is retried with the exporter's defaults. DocTags uses
    ``export_to_doctags`` when available, then the older
    ``export_to_document_tokens``, and only then falls back to Markdown.
    """
    fmt = OUTPUT_FORMATS[cfg["output_format"]]

    if fmt == "json":
        return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)

    if fmt == "doctags":
        for method_name in ("export_to_doctags", "export_to_document_tokens"):
            exporter = getattr(doc, method_name, None)
            if exporter is not None:
                return exporter()
        return doc.export_to_markdown()

    method_by_format = {
        "md": "export_to_markdown",
        "html": "export_to_html",
        "txt": "export_to_text",
    }
    if fmt not in method_by_format:
        return ""

    kwargs = {}
    if fmt in ("md", "html"):
        kwargs["image_mode"] = image_ref_mode(cfg, dl)
    if fmt == "md":
        kwargs["strict_text"] = cfg.get("strict_text", False)
        kwargs["indent"] = cfg.get("indent", 4)
    labels = resolve_labels(cfg, dl)
    if labels is not None:
        kwargs["labels"] = labels

    exporter = getattr(doc, method_by_format[fmt])
    try:
        return exporter(**kwargs)
    except TypeError:
        # Fallback: export without unsupported kwargs
        return exporter()
456
+
457
+
458
def _count_tables(doc):
    """Count table items in *doc*; returns "N/A" when it cannot be determined."""
    try:
        if hasattr(doc, "tables"):
            return len(doc.tables)
        # iterate_items() yields (item, level) pairs, so the label lives on
        # element 0; the old code inspected element 1 and always counted zero.
        return sum(
            1 for item, _level in doc.iterate_items()
            if str(getattr(item, "label", "")).endswith("table")
        )
    except (AttributeError, TypeError, ValueError):
        return "N/A"


def _count_attr(doc, attr: str):
    """Return ``len(doc.<attr>)`` or "N/A" if the attribute is missing or unsized."""
    try:
        return len(getattr(doc, attr))
    except (AttributeError, TypeError):
        return "N/A"


def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
    """Run docling on a single file. Returns result dict.

    Args:
        path: Path of the file to convert.
        cfg: Settings dict passed on to ``do_export``.
        converter: Object exposing ``convert(path)``.
        dl: Mapping of Docling symbols; ``ConversionStatus`` is read here.

    Returns:
        On success: ``{"ok": True, "content", "elapsed", "status", "meta"}``
        where ``meta`` has ``pages``, ``tables`` and ``figures`` (each an int
        or "N/A"). On failure: ``{"ok": False, "error", "elapsed"}`` plus
        ``"traceback"`` when an exception was raised. ``elapsed`` measures the
        conversion call only on success. Exceptions are reported, never raised.
    """
    ConversionStatus = dl["ConversionStatus"]
    t0 = time.time()
    try:
        result = converter.convert(path)
        elapsed = time.time() - t0
        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}

        doc = result.document
        exported = do_export(doc, cfg, dl)

        # Metadata is best-effort: a missing attribute must not fail the file.
        meta = {
            "pages": _count_attr(doc, "pages"),
            "tables": _count_tables(doc),
            "figures": _count_attr(doc, "pictures"),
        }

        return {
            "ok": True,
            "content": exported,
            "elapsed": elapsed,
            "status": str(result.status),
            "meta": meta,
        }
    except Exception as e:
        # Boundary handler: one bad file is reported to the UI instead of
        # aborting the whole batch.
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "traceback": traceback.format_exc(),
            "elapsed": time.time() - t0,
        }
502
+
503
+ # ── Main ──────────────────────────────────────────────────────────────────────
504
# Session-state slot that keeps the last batch alive across Streamlit reruns
# (any download-button click reruns the script with the Convert button False).
_RESULTS_STATE_KEY = "docling_last_batch"

# MIME type offered for each export extension on the per-file download button.
_DOWNLOAD_MIME_TYPES = {
    "md": "text/markdown",
    "html": "text/html",
    "json": "application/json",
    "txt": "text/plain",
    "doctags": "text/plain",
}


def _upload_signature(files) -> tuple:
    """Identify an upload selection so results are never shown for other files."""
    return tuple((f.name, f.size) for f in files)


def _assign_output_names(entries: List[Dict[str, Any]], ext: str) -> None:
    """Give each successful entry a unique ``out_name`` (e.g. a.md, a_2.md)."""
    taken = set()
    for entry in entries:
        if not entry["result"]["ok"]:
            continue
        stem = Path(entry["name"]).stem
        candidate = f"{stem}.{ext}"
        serial = 1
        while candidate in taken:
            serial += 1
            candidate = f"{stem}_{serial}.{ext}"
        taken.add(candidate)
        entry["out_name"] = candidate


def _render_hero() -> None:
    """Draw the banner at the top of the page."""
    st.markdown(
        '<div class="hero">'
        '<span class="badge">⚡ Powered by Docling</span>'
        '<span class="badge">🤗 HuggingFace Spaces</span>'
        '<h1>📄 Docling AIO Converter</h1>'
        '<p>Parse &amp; convert any document — PDF, DOCX, PPTX, XLSX, images, HTML and more — '
        'with full control over OCR, tables, figures, and output formatting.</p>'
        '</div>',
        unsafe_allow_html=True,
    )


def _render_selection(uploaded, valid, oversized, max_mb) -> None:
    """List the selected files and warn about the ones over the size limit."""
    badges = f"<span class='tag tag-green'>{len(valid)} ready</span>"
    if oversized:
        badges += f" <span class='tag tag-red'>{len(oversized)} oversized</span>"
    st.markdown(f"**{len(uploaded)} file(s) selected** · {badges}", unsafe_allow_html=True)

    for f in valid[:8]:  # show preview of first 8
        # File names are user-controlled; escape before HTML injection.
        st.markdown(
            f"<div class='file-card'>"
            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{html_lib.escape(f.name)}</span>"
            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
            f"</div>",
            unsafe_allow_html=True,
        )
    if len(valid) > 8:
        st.caption(f"…and {len(valid)-8} more files")
    for f in oversized:
        st.warning(f"⚠️ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")


def _convert_batch(valid, cfg, converter, dl) -> List[Dict[str, Any]]:
    """Convert every file in *valid*; returns one ``{"name", "result"}`` per file.

    Results are kept in a list (not a dict keyed by name) so two uploads with
    the same file name do not overwrite each other.
    """
    entries: List[Dict[str, Any]] = []
    overall_bar = st.progress(0)
    status_area = st.empty()

    with tempfile.TemporaryDirectory() as tmpdir:
        for idx, uf in enumerate(valid):
            fname = uf.name
            status_area.markdown(
                f"<div class='file-card'>"
                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
                f"<span style='flex:1;color:#e2e8f0'>{html_lib.escape(fname)}</span>"
                f"<span class='status-wait'>⏳ converting…</span>"
                f"</div>",
                unsafe_allow_html=True,
            )

            # One subdirectory per file so equal basenames never collide, and
            # only the basename is used so a crafted name cannot leave tmpdir.
            file_subdir = os.path.join(tmpdir, str(idx))
            os.makedirs(file_subdir, exist_ok=True)
            tmp_path = os.path.join(file_subdir, Path(fname).name or f"upload_{idx}")

            # Rewind first: on re-renders the buffer cursor may already be at
            # EOF, which would write a zero-byte file.
            uf.seek(0)
            with open(tmp_path, "wb") as fh:
                fh.write(uf.read())

            result = convert_file(tmp_path, cfg, converter, dl)
            entries.append({"name": fname, "result": result})

            overall_bar.progress((idx + 1) / len(valid))

            if not result["ok"] and cfg.get("abort_on_error"):
                st.error(f"❌ Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
                break

    status_area.empty()
    overall_bar.empty()
    return entries


def _render_summary(entries: List[Dict[str, Any]]) -> int:
    """Draw the metric strip; returns the number of successful conversions."""
    ok_count = sum(1 for e in entries if e["result"]["ok"])
    err_count = len(entries) - ok_count
    total_time = sum(e["result"]["elapsed"] for e in entries)

    st.markdown(
        f"<div class='metric-row'>"
        f"<div class='metric-box'><div class='val'>{len(entries)}</div><div class='lbl'>Files processed</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
        f"</div>",
        unsafe_allow_html=True,
    )
    return ok_count


def _render_preview(content: str, ext: str) -> None:
    """Show a (possibly truncated) preview of one exported document."""
    if ext == "md":
        tab1, tab2 = st.tabs(["📖 Rendered", "📝 Raw Markdown"])
        with tab1:
            # Document content is untrusted: render as Markdown only, without
            # letting embedded raw HTML through.
            st.markdown(content[:15000] + ("\n\n*[truncated for preview…]*"
                                           if len(content) > 15000 else ""))
        with tab2:
            preview_raw = html_lib.escape(content[:12000])
            suffix = "…[truncated]" if len(content) > 12000 else ""
            st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
                        unsafe_allow_html=True)

    elif ext == "html":
        tab1, tab2 = st.tabs(["🌐 Rendered", "📝 HTML Source"])
        with tab1:
            st.components.v1.html(content, height=600, scrolling=True)
        with tab2:
            st.code(content[:10000], language="html")

    elif ext == "json":
        try:
            st.json(json.loads(content), expanded=False)
        except ValueError:
            st.code(content[:10000], language="json")

    else:
        preview_plain = html_lib.escape(content[:12000])
        suffix = "…[truncated]" if len(content) > 12000 else ""
        st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
                    unsafe_allow_html=True)


def _render_entry(idx: int, entry: Dict[str, Any], ext: str, expand: bool) -> None:
    """Draw the expander for one processed file (error details or output)."""
    fname = entry["name"]
    res = entry["result"]
    with st.expander(
        f"{file_icon(fname)} **{fname}** "
        + ("✅" if res["ok"] else "❌")
        + f" · {fmt_time(res['elapsed'])}",
        expanded=expand,
    ):
        if not res["ok"]:
            st.error(f"**Error:** {res['error']}")
            if "traceback" in res:
                with st.expander("📋 Full traceback"):
                    st.code(res["traceback"], language="python")
            return

        # Metadata strip
        meta = res.get("meta", {})
        m_cols = st.columns(4)
        m_cols[0].metric("Pages", meta.get("pages", "—"))
        m_cols[1].metric("Tables", meta.get("tables", "—"))
        m_cols[2].metric("Figures", meta.get("figures", "—"))
        m_cols[3].metric("Time", fmt_time(res["elapsed"]))

        content = res["content"]
        out_name = entry.get("out_name") or (Path(fname).stem + f".{ext}")

        st.download_button(
            f"⬇️ Download {out_name}",
            data=content.encode("utf-8"),
            file_name=out_name,
            mime=_DOWNLOAD_MIME_TYPES.get(ext, "text/plain"),
            key=f"dl_{idx}_{fname}",  # index keeps keys unique for equal names
        )

        st.caption(f"Output: **{len(content):,} chars** · **{len(content.split()):,} words**")
        _render_preview(content, ext)


def _render_results(entries: List[Dict[str, Any]], ext: str) -> None:
    """Draw the summary, the ZIP download and the per-file result panels."""
    ok_count = _render_summary(entries)

    st.markdown("### 📂 Results")

    # Build ZIP in memory
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for entry in entries:
            if entry["result"]["ok"]:
                zf.writestr(entry["out_name"], entry["result"]["content"])

    dl_col1, _ = st.columns([2, 4])
    with dl_col1:
        st.download_button(
            "⬇️ Download All as ZIP",
            data=zip_buf.getvalue(),
            file_name="docling_output.zip",
            mime="application/zip",
            use_container_width=True,
            disabled=ok_count == 0,  # nothing to download if every file failed
        )

    st.markdown("---")

    for idx, entry in enumerate(entries):
        _render_entry(idx, entry, ext, expand=ok_count == 1)


def main():
    """Streamlit entry point: configure, upload, convert and present results.

    Conversion runs only on the rerun triggered by the Convert button; its
    results are stored in ``st.session_state`` so they stay visible on later
    reruns (e.g. after a download click) for as long as the same files are
    selected.
    """
    cfg = sidebar()
    _render_hero()

    # ── Upload area ───────────────────────────────────────────────────────────
    st.markdown("### 📁 Upload Documents")
    max_mb = cfg.get("max_file_mb", 50)
    uploaded = st.file_uploader(
        f"Drag & drop files here · Max {max_mb} MB per file",
        accept_multiple_files=True,
        type=list(SUPPORTED_EXTENSIONS.keys()),
        help="You can upload multiple files at once for batch conversion.",
    )

    if not uploaded:
        st.info("👆 Upload one or more files to get started. "
                "Adjust all settings in the **sidebar** before converting.", icon="ℹ️")
        # Supported formats table
        with st.expander("📋 Supported Input Formats"):
            cols = st.columns(4)
            for i, (ext, icon) in enumerate(SUPPORTED_EXTENSIONS.items()):
                cols[i % 4].markdown(f"{icon} `.{ext}`")
        return

    # ── File list ─────────────────────────────────────────────────────────────
    limit_bytes = max_mb * 1024 * 1024
    oversized = [f for f in uploaded if f.size > limit_bytes]
    valid = [f for f in uploaded if f.size <= limit_bytes]
    _render_selection(uploaded, valid, oversized, max_mb)

    if not valid:
        return

    # ── Convert button ────────────────────────────────────────────────────────
    col_btn, col_fmt, _ = st.columns([2, 2, 4])
    with col_btn:
        run = st.button("🚀 Convert All", use_container_width=True)
    with col_fmt:
        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)

    signature = _upload_signature(valid)

    if run:
        # ── Load Docling ──────────────────────────────────────────────────────
        with st.spinner("Loading Docling models (first run downloads ~1 GB of models)…"):
            try:
                dl = _load_docling()
            except Exception as e:
                st.error(f"Failed to import Docling: {e}\n\n"
                         "Make sure `docling` is installed (`pip install docling`).")
                return

        with st.spinner("Building converter pipeline…"):
            try:
                converter = build_converter(cfg, dl)
            except Exception as e:
                st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
                return

        # ── Process files ─────────────────────────────────────────────────────
        st.markdown("---")
        st.markdown("### ⚙️ Processing")
        entries = _convert_batch(valid, cfg, converter, dl)

        # Remember the format used for this batch: the sidebar may change
        # before the next rerun, but the stored content will not.
        ext = OUTPUT_FORMATS[cfg["output_format"]]
        _assign_output_names(entries, ext)
        st.session_state[_RESULTS_STATE_KEY] = {
            "signature": signature,
            "ext": ext,
            "entries": entries,
        }

    batch = st.session_state.get(_RESULTS_STATE_KEY)
    if not batch or batch["signature"] != signature:
        return

    _render_results(batch["entries"], batch["ext"])


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Core ─────────────────────────────────────────────────────────────────────
2
+ docling>=2.5.0
3
+ docling-core>=2.0.0
4
+
5
+ # ── Streamlit ─────────────────────────────────────────────────────────────────
6
+ streamlit>=1.35.0
7
+
8
+ # ── OCR backends ─────────────────────────────────────────────────────────────
9
+ easyocr>=1.7.0 # Pure-Python OCR (no system deps needed)
10
+ # pytesseract # Tesseract wrapper — uncomment if packages.txt has tesseract
11
+
12
+ # ── Document format support ───────────────────────────────────────────────────
13
+ python-docx>=1.1.0 # DOCX reading/writing
14
+ python-pptx>=0.6.23 # PPTX support
15
+ openpyxl>=3.1.2 # XLSX support
16
+ pandas>=2.0.0 # CSV / tabular
17
+ beautifulsoup4>=4.12.0 # HTML parsing
18
+ lxml>=5.0.0 # XML/HTML backend
19
+
20
+ # ── Image processing ──────────────────────────────────────────────────────────
21
+ Pillow>=10.0.0
22
+ opencv-python-headless>=4.9.0 # headless for server environments
23
+
24
+ # ── PDF ───────────────────────────────────────────────────────────────────────
25
+ pypdfium2>=4.0.0 # Fast PDF rendering backend used by Docling
26
+ pdfminer.six>=20221105
27
+
28
+ # ── ML / model support ────────────────────────────────────────────────────────
29
+ torch>=2.1.0
30
+ torchvision>=0.16.0
31
+ transformers>=4.40.0
32
+ huggingface-hub>=0.20.0
33
+
34
+ # ── Misc utilities ────────────────────────────────────────────────────────────
35
+ requests>=2.31.0
36
+ tqdm>=4.66.0