Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +60 -0
- app.py +762 -0
- requirements.txt +36 -0
Dockerfile
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# ── System deps ──────────────────────────────────────────────────────────────
RUN apt-get update && apt-get install -y --no-install-recommends \
    # OpenCV / image processing
    # (libgl1 replaces libgl1-mesa-glx, which has no install candidate on the
    #  Debian releases that python:3.10-slim is currently built on)
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
    # Fonts for document rendering
    fonts-liberation \
    fonts-dejavu-core \
    # General utilities
    wget \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# ── Working directory ────────────────────────────────────────────────────────
WORKDIR /app

# ── Python deps (cached layer - only re-runs when requirements.txt changes) ──
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# ── App source ───────────────────────────────────────────────────────────────
COPY app.py .

# ── HF Spaces runs as a non-root user; make cache dirs writable ──────────────
RUN mkdir -p /app/.cache /app/tmp \
    && chmod -R 777 /app/.cache /app/tmp

# Tell HuggingFace / torch / transformers to use our writable cache dir
ENV HF_HOME=/app/.cache/huggingface
ENV TORCH_HOME=/app/.cache/torch
ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
ENV TMPDIR=/app/tmp

# ── Port (HF Spaces expects 7860) ────────────────────────────────────────────
EXPOSE 7860

# ── Launch - ALL server flags as explicit CLI args ───────────────────────────
# This is the only approach that cannot be silently overridden by HF's runner.
# config.toml is NOT used here so there is no ambiguity.
CMD ["streamlit", "run", "app.py", \
     "--server.headless=true", \
     "--server.port=7860", \
     "--server.address=0.0.0.0", \
     "--server.enableCORS=false", \
     "--server.enableXsrfProtection=false", \
     "--server.maxUploadSize=200", \
     "--server.fileWatcherType=none", \
     "--browser.gatherUsageStats=false", \
     "--theme.primaryColor=#6366f1", \
     "--theme.backgroundColor=#0a0e1a", \
     "--theme.secondaryBackgroundColor=#0f172a", \
     "--theme.textColor=#e2e8f0"]
|
app.py
ADDED
|
@@ -0,0 +1,762 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import zipfile
|
| 6 |
+
import io
|
| 7 |
+
import time
|
| 8 |
+
import traceback
|
| 9 |
+
import html as html_lib # stdlib β used to escape doc content before unsafe_allow_html injection
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, List, Dict, Any
|
| 12 |
+
|
| 13 |
+
# ── Page config ──────────────────────────────────────────────────────────────
# Configure the browser tab (title, icon) and the page layout.
# NOTE(review): the page_icon string looks mis-encoded in this copy of the
# file - confirm the intended emoji.
st.set_page_config(
    page_title="Docling AIO Converter",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── CSS ──────────────────────────────────────────────────────────────────────
# Inject one global <style> block. The classes defined here (.hero,
# .section-header, .file-card, .result-box, .metric-row/.metric-box, .tag*)
# are referenced by raw-HTML st.markdown(..., unsafe_allow_html=True) calls
# elsewhere in this file; the remaining selectors restyle the sidebar and
# Streamlit's own button widgets.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

html, body, [class*="css"] { font-family: 'Inter', sans-serif; }

.hero {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
border: 1px solid rgba(255,255,255,0.08);
border-radius: 16px;
padding: 2.5rem 2rem;
margin-bottom: 2rem;
position: relative;
overflow: hidden;
}
.hero::before {
content: '';
position: absolute;
top: -50%;
right: -20%;
width: 400px;
height: 400px;
background: radial-gradient(circle, rgba(99,102,241,0.15) 0%, transparent 70%);
border-radius: 50%;
}
.hero h1 { color: #fff; font-size: 2.2rem; font-weight: 700; margin: 0 0 0.5rem; }
.hero p { color: rgba(255,255,255,0.65); font-size: 1.05rem; margin: 0; }
.hero .badge {
display: inline-flex; align-items: center; gap: 6px;
background: rgba(99,102,241,0.25);
border: 1px solid rgba(99,102,241,0.5);
color: #a5b4fc;
border-radius: 20px;
padding: 3px 12px;
font-size: 0.78rem;
font-weight: 600;
margin-right: 8px;
margin-bottom: 1rem;
}

.section-header {
color: #6366f1;
font-size: 0.7rem;
font-weight: 700;
letter-spacing: 0.12em;
text-transform: uppercase;
margin: 1.4rem 0 0.6rem;
padding-bottom: 4px;
border-bottom: 1px solid rgba(99,102,241,0.2);
}

.file-card {
background: #0f172a;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 0.9rem 1.1rem;
margin-bottom: 0.5rem;
display: flex;
align-items: center;
gap: 12px;
}
.file-card .status-ok { color: #4ade80; }
.file-card .status-err { color: #f87171; }
.file-card .status-wait { color: #94a3b8; }

.result-box {
background: #0d1117;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 1.2rem;
font-size: 0.85rem;
color: #e2e8f0;
max-height: 520px;
overflow-y: auto;
white-space: pre-wrap;
font-family: 'JetBrains Mono', 'Fira Code', monospace;
line-height: 1.6;
}

.metric-row {
display: flex;
gap: 1rem;
margin-bottom: 1.2rem;
flex-wrap: wrap;
}
.metric-box {
flex: 1;
min-width: 100px;
background: #0f172a;
border: 1px solid rgba(255,255,255,0.07);
border-radius: 10px;
padding: 0.8rem 1rem;
text-align: center;
}
.metric-box .val { font-size: 1.6rem; font-weight: 700; color: #a5b4fc; }
.metric-box .lbl { font-size: 0.75rem; color: #64748b; margin-top: 2px; }

.tag {
display: inline-block;
background: rgba(99,102,241,0.15);
color: #a5b4fc;
border-radius: 4px;
padding: 2px 8px;
font-size: 0.72rem;
font-weight: 600;
margin: 2px;
}
.tag-green { background: rgba(74,222,128,0.12); color: #4ade80; }
.tag-red { background: rgba(248,113,113,0.12); color: #f87171; }
.tag-yellow { background: rgba(251,191,36,0.12); color: #fbbf24; }

[data-testid="stSidebar"] { background: #0a0e1a; }
[data-testid="stSidebar"] .block-container { padding-top: 1rem; }

.stButton>button {
background: linear-gradient(135deg, #6366f1, #8b5cf6);
color: white;
border: none;
border-radius: 8px;
font-weight: 600;
padding: 0.55rem 1.5rem;
transition: all 0.2s;
}
.stButton>button:hover { opacity: 0.88; transform: translateY(-1px); }

.stDownloadButton>button {
background: #1e293b;
color: #a5b4fc;
border: 1px solid rgba(99,102,241,0.35);
border-radius: 8px;
font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
|
| 156 |
+
|
| 157 |
+
# ── Helpers ──────────────────────────────────────────────────────────────────
# Lower-case file extension (no leading dot) -> icon string shown next to it.
# The keys also serve as the list of accepted upload types.
# NOTE(review): the icon strings look mis-encoded (mojibake) in this copy of
# the file - confirm the intended emoji before relying on them.
SUPPORTED_EXTENSIONS = {
    "pdf": "π", "docx": "π", "doc": "π", "pptx": "π", "ppt": "π",
    "xlsx": "π", "xls": "π", "csv": "π", "html": "π", "htm": "π",
    "md": "π", "txt": "π", "png": "πΌοΈ", "jpg": "πΌοΈ", "jpeg": "πΌοΈ",
    "tiff": "οΏ½οΏ½οΈ", "tif": "πΌοΈ", "bmp": "πΌοΈ", "webp": "πΌοΈ",
    "asciidoc": "π", "adoc": "π", "xml": "π", "json": "π",
}

# Human-readable output choice (shown in the "Convert to" select box) ->
# short format key that the export code branches on.
OUTPUT_FORMATS = {
    "Markdown (.md)": "md",
    "HTML (.html)": "html",
    "JSON (.json)": "json",
    "Plain Text (.txt)": "txt",
    "DocTags (.doctags)":"doctags",
}

# Sidebar checkbox caption -> label value string. The values are later passed
# to the DocItemLabel constructor, so each is presumably the value of a
# DocItemLabel member - verify against the installed docling-core version.
ELEMENT_LABELS = {
    "Paragraphs / Text": "paragraph",
    "Section Headers": "section_header",
    "Titles": "title",
    "Tables": "table",
    "Figures / Pictures": "picture",
    "Captions": "caption",
    "Footnotes": "footnote",
    "Formulas / Equations":"formula",
    "List Items": "list_item",
    "Code Blocks": "code",
    "Page Headers": "page_header",
    "Page Footers": "page_footer",
    "Key-Value Regions": "key_value_region",
    "Form Elements": "form",
    "Document Index": "document_index",
}
|
| 191 |
+
|
| 192 |
+
def file_icon(filename: str) -> str:
    """Return the icon registered for the extension of *filename*.

    The extension is the final suffix of the name with its leading dot
    removed, compared case-insensitively against ``SUPPORTED_EXTENSIONS``.
    Names with an unknown (or no) extension get a generic fallback icon.
    """
    suffix = Path(filename).suffix
    extension = suffix.lstrip(".").lower()
    if extension in SUPPORTED_EXTENSIONS:
        return SUPPORTED_EXTENSIONS[extension]
    return "π"
|
| 195 |
+
|
| 196 |
+
def fmt_bytes(n: int) -> str:
    """Format a byte count with one decimal place and a binary unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` -> ``"1.5 KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    size = n
    step = 0
    # At most four divisions: anything still >= 1024 after that stays in TB.
    while step < 4 and not size < 1024:
        size = size / 1024
        step += 1
    return f"{size:.1f} {units[step]}"
|
| 202 |
+
|
| 203 |
+
def fmt_time(s: float) -> str:
    """Format a duration in seconds for display.

    Durations that display as less than a minute are shown with one decimal
    place (``"12.3s"``); longer ones as whole minutes and seconds
    (``"2m 5s"``).

    The threshold is applied to the *rounded* value, so a duration such as
    59.96 s is reported as ``"1m 0s"`` rather than the misleading ``"60.0s"``.
    """
    if round(s, 1) < 60:
        return f"{s:.1f}s"
    # max() covers values just under 60 that rounded up to a full minute.
    minutes, seconds = divmod(int(max(s, 60)), 60)
    return f"{minutes}m {seconds}s"
|
| 205 |
+
|
| 206 |
+
# ── Lazy-load Docling (heavy) ────────────────────────────────────────────────
|
| 207 |
+
@st.cache_resource(show_spinner=False)
def _load_docling():
    """Import the Docling classes this app uses and return them keyed by name.

    The imports live inside the function so that they only run when it is
    first called; ``st.cache_resource`` then reuses the returned dict.
    ``TableFormerMode`` is ``None`` when the installed Docling does not export
    it, and ``ImageRefMode`` / ``DocItemLabel`` are taken from ``docling_core``
    when available, otherwise from ``docling.datamodel.base_models``.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
    from docling.datamodel.pipeline_options import (
        PdfPipelineOptions,
        TableStructureOptions,
        EasyOcrOptions,
        TesseractCliOcrOptions,
    )
    from docling.datamodel.base_models import InputFormat, ConversionStatus

    # Optional: not every Docling release exposes the TableFormer mode enum.
    try:
        from docling.datamodel.pipeline_options import TableFormerMode
    except ImportError:
        TableFormerMode = None

    # These two types may live in either package depending on the version.
    try:
        from docling_core.types.doc import ImageRefMode, DocItemLabel
    except ImportError:
        from docling.datamodel.base_models import ImageRefMode, DocItemLabel  # type: ignore

    return dict(
        DocumentConverter=DocumentConverter,
        PdfFormatOption=PdfFormatOption,
        WordFormatOption=WordFormatOption,
        PdfPipelineOptions=PdfPipelineOptions,
        TableStructureOptions=TableStructureOptions,
        EasyOcrOptions=EasyOcrOptions,
        TesseractCliOcrOptions=TesseractCliOcrOptions,
        InputFormat=InputFormat,
        ConversionStatus=ConversionStatus,
        TableFormerMode=TableFormerMode,
        ImageRefMode=ImageRefMode,
        DocItemLabel=DocItemLabel,
    )
|
| 240 |
+
|
| 241 |
+
# ── Sidebar Config ───────────────────────────────────────────────────────────
|
| 242 |
+
def sidebar() -> Dict[str, Any]:
    """Render the configuration sidebar and return the chosen settings.

    Each widget's current value is stored in a plain dict under the key used
    by the converter/export code (``do_ocr``, ``ocr_engine``, ``table_mode``,
    ``selected_labels``, ``output_format`` ...). The keys ``image_mode``,
    ``strict_text`` and ``indent`` are only present when the selected output
    format shows the corresponding widgets, so readers of the dict should use
    ``cfg.get(...)`` for them.

    Returns:
        Dict mapping setting name to the value currently selected in the UI.
    """
    cfg: Dict[str, Any] = {}

    # Widgets are created top to bottom; statement order is the on-screen order.
    with st.sidebar:
        st.markdown("## βοΈ Configuration")

        # ── OCR ──────────────────────────────────────────────────────────────
        st.markdown('<div class="section-header">π OCR Settings</div>', unsafe_allow_html=True)
        cfg["do_ocr"] = st.checkbox("Enable OCR", value=True,
            help="Optical Character Recognition for scanned/image-based content.")
        cfg["force_full_page_ocr"] = st.checkbox("Force full-page OCR", value=False,
            help="Run OCR on every page even if text layer exists.")
        cfg["ocr_engine"] = st.radio("OCR Engine", ["EasyOCR", "Tesseract"],
            horizontal=True,
            help="EasyOCR is pure-Python; Tesseract requires system install.")
        # NOTE(review): these are two-letter codes; confirm each one is a code
        # the selected OCR engine actually accepts.
        cfg["ocr_languages"] = st.multiselect(
            "OCR Languages",
            ["en", "de", "fr", "es", "it", "pt", "nl", "ru", "zh", "ja", "ko",
             "ar", "hi", "pl", "cs", "ro", "sv", "da", "fi", "no", "hu", "tr"],
            default=["en"],
            help="Languages for OCR. EasyOCR supports all; Tesseract needs packs installed.")

        # ── Table Extraction ─────────────────────────────────────────────────
        st.markdown('<div class="section-header">π Table Extraction</div>', unsafe_allow_html=True)
        cfg["do_table_structure"] = st.checkbox("Extract table structure", value=True,
            help="Use TableFormer model to detect rows/columns/cells in tables.")
        # The stored value is the full radio caption; consumers test for the
        # substring "Accurate".
        cfg["table_mode"] = st.radio("TableFormer mode",
            ["Accurate (slower)", "Fast (lighter)"],
            index=0, horizontal=True,
            help="Accurate uses the full model; Fast is a smaller/faster variant.")
        cfg["do_cell_matching"] = st.checkbox("Cell text matching", value=True,
            help="Match detected cells back to underlying PDF text for accuracy.")

        # ── Image Handling ───────────────────────────────────────────────────
        st.markdown('<div class="section-header">πΌοΈ Image & Page Rendering</div>', unsafe_allow_html=True)
        cfg["generate_page_images"] = st.checkbox("Generate page images", value=False,
            help="Rasterise each page as an image (needed for embedded page images in output).")
        cfg["generate_picture_images"] = st.checkbox("Generate picture crops", value=True,
            help="Extract figure/picture regions as cropped images.")
        cfg["images_scale"] = st.slider("Rendering scale (DPI multiplier)", 1.0, 4.0, 2.0, 0.5,
            help="Higher = better quality but slower & more memory.")
        cfg["generate_table_images"] = st.checkbox("Generate table images", value=False,
            help="Also rasterise table regions as images.")

        # ── Content Elements ─────────────────────────────────────────────────
        st.markdown('<div class="section-header">π Content Elements to Include</div>', unsafe_allow_html=True)
        st.caption("Uncheck elements you want to exclude from the output.")
        # One checkbox per ELEMENT_LABELS entry; collect the values left ticked.
        selected_labels = []
        for label_name, label_val in ELEMENT_LABELS.items():
            default = True
            # default off for things rarely needed
            if label_val in ("page_header", "page_footer", "document_index",
                             "key_value_region", "form"):
                default = False
            # Explicit key keeps each checkbox's widget identity tied to its label value.
            if st.checkbox(label_name, value=default, key=f"lbl_{label_val}"):
                selected_labels.append(label_val)
        cfg["selected_labels"] = selected_labels

        # ── Output Format ────────────────────────────────────────────────────
        st.markdown('<div class="section-header">π€ Output Format</div>', unsafe_allow_html=True)
        cfg["output_format"] = st.selectbox("Convert to", list(OUTPUT_FORMATS.keys()))

        # ── Format-specific options ──────────────────────────────────────────
        # These widgets (and their cfg keys) exist only for the formats below.
        fmt = OUTPUT_FORMATS[cfg["output_format"]]
        if fmt in ("md", "html"):
            cfg["image_mode"] = st.selectbox(
                "Image handling in output",
                ["Placeholder comment", "Embedded (base64)", "Referenced path", "Omit images"],
                help="How images appear in Markdown / HTML output.")
        if fmt == "md":
            cfg["strict_text"] = st.checkbox("Strict text mode", value=False,
                help="Disable Markdown enrichment; output pure text lines.")
            cfg["indent"] = st.slider("List indent (spaces)", 2, 8, 4, 2)

        # ── PDF-specific ─────────────────────────────────────────────────────
        # NOTE(review): despite the heading, the two settings below read as
        # batch-level options rather than PDF-only ones - confirm the grouping.
        st.markdown('<div class="section-header">π PDF-Specific Options</div>', unsafe_allow_html=True)
        cfg["abort_on_error"] = st.checkbox("Abort batch on first error", value=False)
        cfg["max_file_mb"] = st.slider("Max file size (MB)", 5, 200, 50,
            help="Files larger than this will be skipped with a warning.")

        st.markdown("---")
        st.caption("Powered by [Docling](https://github.com/DS4SD/docling) Β· IBM Research")

    return cfg
|
| 326 |
+
|
| 327 |
+
# ── Converter logic ──────────────────────────────────────────────────────────
|
| 328 |
+
def build_converter(cfg: Dict[str, Any], dl) -> Any:
    """Construct a DocumentConverter from sidebar config.

    Args:
        cfg: Settings dict produced by the sidebar. Reads ``do_ocr``,
            ``ocr_engine``, ``ocr_languages``, ``force_full_page_ocr``,
            ``do_cell_matching``, ``table_mode``, ``do_table_structure``,
            ``generate_page_images``, ``generate_picture_images``,
            ``images_scale`` and (optionally) ``generate_table_images``.
        dl: Mapping of Docling class names to the classes themselves.

    Returns:
        A ``DocumentConverter`` whose PDF pipeline reflects ``cfg``.
    """
    PdfPipelineOptions = dl["PdfPipelineOptions"]
    TableStructureOptions = dl["TableStructureOptions"]
    EasyOcrOptions = dl["EasyOcrOptions"]
    TesseractCliOcrOptions = dl["TesseractCliOcrOptions"]
    PdfFormatOption = dl["PdfFormatOption"]
    DocumentConverter = dl["DocumentConverter"]
    TableFormerMode = dl["TableFormerMode"]
    InputFormat = dl["InputFormat"]

    # The sidebar offers two-letter codes, but Tesseract language packs are
    # named with its own (mostly three-letter) codes.
    tesseract_codes = {
        "en": "eng", "de": "deu", "fr": "fra", "es": "spa", "it": "ita",
        "pt": "por", "nl": "nld", "ru": "rus", "zh": "chi_sim", "ja": "jpn",
        "ko": "kor", "ar": "ara", "hi": "hin", "pl": "pol", "cs": "ces",
        "ro": "ron", "sv": "swe", "da": "dan", "fi": "fin", "no": "nor",
        "hu": "hun", "tr": "tur",
    }

    # OCR backend
    ocr_options = None
    if cfg["do_ocr"]:
        # An emptied multiselect must not reach the engine as "no languages".
        languages = list(cfg["ocr_languages"]) or ["en"]
        ocr_kwargs: Dict[str, Any] = {}
        # force_full_page_ocr is an option of the OCR engine settings, not of
        # the PDF pipeline, so it is set here rather than on PdfPipelineOptions.
        if cfg.get("force_full_page_ocr", False):
            ocr_kwargs["force_full_page_ocr"] = True
        if cfg["ocr_engine"] == "EasyOCR":
            ocr_options = EasyOcrOptions(lang=languages, **ocr_kwargs)
        else:
            # Docling expects a list of language codes (it builds the
            # "+"-joined CLI argument itself), not a pre-joined string.
            ocr_options = TesseractCliOcrOptions(
                lang=[tesseract_codes.get(code, code) for code in languages],
                **ocr_kwargs,
            )

    # Table structure
    tbl_kwargs = {"do_cell_matching": cfg["do_cell_matching"]}
    if TableFormerMode is not None:
        tbl_kwargs["mode"] = (TableFormerMode.ACCURATE
                              if "Accurate" in cfg["table_mode"]
                              else TableFormerMode.FAST)
    tbl_opts = TableStructureOptions(**tbl_kwargs)

    # PDF pipeline
    pdf_opts_kwargs = dict(
        do_ocr=cfg["do_ocr"],
        do_table_structure=cfg["do_table_structure"],
        table_structure_options=tbl_opts,
        generate_page_images=cfg["generate_page_images"],
        generate_picture_images=cfg["generate_picture_images"],
        images_scale=cfg["images_scale"],
    )
    if ocr_options is not None:
        pdf_opts_kwargs["ocr_options"] = ocr_options

    # Pydantic v2 models do not expose fields as class attributes, so a plain
    # hasattr() on the class never sees them; consult model_fields first.
    pipeline_fields = getattr(PdfPipelineOptions, "model_fields", None) or {}
    if ("generate_table_images" in pipeline_fields
            or hasattr(PdfPipelineOptions, "generate_table_images")):
        pdf_opts_kwargs["generate_table_images"] = cfg.get("generate_table_images", False)

    pdf_pipeline_opts = PdfPipelineOptions(**pdf_opts_kwargs)

    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_opts),
    }

    return DocumentConverter(format_options=format_options)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def image_ref_mode(cfg, dl):
    """Translate the sidebar's image-handling choice into an ``ImageRefMode``.

    Reads ``cfg["image_mode"]`` (treated as "Placeholder comment" when the key
    is absent) and returns the matching member of ``dl["ImageRefMode"]``.
    Unrecognised choices fall back to ``PLACEHOLDER``.

    NOTE(review): "Omit images" also resolves to ``PLACEHOLDER`` here; nothing
    in this function removes images, so omission has to happen elsewhere.
    """
    modes = dl["ImageRefMode"]
    # Resolve all members up front so a missing one fails regardless of choice.
    placeholder, embedded, referenced = modes.PLACEHOLDER, modes.EMBEDDED, modes.REFERENCED

    selection = cfg.get("image_mode", "Placeholder comment")
    if selection == "Embedded (base64)":
        return embedded
    if selection == "Referenced path":
        return referenced
    # "Placeholder comment", "Omit images" and anything unknown.
    return placeholder
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def resolve_labels(cfg, dl):
    """Convert the selected label strings into ``DocItemLabel`` members.

    Args:
        cfg: Settings dict; reads ``selected_labels`` and, when present,
            ``image_mode``.
        dl: Mapping that provides the ``DocItemLabel`` enum.

    Returns:
        A list of label members in selection order without duplicates, or
        ``None`` when nothing resolves (callers then omit the ``labels``
        argument and get the exporter's defaults).

    Notes:
        * The "Paragraphs / Text" checkbox carries the value ``paragraph``;
          it is expanded to ``text`` as well, so that body text labelled
          ``text`` is not dropped from the export.
        * When ``image_mode`` is "Omit images" the ``picture`` label is left
          out, which is what makes that choice take effect.
        * Values the installed ``DocItemLabel`` does not define are skipped.
    """
    DocItemLabel = dl["DocItemLabel"]
    # One UI entry may stand for several document labels.
    aliases = {"paragraph": ("paragraph", "text")}
    omit_images = cfg.get("image_mode") == "Omit images"

    labels = []
    for selected in cfg["selected_labels"]:
        if omit_images and selected == "picture":
            continue
        for value in aliases.get(selected, (selected,)):
            try:
                label = DocItemLabel(value)
            except ValueError:
                # Label not known to this docling-core version: best effort.
                continue
            if label not in labels:
                labels.append(label)
    return labels or None
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def do_export(doc, cfg, dl) -> str:
    """Export converted document to the chosen format.

    Args:
        doc: The converted Docling document.
        cfg: Settings dict; reads ``output_format`` plus the optional
            ``image_mode``, ``strict_text`` and ``indent`` keys.
        dl: Mapping of Docling class names to classes (used to resolve the
            image mode and the label filter).

    Returns:
        The exported document as a string, or ``""`` for an unknown format.

    Each exporter is first called with the configured keyword arguments. If
    that call raises ``TypeError`` (an exporter that does not accept one of
    them), the same exporter is retried without arguments, so the output
    format never changes because of a fallback.
    """
    fmt = OUTPUT_FORMATS[cfg["output_format"]]
    labels = resolve_labels(cfg, dl)
    label_kwargs = {} if labels is None else {"labels": labels}

    def _export(method, **kwargs):
        # Retry bare only when there was something to drop; otherwise the
        # TypeError is a genuine failure and must surface.
        try:
            return method(**kwargs)
        except TypeError:
            if not kwargs:
                raise
            return method()

    if fmt == "md":
        return _export(
            doc.export_to_markdown,
            image_mode=image_ref_mode(cfg, dl),
            strict_text=cfg.get("strict_text", False),
            indent=cfg.get("indent", 4),
            **label_kwargs,
        )

    if fmt == "html":
        return _export(
            doc.export_to_html,
            image_mode=image_ref_mode(cfg, dl),
            **label_kwargs,
        )

    if fmt == "json":
        return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False)

    if fmt == "txt":
        return _export(doc.export_to_text, **label_kwargs)

    if fmt == "doctags":
        # Use whichever DocTags exporter this document type provides; fall
        # back to Markdown when it has neither.
        exporter = (getattr(doc, "export_to_document_tokens", None)
                    or getattr(doc, "export_to_doctags", None))
        if exporter is None:
            return doc.export_to_markdown()
        return exporter()

    return ""
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def convert_file(path: str, cfg: Dict[str, Any], converter, dl) -> Dict[str, Any]:
    """Run docling on a single file. Returns result dict.

    Args:
        path: Path of the file to convert.
        cfg: Settings dict, forwarded to the export step.
        converter: Object whose ``convert(path)`` performs the conversion.
        dl: Mapping of Docling class names to classes.

    Returns:
        On success: ``{"ok": True, "content", "elapsed", "status", "meta"}``
        where ``meta`` holds ``pages``, ``tables`` and ``figures`` counts
        (``"N/A"`` when a count cannot be determined).
        On failure: ``{"ok": False, "error", "elapsed"}`` plus ``traceback``
        when an exception was raised. This function never raises; errors are
        reported through the returned dict so a batch can continue.
    """
    ConversionStatus = dl["ConversionStatus"]
    # perf_counter is monotonic, so elapsed can never come out negative.
    started = time.perf_counter()

    def _probe(getter):
        # Metadata is best-effort: any failure degrades to "N/A".
        try:
            return getter()
        except Exception:
            return "N/A"

    def _count_tables(doc):
        tables = getattr(doc, "tables", None)
        if tables is not None:
            return len(tables)
        # iterate_items() yields (item, level) pairs; the label is on the item.
        return sum(
            1 for item, _level in doc.iterate_items()
            if str(getattr(item, "label", "")).endswith("table")
        )

    try:
        result = converter.convert(path)
        elapsed = time.perf_counter() - started
        if result.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            return {"ok": False, "error": f"Conversion failed: {result.status}", "elapsed": elapsed}

        doc = result.document
        exported = do_export(doc, cfg, dl)

        meta = {
            "pages": _probe(lambda: len(doc.pages) if hasattr(doc, "pages") else "N/A"),
            "tables": _probe(lambda: _count_tables(doc)),
            "figures": _probe(lambda: len(doc.pictures) if hasattr(doc, "pictures") else "N/A"),
        }

        return {
            "ok": True,
            "content": exported,
            "elapsed": elapsed,
            "status": str(result.status),
            "meta": meta,
        }
    except Exception as e:  # UI boundary: report the failure instead of crashing the batch
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "traceback": traceback.format_exc(),
            "elapsed": time.perf_counter() - started,
        }
|
| 502 |
+
|
| 503 |
+
# ── Main ─────────────────────────────────────────────────────────────────────
|
| 504 |
+
def main():
    """Render the Streamlit page: upload, convert with Docling, show results.

    Flow: read settings from ``sidebar()``; collect uploads and split them by
    the configured size limit; on "Convert All" load Docling, build the
    converter, convert each valid file from a private temp directory; then
    show summary metrics, a ZIP of all successful outputs, and a per-file
    panel with metadata, a download button and a format-specific preview.

    NOTE(review): this copy of the file went through a lossy re-encoding.
    Glyphs whose surviving bytes identify them were restored; glyphs reduced
    to a bare ``π`` / ``β`` could not be recovered and are reproduced as found.
    Restore them from the original source.
    """
    cfg = sidebar()

    # Hero
    st.markdown(
        "\n"
        '<div class="hero">\n'
        '<span class="badge">⚡ Powered by Docling</span>\n'
        '<span class="badge">🤗 HuggingFace Spaces</span>\n'
        "<h1>π Docling AIO Converter</h1>\n"
        "<p>Parse & convert any document β PDF, DOCX, PPTX, XLSX, images, HTML and more β\n"
        "with full control over OCR, tables, figures, and output formatting.</p>\n"
        "</div>\n",
        unsafe_allow_html=True,
    )

    # ── Upload area ──────────────────────────────────────────────────────────
    st.markdown("### π Upload Documents")
    max_mb = cfg.get("max_file_mb", 50)
    uploaded = st.file_uploader(
        f"Drag & drop files here · Max {max_mb} MB per file",
        accept_multiple_files=True,
        type=list(SUPPORTED_EXTENSIONS.keys()),
        help="You can upload multiple files at once for batch conversion.",
    )

    if not uploaded:
        # `icon` must be a real emoji; Streamlit validates it.
        st.info("π Upload one or more files to get started. "
                "Adjust all settings in the **sidebar** before converting.", icon="ℹ️")
        # Supported formats table
        with st.expander("π Supported Input Formats"):
            cols = st.columns(4)
            for i, (fmt_ext, fmt_icon) in enumerate(SUPPORTED_EXTENSIONS.items()):
                cols[i % 4].markdown(f"{fmt_icon} `.{fmt_ext}`")
        return

    # ── File list ────────────────────────────────────────────────────────────
    limit_bytes = max_mb * 1024 * 1024
    oversized = [f for f in uploaded if f.size > limit_bytes]
    valid = [f for f in uploaded if f.size <= limit_bytes]

    st.markdown(f"**{len(uploaded)} file(s) selected** · "
                f"<span class='tag tag-green'>{len(valid)} ready</span>"
                + (f" <span class='tag tag-red'>{len(oversized)} oversized</span>"
                   if oversized else ""),
                unsafe_allow_html=True)

    for f in valid[:8]:  # show preview of first 8
        # File names are user-controlled and go into raw HTML: escape them,
        # exactly as the content previews below already do.
        st.markdown(
            f"<div class='file-card'>"
            f"<span style='font-size:1.3rem'>{file_icon(f.name)}</span>"
            f"<span style='flex:1;font-weight:500;color:#e2e8f0'>{html_lib.escape(f.name)}</span>"
            f"<span style='color:#64748b;font-size:0.82rem'>{fmt_bytes(f.size)}</span>"
            f"</div>",
            unsafe_allow_html=True,
        )
    if len(valid) > 8:
        st.caption(f"…and {len(valid)-8} more files")
    for f in oversized:
        st.warning(f"⚠️ **{f.name}** ({fmt_bytes(f.size)}) exceeds the {max_mb} MB limit and will be skipped.")

    if not valid:
        return

    # ── Convert button ───────────────────────────────────────────────────────
    col_btn, col_fmt, _ = st.columns([2, 2, 4])
    with col_btn:
        run = st.button("π Convert All", use_container_width=True)
    with col_fmt:
        st.markdown(f"<br><span class='tag'>{cfg['output_format']}</span>", unsafe_allow_html=True)

    if not run:
        return

    # ── Load Docling ─────────────────────────────────────────────────────────
    with st.spinner("Loading Docling models (first run downloads ~1 GB of models)…"):
        try:
            dl = _load_docling()
        except Exception as e:
            st.error(f"Failed to import Docling: {e}\n\n"
                     "Make sure `docling` is installed (`pip install docling`).")
            return

    with st.spinner("Building converter pipeline…"):
        try:
            converter = build_converter(cfg, dl)
        except Exception as e:
            st.error(f"Could not build converter: {e}\n```\n{traceback.format_exc()}\n```")
            return

    # ── Process files ────────────────────────────────────────────────────────
    st.markdown("---")
    st.markdown("### βοΈ Processing")

    # Ordered (filename, result) pairs. A dict keyed by filename would let two
    # uploads that share a name silently overwrite each other's result.
    results = []
    overall_bar = st.progress(0)
    status_area = st.empty()

    with tempfile.TemporaryDirectory() as tmpdir:
        for idx, uf in enumerate(valid):
            fname = uf.name
            status_area.markdown(
                f"<div class='file-card'>"
                f"<span style='font-size:1.2rem'>{file_icon(fname)}</span>"
                f"<span style='flex:1;color:#e2e8f0'>{html_lib.escape(fname)}</span>"
                f"<span class='status-wait'>⏳ converting…</span>"
                f"</div>",
                unsafe_allow_html=True,
            )

            # One subdirectory per upload so equal basenames never collide on
            # disk. Only the basename is used, so a crafted name containing
            # directory components cannot escape the temp directory.
            file_subdir = os.path.join(tmpdir, str(idx))
            os.makedirs(file_subdir, exist_ok=True)
            safe_name = os.path.basename(fname.replace("\\", "/")) or f"upload_{idx}"
            tmp_path = os.path.join(file_subdir, safe_name)

            # Rewind first: on a Streamlit re-render the upload buffer's
            # cursor is already at EOF and read() would return b"".
            uf.seek(0)
            with open(tmp_path, "wb") as fh:
                fh.write(uf.read())

            result = convert_file(tmp_path, cfg, converter, dl)
            results.append((fname, result))

            overall_bar.progress((idx + 1) / len(valid))

            if not result["ok"] and cfg.get("abort_on_error"):
                st.error(f"β Aborted after error on **{fname}**:\n```\n{result['error']}\n```")
                break

    status_area.empty()
    overall_bar.empty()

    # ── Summary metrics ──────────────────────────────────────────────────────
    ok_count = sum(1 for _, r in results if r["ok"])
    err_count = len(results) - ok_count
    total_time = sum(r["elapsed"] for _, r in results)

    st.markdown(
        f"<div class='metric-row'>"
        f"<div class='metric-box'><div class='val'>{len(results)}</div><div class='lbl'>Files processed</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#4ade80'>{ok_count}</div><div class='lbl'>Succeeded</div></div>"
        f"<div class='metric-box'><div class='val' style='color:#f87171'>{err_count}</div><div class='lbl'>Failed</div></div>"
        f"<div class='metric-box'><div class='val'>{fmt_time(total_time)}</div><div class='lbl'>Total time</div></div>"
        f"</div>",
        unsafe_allow_html=True,
    )

    # ── Per-file results ─────────────────────────────────────────────────────
    st.markdown("### π Results")

    ext = OUTPUT_FORMATS[cfg["output_format"]]

    # Distinct output name per successful result: "report.pdf" and
    # "report.docx" would otherwise both become "report.<ext>" and clash
    # inside the ZIP.
    ok_indices = [i for i, (_, r) in enumerate(results) if r["ok"]]
    out_names = dict(zip(
        ok_indices,
        _unique_output_names([results[i][0] for i in ok_indices], ext),
    ))

    # Build ZIP in memory
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for i in ok_indices:
            zf.writestr(out_names[i], results[i][1]["content"])
    zip_buf.seek(0)

    dl_col1, dl_col2 = st.columns([2, 4])
    with dl_col1:
        st.download_button(
            "⬇️ Download All as ZIP",
            data=zip_buf,
            file_name="docling_output.zip",
            mime="application/zip",
            use_container_width=True,
        )

    st.markdown("---")

    mime_map = {
        "md": "text/markdown",
        "html": "text/html",
        "json": "application/json",
        "txt": "text/plain",
        "doctags": "text/plain",
    }

    for i, (fname, res) in enumerate(results):
        icon = file_icon(fname)
        with st.expander(
            f"{icon} **{fname}** "
            + ("✅" if res["ok"] else "❌")
            + f" · {fmt_time(res['elapsed'])}",
            expanded=ok_count == 1,
        ):
            if not res["ok"]:
                st.error(f"**Error:** {res['error']}")
                if "traceback" in res:
                    # Shown inline rather than in a nested expander: many
                    # Streamlit releases raise when expanders are nested.
                    st.caption("Full traceback")
                    st.code(res["traceback"], language="python")
            else:
                # Metadata strip
                meta = res.get("meta", {})
                m_cols = st.columns(4)
                m_cols[0].metric("Pages", meta.get("pages", "β"))
                m_cols[1].metric("Tables", meta.get("tables", "β"))
                m_cols[2].metric("Figures", meta.get("figures", "β"))
                m_cols[3].metric("Time", fmt_time(res["elapsed"]))

                content = res["content"]
                out_name = out_names[i]

                # Download individual file. The key includes the position so
                # duplicate file names do not produce duplicate widget keys.
                st.download_button(
                    f"⬇️ Download {out_name}",
                    data=content.encode("utf-8"),
                    file_name=out_name,
                    mime=mime_map.get(ext, "text/plain"),
                    key=f"dl_{i}_{fname}",
                )

                # Preview
                char_count = len(content)
                word_count = len(content.split())
                st.caption(f"Output: **{char_count:,} chars** · **{word_count:,} words**")

                if ext == "md":
                    tab1, tab2 = st.tabs(["π Rendered", "π Raw Markdown"])
                    with tab1:
                        st.markdown(content[:15000] + ("\n\n*[truncated for preview…]*"
                                                       if len(content) > 15000 else ""),
                                    unsafe_allow_html=True)
                    with tab2:
                        # Escape before injecting into HTML: document text can
                        # contain <, >, & that would break the wrapper div.
                        preview_raw = html_lib.escape(content[:12000])
                        suffix = "…[truncated]" if len(content) > 12000 else ""
                        st.markdown(f"<div class='result-box'>{preview_raw}{suffix}</div>",
                                    unsafe_allow_html=True)

                elif ext == "html":
                    tab1, tab2 = st.tabs(["π Rendered", "π HTML Source"])
                    with tab1:
                        st.components.v1.html(content, height=600, scrolling=True)
                    with tab2:
                        st.code(content[:10000], language="html")

                elif ext == "json":
                    try:
                        parsed = json.loads(content)
                        st.json(parsed, expanded=False)
                    except Exception:
                        # Not valid JSON: fall back to a raw, truncated view.
                        st.code(content[:10000], language="json")

                else:
                    # Escape before injecting into HTML, same reason as the
                    # raw Markdown tab.
                    preview_plain = html_lib.escape(content[:12000])
                    suffix = "…[truncated]" if len(content) > 12000 else ""
                    st.markdown(f"<div class='result-box'>{preview_plain}{suffix}</div>",
                                unsafe_allow_html=True)


def _unique_output_names(filenames, ext):
    """Return one distinct ``<stem>.<ext>`` name per entry of *filenames*.

    The first use of a stem keeps the plain name; later clashes get
    ``_2``, ``_3``, ... appended to the stem. Order matches the input.
    """
    used = set()
    names = []
    for fname in filenames:
        stem = Path(fname).stem
        candidate = f"{stem}.{ext}"
        n = 1
        while candidate in used:
            n += 1
            candidate = f"{stem}_{n}.{ext}"
        used.add(candidate)
        names.append(candidate)
    return names
|
| 759 |
+
|
| 760 |
+
|
| 761 |
+
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Core ─────────────────────────────────────────────────────────────────────
|
| 2 |
+
docling>=2.5.0
|
| 3 |
+
docling-core>=2.0.0
|
| 4 |
+
|
| 5 |
+
# ── Streamlit ────────────────────────────────────────────────────────────────
|
| 6 |
+
streamlit>=1.35.0
|
| 7 |
+
|
| 8 |
+
# ── OCR backends ─────────────────────────────────────────────────────────────
|
| 9 |
+
easyocr>=1.7.0 # Pure-Python OCR (no system deps needed)
|
| 10 |
+
# pytesseract # Tesseract wrapper — uncomment if packages.txt has tesseract
|
| 11 |
+
|
| 12 |
+
# ── Document format support ──────────────────────────────────────────────────
|
| 13 |
+
python-docx>=1.1.0 # DOCX reading/writing
|
| 14 |
+
python-pptx>=0.6.23 # PPTX support
|
| 15 |
+
openpyxl>=3.1.2 # XLSX support
|
| 16 |
+
pandas>=2.0.0 # CSV / tabular
|
| 17 |
+
beautifulsoup4>=4.12.0 # HTML parsing
|
| 18 |
+
lxml>=5.0.0 # XML/HTML backend
|
| 19 |
+
|
| 20 |
+
# ── Image processing ─────────────────────────────────────────────────────────
|
| 21 |
+
Pillow>=10.0.0
|
| 22 |
+
opencv-python-headless>=4.9.0 # headless for server environments
|
| 23 |
+
|
| 24 |
+
# ── PDF ──────────────────────────────────────────────────────────────────────
|
| 25 |
+
pypdfium2>=4.0.0 # Fast PDF rendering backend used by Docling
|
| 26 |
+
pdfminer.six>=20221105
|
| 27 |
+
|
| 28 |
+
# ── ML / model support ───────────────────────────────────────────────────────
|
| 29 |
+
torch>=2.1.0
|
| 30 |
+
torchvision>=0.16.0
|
| 31 |
+
transformers>=4.40.0
|
| 32 |
+
huggingface-hub>=0.20.0
|
| 33 |
+
|
| 34 |
+
# ── Misc utilities ───────────────────────────────────────────────────────────
|
| 35 |
+
requests>=2.31.0
|
| 36 |
+
tqdm>=4.66.0
|