Spaces:

lexicalspace
/

Blogger-Toolkit

Paused

App Files Files Community

lexicalspace committed on Feb 14

Commit

89fd60a

verified ·

1 Parent(s): c0058aa

upgrade txt to pdf

Browse files

Files changed (1) hide show

app.py +174 -72

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ from collections import Counter
 from bs4 import BeautifulSoup
 from fpdf import FPDF
 import os
 # --- BATCH 1: MEDIA & FILE FUNCTIONS ---
@@ -28,108 +28,210 @@ import os
 def run_pdf_converter_app():
     """
     Render the Streamlit "LMS Text to PDF Converter" tool.

     The function is self-contained: nested helpers download a Unicode
     font, strip LMS artefacts from the pasted text, and build the PDF.
     The UI consists of a text area, a clean-up checkbox, a filename box
     and a button that produces a download link for the generated PDF.

     Relies on the module-level names ``st``, ``requests``, ``re``, ``os``
     and ``FPDF`` (fpdf2 API).
     """

     # --- Nested Helper 1: Download Font (Crucial for HF Spaces) ---
     def get_unicode_font():
         """Return the path to DejaVuSans.ttf, downloading it once if absent.

         Returns None (after showing an error in the UI) when the font
         cannot be fetched or written.
         """
         font_filename = "DejaVuSans.ttf"
         # URL to a stable Unicode font source
         font_url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"

         if os.path.exists(font_filename):
             return font_filename

         partial_path = font_filename + ".part"
         try:
             with st.spinner("Downloading Unicode font support..."):
                 response = requests.get(font_url, timeout=10)
                 # Without this check an HTTP error page would be saved as
                 # the font and reused on every later run.
                 response.raise_for_status()
                 with open(partial_path, "wb") as f:
                     f.write(response.content)
                 # Publish the file only once it is completely written.
                 os.replace(partial_path, font_filename)
         except (requests.RequestException, OSError) as e:
             st.error(f"Failed to load font: {e}")
             return None
         return font_filename

     # --- Nested Helper 2: Clean LMS Garbage ---
     def clean_lms_text(text, active=True):
         """Return *text* with LMS artefacts removed; unchanged if not *active*."""
         if not active:
             return text

         # 1. Normalize line endings (windows/linux compatibility)
         text = text.replace('\r\n', '\n').replace('\r', '\n')

         # 2. Remove anything inside square brackets, e.g. "[ID_123]"
         text = re.sub(r'\[.*?\]', '', text)

         # 3. Remove zero-width spaces often found in web copying
         text = text.replace('\u200b', '')

         return text.strip()

     # --- Nested Helper 3: PDF Generator ---
     def create_pdf(text_content):
         """Render *text_content* on a PDF and return the document as bytes."""
         pdf = FPDF()
         pdf.add_page()

         # Load the custom font
         font_path = get_unicode_font()
         if font_path:
             # Add font (requires fpdf2)
             pdf.add_font('DejaVu', fname=font_path)
             pdf.set_font('DejaVu', size=12)
         else:
             pdf.set_font("Arial", size=12)
             st.warning("Using standard font. Special symbols might not appear.")
             # Core fonts only cover Latin-1; replace anything else with "?"
             # so the fallback degrades instead of raising an encoding error.
             text_content = text_content.encode("latin-1", "replace").decode("latin-1")

         # multi_cell handles text wrapping automatically
         pdf.multi_cell(0, 10, text=text_content)

         # fpdf2 returns a bytearray, legacy PyFPDF a Latin-1 str; the
         # download button needs real bytes in either case.
         rendered = pdf.output(dest='S')
         if isinstance(rendered, str):
             return rendered.encode("latin-1")
         return bytes(rendered)

     # --- UI LAYOUT ---
     st.markdown("### 📝 LMS Text to PDF Converter")
     # Backslashes are doubled so the literal no longer contains the invalid
     # escape sequences "\S" and "\D"; the displayed text is unchanged.
     st.info("Paste your text below. This tool supports Math symbols ($ \\Sigma, \\Delta $) and foreign languages.")

     # 1. Input
     raw_text = st.text_area("Paste Question/Text Here:", height=250)

     # 2. Settings
     col1, col2 = st.columns(2)
     with col1:
         auto_clean = st.checkbox("Remove LMS hidden codes like [ID_123]", value=True)
     with col2:
         filename = st.text_input("Filename", value="notes.pdf")

     # 3. Action
     if st.button("Convert to PDF"):
         if not raw_text:
             st.error("Please paste some text first.")
         else:
             clean_text = clean_lms_text(raw_text, active=auto_clean)
             pdf_data = create_pdf(clean_text)

             if pdf_data:
                 st.success("Conversion Successful!")
                 # A blank filename box would otherwise produce ".pdf".
                 download_name = filename.strip() or "notes.pdf"
                 if not download_name.endswith('.pdf'):
                     download_name = f"{download_name}.pdf"
                 st.download_button(
                     label="📥 Download PDF",
                     data=pdf_data,
                     file_name=download_name,
                     mime="application/pdf"
                 )
 import streamlit as st
 from huggingface_hub import InferenceClient

 from bs4 import BeautifulSoup
 from fpdf import FPDF
 import os
+from io import BytesIO
 # --- BATCH 1: MEDIA & FILE FUNCTIONS ---
 def run_pdf_converter_app():
     """
     Render the Streamlit "Supercharged LMS to PDF" tool.

     Features: Unicode output, Markdown-like formatting (``# Title``,
     ``- item``), embedding of images given as a URL on their own line,
     LMS metadata removal and smart symbol replacement.

     Relies on the module-level names ``st``, ``requests``, ``re``, ``os``,
     ``FPDF`` and ``BytesIO``.
     """
     IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
     POINTS_TO_MM = 25.4 / 72   # font sizes are in points, page units in mm
     LINE_SPACING = 1.18        # reproduces the former 5 mm lines at 12 pt

     # --- 1. CONFIG: Smart Symbol Replacement Dictionary ---
     # Keys are regex patterns. Purely alphabetic keys are matched as whole
     # words, case-insensitively; the rest are matched wherever they occur.
     SYMBOL_MAP = {
         r'\->': '→',
         r'<-': '←',
         r'<=': '≤',
         r'>=': '≥',
         r'!=': '≠',
         r'\.\.\.': '…',
         r'\(c\)': '©',
         r'\(r\)': '®',
         r'alpha': 'α',
         r'beta': 'β',
         r'theta': 'θ',
         r'pi': 'π',
         r'sigma': 'Σ',
         r'delta': 'Δ',
         r'gamma': 'Γ',
         r'omega': 'Ω',
         r'mu': 'μ',
         r'lambda': 'λ',
         r'sqrt': '√',
         r'deg': '°',
     }

     def is_image_url(line):
         """Return True if *line* starts with "http" and ends in an image extension."""
         return line.startswith("http") and line.lower().endswith(IMAGE_EXTENSIONS)

     # --- 2. HELPER: Download Font ---
     def get_unicode_font():
         """Return the path to DejaVuSans.ttf, downloading it once if absent.

         Returns None (after showing an error in the UI) when the font
         cannot be fetched or written.
         """
         font_path = "DejaVuSans.ttf"
         font_url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"

         if os.path.exists(font_path):
             return font_path

         partial_path = font_path + ".part"
         try:
             response = requests.get(font_url, timeout=10)
             # Without this check an HTTP error page would be saved as the
             # font and reused on every later run.
             response.raise_for_status()
             with open(partial_path, "wb") as f:
                 f.write(response.content)
             # Publish the file only once it is completely written.
             os.replace(partial_path, font_path)
         except (requests.RequestException, OSError) as e:
             st.error(f"Font download failed: {e}")
             return None
         return font_path

     # --- 3. HELPER: Clean & Enhance Text ---
     def process_text(text, remove_lms_junk=True, smart_symbols=True):
         """Normalize *text*, optionally stripping LMS metadata and
         converting plain-text approximations into real symbols.

         Returns the processed text with surrounding whitespace removed,
         or "" for empty input.
         """
         if not text:
             return ""

         # A. Basic Normalization
         text = text.replace('\r\n', '\n').replace('\r', '\n')

         # B. LMS Junk Removal (Aggressive)
         if remove_lms_junk:
             # Removes "Question ID: [12345]" patterns
             text = re.sub(r'Question\s+ID\s*[:\-]\s*\[?\w+\]?', '', text, flags=re.IGNORECASE)
             # Removes "Points: 1.0" or similar
             text = re.sub(r'Points\s*[:\-]\s*\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
             # Removes timestamps like [10:00 AM]
             text = re.sub(r'\[\d{1,2}:\d{2}\s*(AM|PM)?\]', '', text)
             # Removes any remaining text in square brackets
             text = re.sub(r'\[.*?\]', '', text)

         # C. Smart Symbol Replacement
         if smart_symbols:
             converted = []
             for line in text.split('\n'):
                 # Image URLs must stay untouched: rewriting e.g. "alpha.png"
                 # to "α.png" would break the download in generate_pdf.
                 if not is_image_url(line.strip()):
                     for pattern, symbol in SYMBOL_MAP.items():
                         if pattern.isalpha():
                             # Whole words only, so 'alphabet' is not turned into 'αbet'.
                             line = re.sub(r'\b' + pattern + r'\b', symbol, line, flags=re.IGNORECASE)
                         else:
                             line = re.sub(pattern, symbol, line)
                 converted.append(line)
             text = '\n'.join(converted)

         return text.strip()

     # --- 4. HELPER: PDF Generator Class ---
     class PDF(FPDF):
         """FPDF subclass that prints a centered page number in the footer."""

         def header(self):
             # Intentionally empty: no page header.
             pass

         def footer(self):
             self.set_y(-15)
             self.set_font("Arial", "I", 8)
             self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

     # --- 5. MAIN LOGIC: Generate PDF ---
     def generate_pdf(content, font_size=12, margin=10):
         """Render *content* line by line and return the PDF as bytes.

         content   -- processed text; see the line rules in the loop below
         font_size -- body font size in points
         margin    -- left/top/right page margin in mm
         """
         pdf = PDF()
         pdf.set_auto_page_break(auto=True, margin=15)
         pdf.set_margins(margin, margin, margin)
         pdf.add_page()

         # Load Font
         font_path = get_unicode_font()
         unicode_ok = bool(font_path)
         if unicode_ok:
             pdf.add_font('DejaVu', '', font_path, uni=True)
             pdf.set_font('DejaVu', '', font_size)
         else:
             pdf.set_font("Arial", size=font_size)
             st.warning("⚠️ Unicode font missing. Symbols may not render.")

         def printable(value):
             # Core fonts only cover Latin-1; replace anything else with "?"
             # so the fallback degrades instead of raising an encoding error.
             if unicode_ok:
                 return value
             return value.encode('latin-1', 'replace').decode('latin-1')

         # "•" is outside Latin-1, so the fallback font needs an ASCII bullet.
         bullet = "•" if unicode_ok else "-"

         # Scale line heights with the font so large sizes do not overlap;
         # the floors keep the original 5 mm / 10 mm at the default size.
         line_height = max(5, font_size * POINTS_TO_MM * LINE_SPACING)
         title_height = max(10, (font_size + 6) * POINTS_TO_MM * LINE_SPACING)

         for line in content.split('\n'):
             line = line.strip()

             # Image URL on its own line
             if is_image_url(line):
                 # NOTE(security): this fetches a user-supplied URL from the
                 # server; restrict allowed hosts if the app is exposed
                 # beyond trusted users.
                 try:
                     img_resp = requests.get(line, timeout=5)
                     if img_resp.status_code == 200:
                         # Embed from memory at the current x position, 100 mm wide
                         pdf.image(BytesIO(img_resp.content), x=None, w=100)
                         pdf.ln(5)  # Add spacing
                     else:
                         pdf.write(line_height, printable(f"[Image load failed: {line}]"))
                         pdf.ln()
                 except Exception:
                     # Best effort: an unreachable or undecodable image
                     # must not abort the whole document.
                     pdf.write(line_height, printable(f"[Invalid Image URL: {line}]"))
                     pdf.ln()
                 continue

             # Header (starts with "# ")
             if line.startswith('# '):
                 pdf.set_font_size(font_size + 6)  # Bigger font
                 pdf.set_text_color(0, 50, 150)    # Blueish color
                 # Drop only the leading marker; '#' inside the title is kept.
                 pdf.cell(0, title_height, printable(line[2:].strip()), ln=True)
                 pdf.set_text_color(0, 0, 0)       # Reset color
                 pdf.set_font_size(font_size)      # Reset size

             # Bullet point
             elif line.startswith('* ') or line.startswith('- '):
                 current_x = pdf.get_x()
                 pdf.set_x(current_x + 5)  # Indent
                 pdf.write(line_height, printable(f"{bullet} {line[2:]}"))
                 pdf.set_x(current_x)      # Reset indent
                 pdf.ln()

             # Standard text
             elif line:
                 pdf.write(line_height, printable(line))
                 pdf.ln()
             else:
                 pdf.ln(3)  # Small gap for empty lines

         # fpdf2 returns a bytearray (which has no .encode), legacy PyFPDF a
         # Latin-1 str; normalize both to bytes for the download button.
         rendered = pdf.output(dest='S')
         if isinstance(rendered, str):
             return rendered.encode('latin-1')
         return bytes(rendered)

     # --- 6. UI LAYOUT ---
     st.title("🚀 Supercharged LMS to PDF")

     with st.expander("ℹ️ How to use & Features", expanded=False):
         st.markdown("""
         * **Paste Text:** Copy directly from your quiz or LMS.
         * **Smart Symbols:** Writes 'alpha' -> 'α', '->' -> '→' automatically.
         * **Images:** Paste a direct image URL (e.g., `https://example.com/graph.png`) on its own line to embed it.
         * **Formatting:** Use `# Title` for headers, `- Item` for lists.
         """)

     # Sidebar Controls
     with st.sidebar:
         st.header("⚙️ PDF Settings")
         f_size = st.slider("Font Size", 8, 24, 12)
         pg_margin = st.slider("Page Margins (mm)", 5, 30, 10)
         smart_fix = st.checkbox("Smart Symbol Fix (alpha -> α)", value=True)
         lms_clean = st.checkbox("Remove LMS Metadata", value=True)

     # Input Area
     raw_input = st.text_area("Paste Content Here:", height=300, placeholder="Paste text, questions, or image URLs here...")

     # Action Button
     if st.button("✨ Generate PDF", type="primary"):
         if not raw_input:
             st.warning("⚠️ Please enter text to convert.")
         else:
             with st.spinner("Processing text, downloading fonts, and rendering PDF..."):
                 # 1. Clean Text
                 final_text = process_text(raw_input, remove_lms_junk=lms_clean, smart_symbols=smart_fix)

                 # 2. Generate PDF (returned as bytes)
                 pdf_bytes = generate_pdf(final_text, font_size=f_size, margin=pg_margin)

                 st.success("✅ PDF Ready!")

                 # 3. Download
                 st.download_button(
                     label="⬇️ Download PDF Document",
                     data=pdf_bytes,
                     file_name="smart_notes.pdf",
                     mime="application/pdf"
                 )
 import streamlit as st
 from huggingface_hub import InferenceClient