lexicalspace committed on
Commit
89fd60a
·
verified ·
1 Parent(s): c0058aa

upgrade txt to pdf

Browse files
Files changed (1) hide show
  1. app.py +174 -72
app.py CHANGED
@@ -15,7 +15,7 @@ from collections import Counter
15
  from bs4 import BeautifulSoup
16
  from fpdf import FPDF
17
  import os
18
-
19
 
20
  # --- BATCH 1: MEDIA & FILE FUNCTIONS ---
21
 
@@ -28,108 +28,210 @@ import os
28
 
29
 
30
 
 
 
 
def run_pdf_converter_app():
    """
    Render the LMS text-to-PDF converter UI and build the PDF on request.

    The function takes no arguments and returns nothing: it draws Streamlit
    widgets (text area, clean-up checkbox, filename box, convert button) and,
    when the button is pressed, offers the generated PDF via a download
    button. All helpers are nested so the tool stays self-contained.
    """

    # --- Nested Helper 1: Download Font (Crucial for HF Spaces) ---
    def get_unicode_font():
        """Return the path of a local DejaVuSans.ttf, downloading it if absent.

        Returns None (after reporting the error in the UI) when the font
        cannot be fetched.
        """
        font_filename = "DejaVuSans.ttf"
        # URL to a stable Unicode font source
        font_url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"

        if os.path.exists(font_filename):
            return font_filename

        # Download to a side file and rename only on success, so a failed or
        # truncated download is never mistaken for a cached font later.
        partial_filename = font_filename + ".part"
        try:
            with st.spinner("Downloading Unicode font support..."):
                # A timeout keeps the app from hanging on a stalled connection.
                response = requests.get(font_url, timeout=10)
                # Without this an HTTP error page would be saved as the font.
                response.raise_for_status()
                with open(partial_filename, "wb") as f:
                    f.write(response.content)
                os.replace(partial_filename, font_filename)
        except Exception as e:
            st.error(f"Failed to load font: {e}")
            return None
        finally:
            # No-op after a successful rename; cleans up after a failure.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
        return font_filename

    # --- Nested Helper 2: Clean LMS Garbage ---
    def clean_lms_text(text, active=True):
        """Strip LMS artefacts from *text*; return it untouched if not *active*."""
        if not active:
            return text

        # 1. Normalize line endings (windows/linux compatibility)
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # 2. Remove anything inside square brackets, e.g. hidden "[Question ID]"
        text = re.sub(r'\[.*?\]', '', text)

        # 3. Remove zero-width spaces often found in web copying
        text = text.replace('\u200b', '')

        return text.strip()

    # --- Nested Helper 3: PDF Generator ---
    def create_pdf(text_content):
        """Lay *text_content* out on a single-flow PDF and return it as bytes."""
        pdf = FPDF()
        pdf.add_page()

        # Load the custom font
        font_path = get_unicode_font()
        if font_path:
            # Add font (requires fpdf2)
            pdf.add_font('DejaVu', fname=font_path)
            pdf.set_font('DejaVu', size=12)
        else:
            # Fallback: core fonts only cover Latin-1, so replace anything
            # else with '?' instead of letting the PDF writer raise.
            pdf.set_font("Arial", size=12)
            st.warning("Using standard font. Special symbols might not appear.")
            text_content = text_content.encode('latin-1', 'replace').decode('latin-1')

        # multi_cell handles text wrapping automatically
        pdf.multi_cell(0, 10, text=text_content)

        # fpdf2 returns a bytearray here, which st.download_button does not
        # accept; legacy PyFPDF returns a Latin-1 str. Normalise both to bytes.
        rendered = pdf.output(dest='S')
        if isinstance(rendered, str):
            return rendered.encode('latin-1')
        return bytes(rendered)

    # --- UI LAYOUT ---
    st.markdown("### 📝 LMS Text to PDF Converter")
    # Backslashes are escaped explicitly; the displayed text is unchanged.
    st.info("Paste your text below. This tool supports Math symbols ($ \\Sigma, \\Delta $) and foreign languages.")

    # 1. Input
    raw_text = st.text_area("Paste Question/Text Here:", height=250)

    # 2. Settings
    col1, col2 = st.columns(2)
    with col1:
        auto_clean = st.checkbox("Remove LMS hidden codes like [ID_123]", value=True)
    with col2:
        filename = st.text_input("Filename", value="notes.pdf")

    # 3. Action
    if st.button("Convert to PDF"):
        if not raw_text:
            st.error("Please paste some text first.")
        else:
            # Process
            clean_text = clean_lms_text(raw_text, active=auto_clean)
            pdf_data = create_pdf(clean_text)

            if pdf_data:
                st.success("Conversion Successful!")
                # A blank filename box would otherwise yield a file named ".pdf".
                base_name = filename.strip() or "notes"
                if not base_name.lower().endswith('.pdf'):
                    base_name = f"{base_name}.pdf"
                # Download Button
                st.download_button(
                    label="📥 Download PDF",
                    data=pdf_data,
                    file_name=base_name,
                    mime="application/pdf"
                )
131
 
132
 
 
133
 
134
  import streamlit as st
135
  from huggingface_hub import InferenceClient
 
15
  from bs4 import BeautifulSoup
16
  from fpdf import FPDF
17
  import os
18
+ from io import BytesIO
19
 
20
  # --- BATCH 1: MEDIA & FILE FUNCTIONS ---
21
 
 
28
 
29
 
30
 
31
+
32
+
33
+
def run_pdf_converter_app():
    """
    All-in-one Streamlit tool for advanced text-to-PDF conversion.

    Features: Unicode text, Markdown-like formatting (`# ` headers, `- `/`* `
    bullets), image embedding from direct URLs, and smart symbol replacement.

    The function takes no arguments and returns nothing: it draws the widgets
    (sidebar settings, text area, generate button) and, when the button is
    pressed, offers the generated PDF through a download button.
    """

    # --- 1. CONFIG: Smart Symbol Replacement Dictionary ---
    # Keys are regex patterns; purely alphabetic keys are matched as whole
    # words (case-insensitively), everything else is applied as-is.
    SYMBOL_MAP = {
        r'\->': '→',
        r'<-': '←',
        r'<=': '≤',
        r'>=': '≥',
        r'!=': '≠',
        r'\.\.\.': '…',
        r'\(c\)': '©',
        r'\(r\)': '®',
        r'alpha': 'α',
        r'beta': 'β',
        r'theta': 'θ',
        r'pi': 'π',
        r'sigma': 'Σ',
        r'delta': 'Δ',
        r'gamma': 'Γ',
        r'omega': 'Ω',
        r'mu': 'μ',
        r'lambda': 'λ',
        r'sqrt': '√',
        r'deg': '°'
    }

    # --- 2. HELPER: Download Font ---
    def get_unicode_font():
        """Return the path of a local DejaVuSans.ttf, downloading it if absent.

        Returns None (after reporting the error in the UI) when the font
        cannot be fetched.
        """
        font_path = "DejaVuSans.ttf"
        font_url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"

        if os.path.exists(font_path):
            return font_path

        # Download to a side file and rename only on success, so a failed or
        # truncated download is never mistaken for a cached font later.
        partial_path = font_path + ".part"
        try:
            response = requests.get(font_url, timeout=10)
            # Without this an HTTP error page would be saved as the font.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, font_path)
        except Exception as e:
            st.error(f"Font download failed: {e}")
            return None
        finally:
            # No-op after a successful rename; cleans up after a failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        return font_path

    # --- 3. HELPER: Clean & Enhance Text ---
    def process_text(text, remove_lms_junk=True, smart_symbols=True):
        """Normalise line endings, optionally strip LMS metadata and swap
        plain-text approximations for real symbols. Returns "" for empty input.
        """
        if not text:
            return ""

        # A. Basic Normalization
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # B. LMS Junk Removal (Aggressive)
        if remove_lms_junk:
            # Removes "Question ID: [12345]" patterns
            text = re.sub(r'Question\s+ID\s*[:\-]\s*\[?\w+\]?', '', text, flags=re.IGNORECASE)
            # Removes "Points: 1.0" or similar
            text = re.sub(r'Points\s*[:\-]\s*\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
            # Removes timestamps like [10:00 AM]
            text = re.sub(r'\[\d{1,2}:\d{2}\s*(AM|PM)?\]', '', text)
            # Removes any remaining text in square brackets (mostly metadata)
            text = re.sub(r'\[.*?\]', '', text)

        # C. Smart Symbol Replacement
        if smart_symbols:
            for pattern, symbol in SYMBOL_MAP.items():
                if pattern.isalpha():
                    # Word boundaries keep 'alphabet' from becoming 'αbet'.
                    text = re.sub(r'\b' + pattern + r'\b', symbol, text, flags=re.IGNORECASE)
                else:
                    # Operator-like patterns ('->', '<=') are replaced anywhere.
                    text = re.sub(pattern, symbol, text)

        return text.strip()

    # --- 4. HELPER: PDF Generator Class ---
    class PDF(FPDF):
        """FPDF subclass that prints a centred page number in the footer."""

        def header(self):
            # Intentionally empty: no page header is drawn.
            pass

        def footer(self):
            self.set_y(-15)
            self.set_font("Arial", "I", 8)
            self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    # --- 5. MAIN LOGIC: Generate PDF ---
    def generate_pdf(content, font_size=12, margin=10):
        """Render *content* line by line and return the PDF as bytes.

        Lines that are direct image URLs are fetched and embedded, `# ` lines
        become coloured headings, `- `/`* ` lines become bullets, and
        everything else is written as wrapped body text.
        """
        pdf = PDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_margins(margin, margin, margin)
        pdf.add_page()

        # Load Font
        font_path = get_unicode_font()
        if font_path:
            pdf.add_font('DejaVu', '', font_path, uni=True)
            pdf.set_font('DejaVu', '', font_size)
            bullet = "•"
        else:
            pdf.set_font("Arial", size=font_size)
            st.warning("⚠️ Unicode font missing. Symbols may not render.")
            # Core fonts only cover Latin-1: replace anything else with '?'
            # and use an ASCII bullet so the PDF writer does not raise.
            content = content.encode('latin-1', 'replace').decode('latin-1')
            bullet = "-"

        image_suffixes = ('.png', '.jpg', '.jpeg')

        # Process content line by line to detect images/formatting
        for line in content.split('\n'):
            line = line.strip()

            # CHECK: Image URL? (starts with http & ends with an image extension)
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider restricting hosts if this app is exposed publicly.
            if line.startswith("http") and line.lower().endswith(image_suffixes):
                try:
                    # Download image to memory
                    img_resp = requests.get(line, timeout=5)
                    if img_resp.status_code == 200:
                        # x=None places the image at the current x position
                        # (the left margin here), 100 mm wide.
                        pdf.image(BytesIO(img_resp.content), x=None, w=100)
                        pdf.ln(5)  # Add spacing
                    else:
                        pdf.write(5, f"[Image load failed: {line}]")
                        pdf.ln()
                except Exception:
                    # Network failure or undecodable image data: leave a
                    # visible placeholder instead of aborting the document.
                    pdf.write(5, f"[Invalid Image URL: {line}]")
                    pdf.ln()
                continue

            # CHECK: Header? (starts with '# ')
            if line.startswith('# '):
                pdf.set_font_size(font_size + 6)  # Bigger font
                pdf.set_text_color(0, 50, 150)    # Blueish color
                # Drop only the leading marker so titles such as "C# notes"
                # keep their own '#' characters.
                pdf.cell(0, 10, line[2:].strip(), ln=True)
                pdf.set_text_color(0, 0, 0)       # Reset color
                pdf.set_font_size(font_size)      # Reset size

            # CHECK: Bullet Point?
            elif line.startswith('* ') or line.startswith('- '):
                current_x = pdf.get_x()
                pdf.set_x(current_x + 5)  # Indent
                pdf.write(5, f"{bullet} {line[2:]}")
                pdf.set_x(current_x)      # Reset indent
                pdf.ln()

            # STANDARD TEXT
            elif line:
                pdf.write(5, line)
                pdf.ln()  # Line break
            else:
                pdf.ln(3)  # Small gap for empty lines

        # fpdf2 returns a bytearray (which has no .encode and which
        # st.download_button does not accept); legacy PyFPDF returns a
        # Latin-1 str. Normalise both to bytes.
        rendered = pdf.output(dest='S')
        if isinstance(rendered, str):
            return rendered.encode('latin-1')
        return bytes(rendered)

    # --- 6. UI LAYOUT ---
    st.title("🚀 Supercharged LMS to PDF")

    with st.expander("ℹ️ How to use & Features", expanded=False):
        st.markdown("""
        * **Paste Text:** Copy directly from your quiz or LMS.
        * **Smart Symbols:** Writes 'alpha' -> 'α', '->' -> '→' automatically.
        * **Images:** Paste a direct image URL (e.g., `https://example.com/graph.png`) on its own line to embed it.
        * **Formatting:** Use `# Title` for headers, `- Item` for lists.
        """)

    # Sidebar Controls
    with st.sidebar:
        st.header("⚙️ PDF Settings")
        f_size = st.slider("Font Size", 8, 24, 12)
        pg_margin = st.slider("Page Margins (mm)", 5, 30, 10)
        smart_fix = st.checkbox("Smart Symbol Fix (alpha -> α)", value=True)
        lms_clean = st.checkbox("Remove LMS Metadata", value=True)

    # Input Area
    raw_input = st.text_area("Paste Content Here:", height=300, placeholder="Paste text, questions, or image URLs here...")

    # Action Button
    if st.button("✨ Generate PDF", type="primary"):
        if not raw_input:
            st.warning("⚠️ Please enter text to convert.")
        else:
            with st.spinner("Processing text, downloading fonts, and rendering PDF..."):
                # 1. Clean Text
                final_text = process_text(raw_input, remove_lms_junk=lms_clean, smart_symbols=smart_fix)

                # 2. Generate PDF (generate_pdf already returns plain bytes)
                pdf_bytes = generate_pdf(final_text, font_size=f_size, margin=pg_margin)

                st.success("✅ PDF Ready!")

                # 3. Download
                st.download_button(
                    label="⬇️ Download PDF Document",
                    data=pdf_bytes,
                    file_name="smart_notes.pdf",
                    mime="application/pdf"
                )
232
 
233
 
234
+
235
 
236
  import streamlit as st
237
  from huggingface_hub import InferenceClient