import io
import re
from difflib import SequenceMatcher

import gradio as gr
import fitz  # PyMuPDF
import docx
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # NOTE(review): unused here — confirm nothing imports it from this module before removing
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for semantic LO-vs-content matching.
model = SentenceTransformer('all-MiniLM-L6-v2')


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file* (raw PDF bytes).

    Returns an empty string when the PDF cannot be opened or parsed.
    """
    try:
        pdf_reader = fitz.open(stream=pdf_file, filetype="pdf")
        try:
            text = "".join(page.get_text() for page in pdf_reader)
        finally:
            # Always release the document, even if text extraction raises.
            pdf_reader.close()
        return text.strip()
    except Exception:
        return ""


def normalize_text(text):
    """Lowercase *text*, trim it, and collapse whitespace runs to single spaces."""
    return re.sub(r'\s+', ' ', text.strip().lower())


def extract_text_from_docx(docx_file):
    """Return the non-empty paragraph strings of *docx_file* (raw .docx bytes).

    Returns an empty list when the document cannot be parsed.
    """
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception:
        return []


def semantic_match(lo_list, content):
    """Return one cosine-similarity score (rounded to 2 dp) per learning outcome.

    Each entry of *lo_list* is compared against *content*; a score of 0.0 is
    recorded for any LO whose encoding or comparison fails.
    """
    try:
        # Encode the (potentially large) content ONCE instead of once per LO.
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)
    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            scores.append(0.0)
    return scores


def content_change_score(text1, text2):
    """Estimate how much *text2* differs from *text1* as a percentage (0-100).

    Uses difflib's similarity ratio over the normalized texts; returns 100.0
    when the comparison itself fails.
    """
    try:
        sim = SequenceMatcher(None, normalize_text(text1), normalize_text(text2)).ratio()
        return round((1 - sim) * 100, 2)
    except Exception:
        return 100.0


def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a .docx list of learning outcomes.

    Returns a ``(summary_text, figure)`` pair; the figure is ``None`` when
    extraction of either PDF or of the learning outcomes fails.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    # Require a minimum amount of extracted text so scanned/empty PDFs fail loudly.
    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
        return "⚠️ Could not extract meaningful content from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)
    change_percent = content_change_score(old_text, new_text)

    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    # "Matched" counts LOs whose score did not drop (ties included).
    matched_los = sum(n >= o for n, o in zip(new_scores, old_scores))

    summary = f"📈 Content Change Estimate: {change_percent}%\n"
    summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
    if improved_count > 0:
        summary += "🟢 Summary: New handout has improved structure and added clarity."
    else:
        summary += "⚠️ Summary: No significant improvement in LO alignment."

    # Grouped bar chart: per-LO match score, old vs new.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
    ax.legend()
    plt.tight_layout()
    return summary, fig


with gr.Blocks() as demo:
    gr.Markdown("📘 **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
    with gr.Row():
        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")
    output_text = gr.Textbox(label="📋 Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="📊 LO Match Chart")
    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

# Guard so importing this module (e.g. in tests) does not start the web server.
if __name__ == "__main__":
    demo.launch()