Spaces:
Sleeping
Sleeping
| """ | |
| LexiMind -- Discover Books & Papers | |
| Browse literary works and research papers analyzed by a multi-task transformer. | |
| Find your next read by topic, emotion, or keyword -- with AI-generated summaries. | |
| Author: Oliver Perrin | |
| Date: 2026-01-14 | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| import warnings | |
| from pathlib import Path | |
| from typing import Any | |
| warnings.filterwarnings("ignore", message=".*parameter in the Blocks constructor will be removed.*") | |
| import gradio as gr | |
| # --------------- Load Dataset --------------- | |
# Candidate locations for the discovery dataset, tried in order:
# first relative to this file's repo layout (app/../data), then the
# current working directory (useful when launched from the repo root).
_DATA_PATHS = [
    Path(__file__).parent.parent / "data" / "discovery_dataset.jsonl",
    Path("data") / "discovery_dataset.jsonl",
]
def _load_jsonl() -> list[dict[str, Any]]:
    """Load the discovery dataset from the first existing candidate path.

    Returns:
        One dict per non-blank line of the JSONL file.

    Raises:
        FileNotFoundError: If none of the candidate paths exist.
    """
    for p in _DATA_PATHS:
        if p.exists():
            print(f"Loading discovery dataset from {p}...")
            # Explicit encoding: JSONL is UTF-8 by convention; the previous
            # bare open() used the platform default, which breaks on Windows.
            with open(p, encoding="utf-8") as f:
                return [json.loads(line) for line in f if line.strip()]
    raise FileNotFoundError(
        f"Discovery dataset not found. Looked in: {[str(p) for p in _DATA_PATHS]}"
    )
_raw_items = _load_jsonl()
print(f"Loaded {len(_raw_items)} items")

# Social media posts are excluded from the discovery catalog.
ALL_ITEMS: list[dict[str, Any]] = [
    rec for rec in _raw_items if rec.get("source_type") != "social"
]

# Unique model-predicted labels, sorted for stable dropdown ordering.
# "neutral" is omitted from the tone list since it carries no signal.
TOPICS: list[str] = sorted({str(rec["topic"]) for rec in ALL_ITEMS if rec.get("topic")})
EMOTIONS: list[str] = sorted(
    {
        str(rec["emotion"])
        for rec in ALL_ITEMS
        if rec.get("emotion") and rec["emotion"] != "neutral"
    }
)

# Partition the catalog by source type.
BOOKS: list[dict[str, Any]] = [rec for rec in ALL_ITEMS if rec.get("source_type") == "literary"]
PAPERS: list[dict[str, Any]] = [rec for rec in ALL_ITEMS if rec.get("source_type") == "academic"]

print(f"Topics ({len(TOPICS)}): {TOPICS}")
print(f"Emotions ({len(EMOTIONS)}): {EMOTIONS}")
print(f"Books: {len(BOOKS)}, Papers: {len(PAPERS)}")
# --------------- Load Evaluation Metrics ---------------
# Metrics are optional: the app still works without the evaluation report,
# so failures here are reported but never fatal.
METRICS: dict[str, Any] = {}
_metrics_path = Path(__file__).parent.parent / "outputs" / "evaluation_report.json"
if _metrics_path.exists():
    try:
        with open(_metrics_path, encoding="utf-8") as f:
            METRICS = json.load(f)
        print(f"Loaded evaluation metrics from {_metrics_path}")
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from a blanket `except Exception`: only I/O and parse
        # errors are expected and safely ignorable here.
        print(f"Warning: Could not load metrics: {e}")
| # --------------- Helpers --------------- | |
| def _clean_paper_title(raw_title: str) -> str: | |
| """Clean up arXiv paper titles. | |
| Paper 'titles' in this dataset are the first ~150 chars of the abstract, | |
| not real titles. Clean them into a short, readable heading. | |
| """ | |
| t = raw_title.strip() | |
| # Remove bracket markers like [ [ background ] ] | |
| t = re.sub(r"\[[\s\[]*[^\]]*[\]\s]*\]", "", t) | |
| # Remove runs of + symbols (with or without spaces between them) | |
| t = re.sub(r"(\+\s*){2,}", "", t) | |
| # Remove other LaTeX artifacts like ^s$ ] | |
| t = re.sub(r"\^[a-z0-9]*\$\s*\]?", "", t) | |
| # Collapse whitespace and strip leading/trailing punctuation | |
| t = re.sub(r"\s+", " ", t).strip() | |
| t = t.strip(":").strip() | |
| # Remove leading section headers (e.g. "background :", "introduction :") | |
| t = re.sub( | |
| r"^(background|introduction|abstract|motivation|overview)\s*:\s*", | |
| "", | |
| t, | |
| flags=re.IGNORECASE, | |
| ) | |
| # Remove trailing ellipsis or period | |
| t = t.rstrip(".").rstrip() | |
| if t.endswith("..."): | |
| t = t[:-3].rstrip() | |
| # Capitalize first letter | |
| if t and t[0].islower(): | |
| t = t[0].upper() + t[1:] | |
| # Truncate to a reasonable length at a word boundary | |
| if len(t) > 90: | |
| cut = t[:90].rfind(" ") | |
| if cut > 40: | |
| t = t[:cut] + "..." | |
| return t or "Research Paper" | |
# --------------- Card Formatting ---------------
# Maximum number of cards rendered per section (books / papers) in one view.
ITEMS_PER_PAGE = 25
| def _format_book_card(item: dict) -> str: | |
| """Format a literary work as a discovery card. | |
| Uses the Goodreads description (reference summary) as the primary blurb. | |
| AI-generated summaries are not shown for books because the model was | |
| trained primarily on academic text and produces low-quality literary | |
| summaries. | |
| """ | |
| title = item.get("title", "Untitled") | |
| topic = item.get("topic", "") | |
| emotion = item.get("emotion", "neutral") | |
| ref_summary = (item.get("reference_summary") or "").strip() | |
| # Build metadata line | |
| parts = ["Book"] | |
| if topic: | |
| parts.append(f"Topic: {topic}") | |
| if emotion != "neutral": | |
| parts.append(f"Tone: {emotion.title()}") | |
| meta_line = " | ".join(parts) | |
| card = f"### {title}\n\n" | |
| card += f"*{meta_line}*\n\n" | |
| # Show the Goodreads description as the primary blurb | |
| if ref_summary: | |
| card += f"> {ref_summary}\n\n" | |
| card += "---\n\n" | |
| return card | |
def _format_paper_card(item: dict) -> str:
    """Format a research paper as a discovery card.

    Uses the AI-generated summary as the primary blurb since it is usually
    a good condensation of the paper. The original abstract is shown in an
    expandable section.
    """
    # Assemble the metadata badge line: always "Paper", then optional labels.
    badges = ["Paper"]
    topic = item.get("topic", "")
    if topic:
        badges.append(f"Topic: {topic}")
    emotion = item.get("emotion", "neutral")
    if emotion != "neutral":
        badges.append(f"Tone: {emotion.title()}")

    # Fields may be present but None in the dataset, hence the `or ""` guard.
    generated = (item.get("generated_summary") or "").strip()
    abstract = (item.get("reference_summary") or "").strip()

    heading = _clean_paper_title(item.get("title", "Untitled"))
    pieces = [f"### {heading}\n\n", f"*{' | '.join(badges)}*\n\n"]

    # Prefer the model summary; fall back to the abstract when it is missing.
    blurb = generated or abstract
    if blurb:
        pieces.append(f"> {blurb}\n\n")
    # When both exist, tuck the original abstract behind a disclosure widget.
    if generated and abstract:
        pieces.append(
            f"<details>\n<summary>Original Abstract</summary>\n\n{abstract}\n\n</details>\n\n"
        )
    pieces.append("---\n\n")
    return "".join(pieces)
def _format_card(item: dict) -> str:
    """Route to the appropriate card formatter."""
    # Dispatch on source type; unknown types render as nothing.
    formatters = {
        "literary": _format_book_card,
        "academic": _format_paper_card,
    }
    formatter = formatters.get(item.get("source_type", ""))
    return formatter(item) if formatter else ""
| # --------------- Browse Functions --------------- | |
def browse_by_topic(topic: str, source_filter: str) -> str:
    """Browse items filtered by topic and source type."""
    # Topic filter ("All Topics" keeps everything).
    items = [
        rec
        for rec in ALL_ITEMS
        if topic == "All Topics" or rec.get("topic") == topic
    ]
    # Source-type filter.
    if source_filter == "Books Only":
        items = [rec for rec in items if rec.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        items = [rec for rec in items if rec.get("source_type") == "academic"]
    if not items:
        return "No items found for this selection."

    books = [rec for rec in items if rec.get("source_type") == "literary"]
    papers = [rec for rec in items if rec.get("source_type") == "academic"]

    header = f"Showing **{len(items)}** results"
    if topic != "All Topics":
        header += f" in **{topic}**"
    header += f" -- {len(books)} books, {len(papers)} papers\n\n---\n\n"

    # Section headings only appear in the mixed ("All") view.
    sections = [header]
    if source_filter != "Papers Only" and books:
        if source_filter == "All":
            sections.append(f"## Books ({len(books)})\n\n")
        sections.extend(_format_book_card(rec) for rec in books[:ITEMS_PER_PAGE])
    if source_filter != "Books Only" and papers:
        if source_filter == "All":
            sections.append(f"## Research Papers ({len(papers)})\n\n")
        sections.extend(_format_paper_card(rec) for rec in papers[:ITEMS_PER_PAGE])
    return "".join(sections)
def browse_by_emotion(emotion: str, source_filter: str) -> str:
    """Browse items filtered by tone and source type."""
    # "All Tones"/"All Emotions" means: anything with a non-neutral tone.
    show_all = emotion in ("All Emotions", "All Tones")
    if show_all:
        items = [rec for rec in ALL_ITEMS if rec.get("emotion") != "neutral"]
    else:
        # Dropdown labels are Title-cased; dataset labels are lowercase.
        wanted = emotion.lower()
        items = [rec for rec in ALL_ITEMS if rec.get("emotion") == wanted]
    if source_filter == "Books Only":
        items = [rec for rec in items if rec.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        items = [rec for rec in items if rec.get("source_type") == "academic"]
    if not items:
        return (
            "No items found for this selection.\n\n"
            "Try a different tone or select 'All Tones' to see "
            "all items with a detected tone."
        )

    books = [rec for rec in items if rec.get("source_type") == "literary"]
    papers = [rec for rec in items if rec.get("source_type") == "academic"]

    label = "any detected tone" if show_all else emotion
    # Section headings only appear in the mixed ("All") view.
    sections = [f"Showing **{len(items)}** results with **{label}**\n\n---\n\n"]
    if source_filter != "Papers Only" and books:
        if source_filter == "All":
            sections.append(f"## Books ({len(books)})\n\n")
        sections.extend(_format_book_card(rec) for rec in books[:ITEMS_PER_PAGE])
    if source_filter != "Books Only" and papers:
        if source_filter == "All":
            sections.append(f"## Research Papers ({len(papers)})\n\n")
        sections.extend(_format_paper_card(rec) for rec in papers[:ITEMS_PER_PAGE])
    return "".join(sections)
def search_items(query: str) -> str:
    """Search items by text content using word-boundary matching.

    Args:
        query: Raw search string; must be at least 2 characters.

    Returns:
        Markdown-formatted results grouped into books and papers, or a
        short status message when the query is too short or finds nothing.
    """
    if not query or len(query) < 2:
        return "Enter at least 2 characters to search."
    pattern = re.compile(r"\b" + re.escape(query) + r"\b", re.IGNORECASE)

    def _matches(item: dict[str, Any]) -> bool:
        # Fields may be present but None in the dataset (the card formatters
        # guard with `or ""` for the same reason); `item.get(field, "")`
        # would return None here and crash pattern.search(), so coerce.
        return any(
            pattern.search(item.get(field) or "")
            for field in ("text", "reference_summary", "generated_summary", "title")
        )

    matches = [item for item in ALL_ITEMS if _matches(item)]
    if not matches:
        return f'No results found for "{query}".'
    books = [i for i in matches if i.get("source_type") == "literary"]
    papers = [i for i in matches if i.get("source_type") == "academic"]
    result = f'Found **{len(matches)}** results for **"{query}"**\n\n---\n\n'
    if books:
        result += f"## Books ({len(books)})\n\n"
        for item in books[:ITEMS_PER_PAGE]:
            result += _format_book_card(item)
    if papers:
        result += f"## Research Papers ({len(papers)})\n\n"
        for item in papers[:ITEMS_PER_PAGE]:
            result += _format_paper_card(item)
    return result
# --------------- Gradio Interface ---------------
with gr.Blocks(
    title="LexiMind -- Discover Books & Papers",
    theme=gr.themes.Soft(),
    css="""
    * { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto,
        'Helvetica Neue', Arial, sans-serif !important; }
    .result-box { max-height: 800px; overflow-y: auto; }
    h3 { margin-top: 0.5em !important; margin-bottom: 0.2em !important; }
    blockquote {
        border-left: 3px solid #6366f1 !important;
        padding-left: 1em !important;
        color: #374151 !important;
    }
    """,
) as demo:
    # Header: counts come from the loaded dataset so the copy stays accurate.
    gr.Markdown(
        "# LexiMind\n"
        "### Discover Your Next Read\n\n"
        f"Browse **{len(BOOKS)} books** and **{len(PAPERS)} research papers** "
        "analyzed by a multi-task AI model. Each item has an AI-generated "
        "summary, a topic classification, and an emotion label.\n\n"
        "Use the tabs below to filter by topic or emotion, or search by keyword."
    )
    with gr.Tabs():
        # -- Browse by Topic --
        with gr.Tab("By Topic"):
            gr.Markdown("Select a topic to explore related books and papers.")
            with gr.Row():
                topic_dropdown = gr.Dropdown(
                    choices=["All Topics"] + TOPICS,
                    value="All Topics",
                    label="Topic",
                    interactive=True,
                    scale=2,
                )
                source_filter_topic = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )
            # Pre-render the default view so the tab is never empty on load.
            topic_results = gr.Markdown(
                value=browse_by_topic("All Topics", "All"),
                elem_classes=["result-box"],
            )
            # Either control re-runs the same query with both current values.
            topic_dropdown.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )
            source_filter_topic.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )
        # -- Browse by Tone --
        with gr.Tab("By Tone"):
            gr.Markdown(
                "Find books and papers by the dominant emotional tone detected by the model."
            )
            with gr.Row():
                emotion_dropdown = gr.Dropdown(
                    choices=["All Tones"] + [e.title() for e in EMOTIONS],
                    value="All Tones",
                    label="Tone",
                    interactive=True,
                    scale=2,
                )
                source_filter_emotion = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )
            emotion_results = gr.Markdown(
                value=browse_by_emotion("All Tones", "All"),
                elem_classes=["result-box"],
            )
            # Handlers are passed directly; the original wrapped them in
            # pass-through lambdas, which added an indirection for nothing.
            emotion_dropdown.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )
            source_filter_emotion.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )
        # -- Search --
        with gr.Tab("Search"):
            gr.Markdown("Search across all books and papers by keyword.")
            search_input = gr.Textbox(
                placeholder="e.g. quantum, Shakespeare, neural network, gravity...",
                label="Search",
                interactive=True,
            )
            search_results = gr.Markdown(
                value="Enter at least 2 characters to search.",
                elem_classes=["result-box"],
            )
            # Live search: re-query on every keystroke (search_items itself
            # rejects queries shorter than 2 characters).
            search_input.change(
                fn=search_items,
                inputs=[search_input],
                outputs=[search_results],
            )
        # -- Metrics --
        with gr.Tab("Metrics"):
            gr.Markdown("### Model Evaluation\n\nComputed on held-out validation data.")
            gr.Markdown("#### Summarization")
            if METRICS.get("summarization"):
                summ = METRICS["summarization"]
                # Key names differ across evaluation-script versions
                # (e.g. "rouge_rouge1" vs "rouge1"), so each lookup tries both.
                summ_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| ROUGE-1 | {rouge1:.4f} |\n"
                    "| ROUGE-2 | {rouge2:.4f} |\n"
                    "| ROUGE-L | {rougeL:.4f} |\n"
                    "| BLEU-4 | {bleu4:.4f} |\n"
                ).format(
                    rouge1=summ.get("rouge_rouge1", summ.get("rouge1", 0)),
                    rouge2=summ.get("rouge_rouge2", summ.get("rouge2", 0)),
                    rougeL=summ.get("rouge_rougeL", summ.get("rougeL", 0)),
                    bleu4=summ.get("bleu4", 0),
                )
                gr.Markdown(summ_md)
            else:
                gr.Markdown("Summarization metrics not available. Run the evaluation script.")
            gr.Markdown("#### Topic Classification")
            if METRICS.get("topic"):
                topic_m = METRICS["topic"]
                topic_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Accuracy | {accuracy:.2%} |\n"
                    "| Macro F1 | {f1:.4f} |\n"
                ).format(
                    accuracy=topic_m.get("accuracy", 0),
                    f1=topic_m.get("f1", topic_m.get("macro_f1", 0)),
                )
                gr.Markdown(topic_md)
            else:
                gr.Markdown("Topic classification metrics not available.")
            gr.Markdown("#### Emotion Detection")
            if METRICS.get("emotion"):
                emotion_m = METRICS["emotion"]
                emotion_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Sample-avg F1 | {sample_f1:.4f} |\n"
                    "| Macro F1 | {macro_f1:.4f} |\n"
                    "| Micro F1 | {micro_f1:.4f} |\n\n"
                    "28-label multi-label classification trained on GoEmotions."
                ).format(
                    sample_f1=emotion_m.get(
                        "sample_avg_f1", emotion_m.get("f1", emotion_m.get("multilabel_f1", 0))
                    ),
                    macro_f1=emotion_m.get("macro_f1", 0),
                    micro_f1=emotion_m.get("micro_f1", 0),
                )
                gr.Markdown(emotion_md)
            else:
                gr.Markdown("Emotion detection metrics not available.")
            gr.Markdown("#### Discovery Dataset")
            gr.Markdown(
                "| Content | Count |\n"
                "|---------|-------|\n"
                f"| Literary Works | {len(BOOKS)} |\n"
                f"| Research Papers | {len(PAPERS)} |\n"
                f"| **Total** | **{len(ALL_ITEMS)}** |\n"
                f"| Unique Topics | {len(TOPICS)} |\n"
                f"| Unique Tones | {len(EMOTIONS)} |"
            )
        # -- About --
        with gr.Tab("About"):
            gr.Markdown(
                "### About LexiMind\n\n"
                "LexiMind is a **272M parameter encoder-decoder transformer** "
                "(FLAN-T5-base) trained jointly on three tasks:\n\n"
                "| Task | What it does | Training data |\n"
                "|------|-------------|---------------|\n"
                "| **Summarization** | Generates abstracts for research papers | "
                "~49K pairs (arXiv + Project Gutenberg/Goodreads) |\n"
                "| **Topic Classification** | Assigns one of 7 topics | 3.4K samples |\n"
                "| **Emotion Detection** | Detects up to 28 emotions | "
                "43K GoEmotions samples |\n\n"
                "**How to read the results:**\n\n"
                "- **Research papers** show AI-generated summaries that condense the "
                "paper's content. These are generated by the model and are generally "
                "accurate.\n"
                "- **Books** show the Goodreads description as the primary text. "
                "The model was trained primarily on academic text (~45K academic vs ~4K literary), "
                "so book summaries are not shown.\n"
                "- **Tone labels** indicate the dominant emotional tone detected by the model. "
                "Since the emotion detector was trained on social media (GoEmotions), "
                "it captures general sentiment better than specific emotions for "
                "formal text.\n\n"
                "#### Architecture\n\n"
                "- Custom from-scratch Transformer (not HuggingFace wrappers)\n"
                "- Shared encoder with task-specific heads: decoder for summarization, "
                "attention pooling for emotion, mean pooling for topic\n"
                "- Trained in ~9 hours on a single RTX 4070 12GB\n\n"
                "[GitHub](https://github.com/OliverPerrin/LexiMind) | "
                "[Model](https://huggingface.co/OliverPerrin/LexiMind-Model) | "
                "[Dataset](https://huggingface.co/datasets/OliverPerrin/LexiMind-Discovery) | "
                "[Paper](https://github.com/OliverPerrin/LexiMind/blob/main/docs/research_paper.tex)"
                "\n\n*Oliver Perrin -- Appalachian State University -- 2025-2026*"
            )
if __name__ == "__main__":
    # Bind to all interfaces on the standard Gradio port so the app is
    # reachable when run inside a container / hosted Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)