Spaces:
Sleeping
Sleeping
| """ | |
| LexiMind -- Discover Books & Papers | |
| Browse literary works and research papers analyzed by a multi-task transformer. | |
| Find your next read by topic, emotion, or keyword -- with AI-generated summaries. | |
| Author: Oliver Perrin | |
| Date: 2026-01-14 | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| import warnings | |
| from pathlib import Path | |
| from typing import Any | |
| warnings.filterwarnings("ignore", message=".*parameter in the Blocks constructor will be removed.*") | |
| import gradio as gr | |
| # --------------- Load Dataset --------------- | |
# Candidate locations for the discovery dataset, tried in order:
# first relative to this file's repo layout (app/../data), then the
# current working directory (useful when launched from the repo root).
_DATA_PATHS = [
    Path(__file__).parent.parent / "data" / "discovery_dataset.jsonl",
    Path("data") / "discovery_dataset.jsonl",
]
def _load_jsonl() -> list[dict[str, Any]]:
    """Load the discovery dataset from the first existing candidate path.

    Returns:
        One dict per non-blank line of the JSONL file.

    Raises:
        FileNotFoundError: If none of the candidate paths exist.
    """
    for p in _DATA_PATHS:
        if p.exists():
            print(f"Loading discovery dataset from {p}...")
            # Explicit encoding: JSONL is UTF-8 by convention; the previous
            # bare open() used the platform default, which breaks on Windows.
            with open(p, encoding="utf-8") as f:
                return [json.loads(line) for line in f if line.strip()]
    raise FileNotFoundError(
        f"Discovery dataset not found. Looked in: {[str(p) for p in _DATA_PATHS]}"
    )
_raw_items = _load_jsonl()
print(f"Loaded {len(_raw_items)} items")

# Social media posts are excluded from the discovery catalog.
ALL_ITEMS: list[dict[str, Any]] = [
    rec for rec in _raw_items if rec.get("source_type") != "social"
]

# Unique model-predicted labels, sorted for stable dropdown ordering.
# "neutral" is omitted from the tone list since it carries no signal.
TOPICS: list[str] = sorted({str(rec["topic"]) for rec in ALL_ITEMS if rec.get("topic")})
EMOTIONS: list[str] = sorted(
    {
        str(rec["emotion"])
        for rec in ALL_ITEMS
        if rec.get("emotion") and rec["emotion"] != "neutral"
    }
)

# Partition the catalog by source type.
BOOKS: list[dict[str, Any]] = [rec for rec in ALL_ITEMS if rec.get("source_type") == "literary"]
PAPERS: list[dict[str, Any]] = [rec for rec in ALL_ITEMS if rec.get("source_type") == "academic"]

print(f"Topics ({len(TOPICS)}): {TOPICS}")
print(f"Emotions ({len(EMOTIONS)}): {EMOTIONS}")
print(f"Books: {len(BOOKS)}, Papers: {len(PAPERS)}")
# --------------- Load Evaluation Metrics ---------------
# Metrics are optional: the app still works without the evaluation report,
# so failures here are reported but never fatal.
METRICS: dict[str, Any] = {}
_metrics_path = Path(__file__).parent.parent / "outputs" / "evaluation_report.json"
if _metrics_path.exists():
    try:
        with open(_metrics_path, encoding="utf-8") as f:
            METRICS = json.load(f)
        print(f"Loaded evaluation metrics from {_metrics_path}")
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from a blanket `except Exception`: only I/O and parse
        # errors are expected and safely ignorable here.
        print(f"Warning: Could not load metrics: {e}")
| # --------------- Helpers --------------- | |
| def _clean_paper_title(raw_title: str) -> str: | |
| """Clean up arXiv paper titles. | |
| Paper 'titles' in this dataset are the first ~150 chars of the abstract, | |
| not real titles. Clean them into a short, readable heading. | |
| """ | |
| t = raw_title.strip() | |
| # Remove bracket markers like [ [ background ] ] | |
| t = re.sub(r"\[[\s\[]*[^\]]*[\]\s]*\]", "", t) | |
| # Remove runs of + symbols (with or without spaces between them) | |
| t = re.sub(r"(\+\s*){2,}", "", t) | |
| # Remove other LaTeX artifacts like ^s$ ] | |
| t = re.sub(r"\^[a-z0-9]*\$\s*\]?", "", t) | |
| # Collapse whitespace and strip leading/trailing punctuation | |
| t = re.sub(r"\s+", " ", t).strip() | |
| t = t.strip(":").strip() | |
| # Remove leading section headers (e.g. "background :", "introduction :") | |
| t = re.sub( | |
| r"^(background|introduction|abstract|motivation|overview)\s*:\s*", | |
| "", | |
| t, | |
| flags=re.IGNORECASE, | |
| ) | |
| # Remove trailing ellipsis or period | |
| t = t.rstrip(".").rstrip() | |
| if t.endswith("..."): | |
| t = t[:-3].rstrip() | |
| # Capitalize first letter | |
| if t and t[0].islower(): | |
| t = t[0].upper() + t[1:] | |
| # Truncate to a reasonable length at a word boundary | |
| if len(t) > 90: | |
| cut = t[:90].rfind(" ") | |
| if cut > 40: | |
| t = t[:cut] + "..." | |
| return t or "Research Paper" | |
# --------------- Card Formatting ---------------
# Maximum number of cards rendered per section (books / papers) in one view.
ITEMS_PER_PAGE = 25
| def _format_book_card(item: dict) -> str: | |
| """Format a literary work as a discovery card. | |
| Uses the Goodreads description (reference summary) as the primary blurb. | |
| AI-generated summaries are not shown for books because the model was | |
| trained primarily on academic text and produces low-quality literary | |
| summaries. | |
| """ | |
| title = item.get("title", "Untitled") | |
| topic = item.get("topic", "") | |
| emotion = item.get("emotion", "neutral") | |
| ref_summary = (item.get("reference_summary") or "").strip() | |
| # Build metadata line | |
| parts = ["Book"] | |
| if topic: | |
| parts.append(f"Topic: {topic}") | |
| if emotion != "neutral": | |
| parts.append(f"Tone: {emotion.title()}") | |
| meta_line = " | ".join(parts) | |
| card = f"### {title}\n\n" | |
| card += f"*{meta_line}*\n\n" | |
| # Show the Goodreads description as the primary blurb | |
| if ref_summary: | |
| card += f"> {ref_summary}\n\n" | |
| card += "---\n\n" | |
| return card | |
def _format_paper_card(item: dict) -> str:
    """Format a research paper as a discovery card.

    Uses the AI-generated summary as the primary blurb since it is usually
    a good condensation of the paper. The original abstract is shown in an
    expandable section.
    """
    # Assemble the metadata badge line: always "Paper", then optional labels.
    badges = ["Paper"]
    topic = item.get("topic", "")
    if topic:
        badges.append(f"Topic: {topic}")
    emotion = item.get("emotion", "neutral")
    if emotion != "neutral":
        badges.append(f"Tone: {emotion.title()}")

    # Fields may be present but None in the dataset, hence the `or ""` guard.
    generated = (item.get("generated_summary") or "").strip()
    abstract = (item.get("reference_summary") or "").strip()

    heading = _clean_paper_title(item.get("title", "Untitled"))
    pieces = [f"### {heading}\n\n", f"*{' | '.join(badges)}*\n\n"]

    # Prefer the model summary; fall back to the abstract when it is missing.
    blurb = generated or abstract
    if blurb:
        pieces.append(f"> {blurb}\n\n")
    # When both exist, tuck the original abstract behind a disclosure widget.
    if generated and abstract:
        pieces.append(
            f"<details>\n<summary>Original Abstract</summary>\n\n{abstract}\n\n</details>\n\n"
        )
    pieces.append("---\n\n")
    return "".join(pieces)
def _format_card(item: dict) -> str:
    """Route to the appropriate card formatter."""
    # Dispatch on source type; unknown types render as nothing.
    formatters = {
        "literary": _format_book_card,
        "academic": _format_paper_card,
    }
    formatter = formatters.get(item.get("source_type", ""))
    return formatter(item) if formatter else ""
| # --------------- Browse Functions --------------- | |
def browse_by_topic(topic: str, source_filter: str) -> str:
    """Browse items filtered by topic and source type."""
    # Topic filter ("All Topics" keeps everything).
    items = [
        rec
        for rec in ALL_ITEMS
        if topic == "All Topics" or rec.get("topic") == topic
    ]
    # Source-type filter.
    if source_filter == "Books Only":
        items = [rec for rec in items if rec.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        items = [rec for rec in items if rec.get("source_type") == "academic"]
    if not items:
        return "No items found for this selection."

    books = [rec for rec in items if rec.get("source_type") == "literary"]
    papers = [rec for rec in items if rec.get("source_type") == "academic"]

    header = f"Showing **{len(items)}** results"
    if topic != "All Topics":
        header += f" in **{topic}**"
    header += f" -- {len(books)} books, {len(papers)} papers\n\n---\n\n"

    # Section headings only appear in the mixed ("All") view.
    sections = [header]
    if source_filter != "Papers Only" and books:
        if source_filter == "All":
            sections.append(f"## Books ({len(books)})\n\n")
        sections.extend(_format_book_card(rec) for rec in books[:ITEMS_PER_PAGE])
    if source_filter != "Books Only" and papers:
        if source_filter == "All":
            sections.append(f"## Research Papers ({len(papers)})\n\n")
        sections.extend(_format_paper_card(rec) for rec in papers[:ITEMS_PER_PAGE])
    return "".join(sections)
def browse_by_emotion(emotion: str, source_filter: str) -> str:
    """Browse items filtered by tone and source type."""
    # "All Tones"/"All Emotions" means: anything with a non-neutral tone.
    show_all = emotion in ("All Emotions", "All Tones")
    if show_all:
        items = [rec for rec in ALL_ITEMS if rec.get("emotion") != "neutral"]
    else:
        # Dropdown labels are Title-cased; dataset labels are lowercase.
        wanted = emotion.lower()
        items = [rec for rec in ALL_ITEMS if rec.get("emotion") == wanted]
    if source_filter == "Books Only":
        items = [rec for rec in items if rec.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        items = [rec for rec in items if rec.get("source_type") == "academic"]
    if not items:
        return (
            "No items found for this selection.\n\n"
            "Try a different tone or select 'All Tones' to see "
            "all items with a detected tone."
        )

    books = [rec for rec in items if rec.get("source_type") == "literary"]
    papers = [rec for rec in items if rec.get("source_type") == "academic"]

    label = "any detected tone" if show_all else emotion
    # Section headings only appear in the mixed ("All") view.
    sections = [f"Showing **{len(items)}** results with **{label}**\n\n---\n\n"]
    if source_filter != "Papers Only" and books:
        if source_filter == "All":
            sections.append(f"## Books ({len(books)})\n\n")
        sections.extend(_format_book_card(rec) for rec in books[:ITEMS_PER_PAGE])
    if source_filter != "Books Only" and papers:
        if source_filter == "All":
            sections.append(f"## Research Papers ({len(papers)})\n\n")
        sections.extend(_format_paper_card(rec) for rec in papers[:ITEMS_PER_PAGE])
    return "".join(sections)
def search_items(query: str) -> str:
    """Search items by text content using word-boundary matching.

    Args:
        query: Raw search string; must be at least 2 characters.

    Returns:
        Markdown-formatted results grouped into books and papers, or a
        short status message when the query is too short or finds nothing.
    """
    if not query or len(query) < 2:
        return "Enter at least 2 characters to search."
    pattern = re.compile(r"\b" + re.escape(query) + r"\b", re.IGNORECASE)

    def _matches(item: dict[str, Any]) -> bool:
        # Fields may be present but None in the dataset (the card formatters
        # guard with `or ""` for the same reason); `item.get(field, "")`
        # would return None here and crash pattern.search(), so coerce.
        return any(
            pattern.search(item.get(field) or "")
            for field in ("text", "reference_summary", "generated_summary", "title")
        )

    matches = [item for item in ALL_ITEMS if _matches(item)]
    if not matches:
        return f'No results found for "{query}".'
    books = [i for i in matches if i.get("source_type") == "literary"]
    papers = [i for i in matches if i.get("source_type") == "academic"]
    result = f'Found **{len(matches)}** results for **"{query}"**\n\n---\n\n'
    if books:
        result += f"## Books ({len(books)})\n\n"
        for item in books[:ITEMS_PER_PAGE]:
            result += _format_book_card(item)
    if papers:
        result += f"## Research Papers ({len(papers)})\n\n"
        for item in papers[:ITEMS_PER_PAGE]:
            result += _format_paper_card(item)
    return result
# --------------- Gradio Interface ---------------
with gr.Blocks(
    title="LexiMind -- Discover Books & Papers",
    theme=gr.themes.Soft(),
    css="""
    * { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto,
        'Helvetica Neue', Arial, sans-serif !important; }
    .result-box { max-height: 800px; overflow-y: auto; }
    h3 { margin-top: 0.5em !important; margin-bottom: 0.2em !important; }
    blockquote {
        border-left: 3px solid #6366f1 !important;
        padding-left: 1em !important;
        color: #374151 !important;
    }
    """,
) as demo:
    # Header: counts come from the loaded dataset so the copy stays accurate.
    gr.Markdown(
        "# LexiMind\n"
        "### Discover Your Next Read\n\n"
        f"Browse **{len(BOOKS)} books** and **{len(PAPERS)} research papers** "
        "analyzed by a multi-task AI model. Each item has an AI-generated "
        "summary, a topic classification, and an emotion label.\n\n"
        "Use the tabs below to filter by topic or emotion, or search by keyword."
    )
    with gr.Tabs():
        # -- Browse by Topic --
        with gr.Tab("By Topic"):
            gr.Markdown("Select a topic to explore related books and papers.")
            with gr.Row():
                topic_dropdown = gr.Dropdown(
                    choices=["All Topics"] + TOPICS,
                    value="All Topics",
                    label="Topic",
                    interactive=True,
                    scale=2,
                )
                source_filter_topic = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )
            # Pre-render the default view so the tab is never empty on load.
            topic_results = gr.Markdown(
                value=browse_by_topic("All Topics", "All"),
                elem_classes=["result-box"],
            )
            # Either control re-runs the same query with both current values.
            topic_dropdown.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )
            source_filter_topic.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )
        # -- Browse by Tone --
        with gr.Tab("By Tone"):
            gr.Markdown(
                "Find books and papers by the dominant emotional tone detected by the model."
            )
            with gr.Row():
                emotion_dropdown = gr.Dropdown(
                    choices=["All Tones"] + [e.title() for e in EMOTIONS],
                    value="All Tones",
                    label="Tone",
                    interactive=True,
                    scale=2,
                )
                source_filter_emotion = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )
            emotion_results = gr.Markdown(
                value=browse_by_emotion("All Tones", "All"),
                elem_classes=["result-box"],
            )
            # Handlers are passed directly; the original wrapped them in
            # pass-through lambdas, which added an indirection for nothing.
            emotion_dropdown.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )
            source_filter_emotion.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )
        # -- Search --
        with gr.Tab("Search"):
            gr.Markdown("Search across all books and papers by keyword.")
            search_input = gr.Textbox(
                placeholder="e.g. quantum, Shakespeare, neural network, gravity...",
                label="Search",
                interactive=True,
            )
            search_results = gr.Markdown(
                value="Enter at least 2 characters to search.",
                elem_classes=["result-box"],
            )
            # Live search: re-query on every keystroke (search_items itself
            # rejects queries shorter than 2 characters).
            search_input.change(
                fn=search_items,
                inputs=[search_input],
                outputs=[search_results],
            )
        # -- Metrics --
        with gr.Tab("Metrics"):
            gr.Markdown("### Model Evaluation\n\nComputed on held-out validation data.")
            gr.Markdown("#### Summarization")
            if METRICS.get("summarization"):
                summ = METRICS["summarization"]
                # Key names differ across evaluation-script versions
                # (e.g. "rouge_rouge1" vs "rouge1"), so each lookup tries both.
                summ_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| ROUGE-1 | {rouge1:.4f} |\n"
                    "| ROUGE-2 | {rouge2:.4f} |\n"
                    "| ROUGE-L | {rougeL:.4f} |\n"
                    "| BLEU-4 | {bleu4:.4f} |\n"
                ).format(
                    rouge1=summ.get("rouge_rouge1", summ.get("rouge1", 0)),
                    rouge2=summ.get("rouge_rouge2", summ.get("rouge2", 0)),
                    rougeL=summ.get("rouge_rougeL", summ.get("rougeL", 0)),
                    bleu4=summ.get("bleu4", 0),
                )
                gr.Markdown(summ_md)
            else:
                gr.Markdown("Summarization metrics not available. Run the evaluation script.")
            gr.Markdown("#### Topic Classification")
            if METRICS.get("topic"):
                topic_m = METRICS["topic"]
                topic_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Accuracy | {accuracy:.2%} |\n"
                    "| Macro F1 | {f1:.4f} |\n"
                ).format(
                    accuracy=topic_m.get("accuracy", 0),
                    f1=topic_m.get("f1", topic_m.get("macro_f1", 0)),
                )
                gr.Markdown(topic_md)
            else:
                gr.Markdown("Topic classification metrics not available.")
            gr.Markdown("#### Emotion Detection")
            if METRICS.get("emotion"):
                emotion_m = METRICS["emotion"]
                emotion_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Sample-avg F1 | {sample_f1:.4f} |\n"
                    "| Macro F1 | {macro_f1:.4f} |\n"
                    "| Micro F1 | {micro_f1:.4f} |\n\n"
                    "28-label multi-label classification trained on GoEmotions."
                ).format(
                    sample_f1=emotion_m.get(
                        "sample_avg_f1", emotion_m.get("f1", emotion_m.get("multilabel_f1", 0))
                    ),
                    macro_f1=emotion_m.get("macro_f1", 0),
                    micro_f1=emotion_m.get("micro_f1", 0),
                )
                gr.Markdown(emotion_md)
            else:
                gr.Markdown("Emotion detection metrics not available.")
            gr.Markdown("#### Discovery Dataset")
            gr.Markdown(
                "| Content | Count |\n"
                "|---------|-------|\n"
                f"| Literary Works | {len(BOOKS)} |\n"
                f"| Research Papers | {len(PAPERS)} |\n"
                f"| **Total** | **{len(ALL_ITEMS)}** |\n"
                f"| Unique Topics | {len(TOPICS)} |\n"
                f"| Unique Tones | {len(EMOTIONS)} |"
            )
        # -- About --
        with gr.Tab("About"):
            gr.Markdown(
                "### About LexiMind\n\n"
                "LexiMind is a **272M parameter encoder-decoder transformer** "
                "(FLAN-T5-base) trained jointly on three tasks:\n\n"
                "| Task | What it does | Training data |\n"
                "|------|-------------|---------------|\n"
                "| **Summarization** | Generates abstracts for research papers | "
                "~49K pairs (arXiv + Project Gutenberg/Goodreads) |\n"
                "| **Topic Classification** | Assigns one of 7 topics | 3.4K samples |\n"
                "| **Emotion Detection** | Detects up to 28 emotions | "
                "43K GoEmotions samples |\n\n"
                "**How to read the results:**\n\n"
                "- **Research papers** show AI-generated summaries that condense the "
                "paper's content. These are generated by the model and are generally "
                "accurate.\n"
                "- **Books** show the Goodreads description as the primary text. "
                "The model was trained primarily on academic text (~45K academic vs ~4K literary), "
                "so book summaries are not shown.\n"
                "- **Tone labels** indicate the dominant emotional tone detected by the model. "
                "Since the emotion detector was trained on social media (GoEmotions), "
                "it captures general sentiment better than specific emotions for "
                "formal text.\n\n"
                "#### Architecture\n\n"
                "- Custom from-scratch Transformer (not HuggingFace wrappers)\n"
                "- Shared encoder with task-specific heads: decoder for summarization, "
                "attention pooling for emotion, mean pooling for topic\n"
                "- Trained in ~9 hours on a single RTX 4070 12GB\n\n"
                "[GitHub](https://github.com/OliverPerrin/LexiMind) | "
                "[Model](https://huggingface.co/OliverPerrin/LexiMind-Model) | "
                "[Dataset](https://huggingface.co/datasets/OliverPerrin/LexiMind-Discovery) | "
                "[Paper](https://github.com/OliverPerrin/LexiMind/blob/main/docs/research_paper.tex)"
                "\n\n*Oliver Perrin -- Appalachian State University -- 2025-2026*"
            )
if __name__ == "__main__":
    # Bind to all interfaces on the standard Gradio port so the app is
    # reachable when run inside a container / hosted Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)