"""
LexiMind -- Discover Books & Papers

Browse literary works and research papers analyzed by a multi-task transformer.
Find your next read by topic, emotion, or keyword -- with AI-generated summaries.

Author: Oliver Perrin
Date: 2026-01-14
"""

from __future__ import annotations

import json
import re
import warnings
from pathlib import Path
from typing import Any

warnings.filterwarnings("ignore", message=".*parameter in the Blocks constructor will be removed.*")

import gradio as gr

# --------------- Load Dataset ---------------

# Candidate dataset locations, tried in order: ``data/`` under the parent of
# this file's directory, then ``data/`` relative to the working directory.
_DATA_PATHS = [
    Path(__file__).parent.parent / "data" / "discovery_dataset.jsonl",
    Path("data") / "discovery_dataset.jsonl",
]


def _load_jsonl() -> list[dict[str, Any]]:
    """Load the discovery dataset from the first existing path in ``_DATA_PATHS``.

    The file is read as JSON Lines: every non-blank line is parsed as one
    JSON document.

    Returns:
        The parsed records, in file order.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for p in _DATA_PATHS:
        if p.exists():
            print(f"Loading discovery dataset from {p}...")
            # Decode explicitly as UTF-8: the built-in default is the locale
            # encoding, which would garble non-ASCII text on some platforms.
            with open(p, encoding="utf-8") as f:
                return [json.loads(line) for line in f if line.strip()]
    raise FileNotFoundError(
        f"Discovery dataset not found. Looked in: {[str(p) for p in _DATA_PATHS]}"
    )


# Read every record once at import time and report how many were found.
_raw_items = _load_jsonl()
print(f"Loaded {len(_raw_items)} items")

# Records whose source type is "social" are dropped; the rest of the module
# works from ALL_ITEMS.
ALL_ITEMS: list[dict[str, Any]] = [
    record for record in _raw_items if record.get("source_type") != "social"
]

# Label vocabularies as they occur in the data (the model's predictions).
# Falsy labels are skipped, and "neutral" is never offered as an emotion.
TOPICS: list[str] = sorted({str(record["topic"]) for record in ALL_ITEMS if record.get("topic")})
EMOTIONS: list[str] = sorted(
    {
        str(label)
        for label in (record.get("emotion") for record in ALL_ITEMS)
        if label and label != "neutral"
    }
)

# Partition by source type: "literary" records are books, "academic" are papers.
BOOKS: list[dict[str, Any]] = [
    record for record in ALL_ITEMS if record.get("source_type") == "literary"
]
PAPERS: list[dict[str, Any]] = [
    record for record in ALL_ITEMS if record.get("source_type") == "academic"
]

print(f"Topics ({len(TOPICS)}): {TOPICS}")
print(f"Emotions ({len(EMOTIONS)}): {EMOTIONS}")
print(f"Books: {len(BOOKS)}, Papers: {len(PAPERS)}")

# --------------- Load Evaluation Metrics ---------------

# Evaluation metrics are optional: METRICS stays empty when the report is
# absent, unreadable, or not a JSON object.
METRICS: dict[str, Any] = {}
_metrics_path = Path(__file__).parent.parent / "outputs" / "evaluation_report.json"
if _metrics_path.exists():
    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(_metrics_path, encoding="utf-8") as f:
            _loaded_metrics = json.load(f)
    except Exception as e:
        # Best effort, as before: any failure is reported and the module
        # continues with empty metrics.
        print(f"Warning: Could not load metrics: {e}")
    else:
        if isinstance(_loaded_metrics, dict):
            METRICS = _loaded_metrics
            print(f"Loaded evaluation metrics from {_metrics_path}")
        else:
            # Later code calls METRICS.get(...), which requires a mapping.
            print(
                "Warning: Could not load metrics: expected a JSON object, "
                f"got {type(_loaded_metrics).__name__}"
            )


# --------------- Helpers ---------------


def _clean_paper_title(raw_title: str) -> str:
    """Turn a raw arXiv paper 'title' into a short, readable heading.

    Paper 'titles' in this dataset are the first ~150 chars of the abstract,
    not real titles, so markup residue is stripped and the text is shortened.

    Args:
        raw_title: The raw title text. ``None`` or an empty value is accepted
            and treated as "no title".

    Returns:
        The cleaned heading, at most 90 characters plus a trailing ``"..."``
        when it had to be shortened, or ``"Research Paper"`` when nothing
        readable is left.
    """
    t = str(raw_title or "").strip()
    # Remove bracket markers like [ [ background ] ]
    t = re.sub(r"\[[\s\[]*[^\]]*[\]\s]*\]", "", t)
    # Remove runs of + symbols (with or without spaces between them)
    t = re.sub(r"(\+\s*){2,}", "", t)
    # Remove other LaTeX artifacts like ^s$ ]
    t = re.sub(r"\^[a-z0-9]*\$\s*\]?", "", t)
    # Collapse whitespace and strip leading/trailing punctuation
    t = re.sub(r"\s+", " ", t).strip()
    t = t.strip(":").strip()
    # Remove leading section headers (e.g. "background :", "introduction :")
    t = re.sub(
        r"^(background|introduction|abstract|motivation|overview)\s*:\s*",
        "",
        t,
        flags=re.IGNORECASE,
    )
    # Remove trailing periods, ASCII or Unicode ellipses, and the spaces
    # between them in a single pass.
    t = t.rstrip(". \u2026")
    # Capitalize first letter
    if t and t[0].islower():
        t = t[0].upper() + t[1:]
    # Truncate to a reasonable length, preferably at a word boundary. When no
    # boundary exists past column 40, hard-cut instead of leaving the heading
    # at its full length.
    if len(t) > 90:
        cut = t[:90].rfind(" ")
        t = (t[:cut] if cut > 40 else t[:90]) + "..."
    return t or "Research Paper"


# --------------- Card Formatting ---------------

# Maximum number of cards rendered per section (books or papers) of a result
# list. There is no paging: entries beyond this cap are simply not rendered.
ITEMS_PER_PAGE = 25


def _format_book_card(item: dict) -> str:
    """Format a literary work as a Markdown discovery card.

    Uses the Goodreads description (reference summary) as the primary blurb.
    AI-generated summaries are not shown for books because the model was
    trained primarily on academic text and produces low-quality literary
    summaries.

    Args:
        item: A dataset record. The keys ``title``, ``topic``, ``emotion`` and
            ``reference_summary`` are read; each may be missing, null or empty.

    Returns:
        The card: heading, italic metadata line, optional blockquote with the
        description, and a trailing horizontal rule.
    """
    # `or` fallbacks also cover keys that are present but null or empty.
    # Whitespace in the title is collapsed so the heading stays on one line.
    title = " ".join(str(item.get("title") or "").split()) or "Untitled"
    topic = item.get("topic") or ""
    emotion = str(item.get("emotion") or "neutral")
    ref_summary = str(item.get("reference_summary") or "").strip()

    # Build metadata line
    parts = ["Book"]
    if topic:
        parts.append(f"Topic: {topic}")
    if emotion != "neutral":
        parts.append(f"Tone: {emotion.title()}")
    meta_line = " | ".join(parts)

    sections = [f"### {title}\n\n", f"*{meta_line}*\n\n"]

    # Show the Goodreads description as the primary blurb. Every line gets its
    # own ">" marker; otherwise a blank line inside the description would end
    # the blockquote and later paragraphs would render outside of it.
    if ref_summary:
        quoted = "\n".join(f"> {line}".rstrip() for line in ref_summary.splitlines())
        sections.append(f"{quoted}\n\n")

    sections.append("---\n\n")
    return "".join(sections)


def _format_paper_card(item: dict) -> str:
    """Format a research paper as a Markdown discovery card.

    Uses the AI-generated summary as the primary blurb since it is usually
    a good condensation of the paper; the reference summary is the fallback.
    When both exist, the original abstract is additionally shown in an
    expandable section.

    Args:
        item: A dataset record. The keys ``title``, ``topic``, ``emotion``,
            ``generated_summary`` and ``reference_summary`` are read; each may
            be missing, null or empty.

    Returns:
        The card: cleaned heading, italic metadata line, optional blockquote,
        optional ``<details>`` section, and a trailing horizontal rule.
    """
    # `or` fallbacks also cover keys that are present but null or empty, so
    # the title cleaner always receives a string.
    title = str(item.get("title") or "Untitled")
    topic = item.get("topic") or ""
    emotion = str(item.get("emotion") or "neutral")

    gen_summary = str(item.get("generated_summary") or "").strip()
    ref_summary = str(item.get("reference_summary") or "").strip()

    display_title = _clean_paper_title(title)

    # Build metadata line
    parts = ["Paper"]
    if topic:
        parts.append(f"Topic: {topic}")
    if emotion != "neutral":
        parts.append(f"Tone: {emotion.title()}")
    meta_line = " | ".join(parts)

    sections = [f"### {display_title}\n\n", f"*{meta_line}*\n\n"]

    # Every line of the blurb gets its own ">" marker; otherwise a blank line
    # inside the text would end the blockquote and later paragraphs would
    # render outside of it.
    blurb = gen_summary or ref_summary
    if blurb:
        quoted = "\n".join(f"> {line}".rstrip() for line in blurb.splitlines())
        sections.append(f"{quoted}\n\n")

    # The abstract is only worth repeating when it is not already the blurb.
    if gen_summary and ref_summary:
        sections.append(
            f"<details>\n<summary>Original Abstract</summary>\n\n{ref_summary}\n\n</details>\n\n"
        )

    sections.append("---\n\n")
    return "".join(sections)


def _format_card(item: dict) -> str:
    """Render *item* with the card formatter that matches its source type.

    ``"literary"`` records go to the book formatter and ``"academic"``
    records to the paper formatter; any other (or missing) source type
    yields an empty string.
    """
    kind = item.get("source_type", "")
    if kind not in ("literary", "academic"):
        return ""
    render = _format_book_card if kind == "literary" else _format_paper_card
    return render(item)


# --------------- Browse Functions ---------------


def browse_by_topic(topic: str, source_filter: str) -> str:
    """Return Markdown listing the items for a topic and source selection.

    Args:
        topic: A topic label, or ``"All Topics"`` to skip topic filtering.
        source_filter: ``"Books Only"`` or ``"Papers Only"`` to restrict the
            source type; with ``"All"`` both sections get a heading.

    Returns:
        A summary line followed by up to ``ITEMS_PER_PAGE`` cards per section,
        or a short notice when nothing matches.
    """
    every_topic = topic == "All Topics"
    if every_topic:
        selected = list(ALL_ITEMS)
    else:
        selected = [entry for entry in ALL_ITEMS if entry.get("topic") == topic]

    # Narrow by source type when a single-source view was requested.
    if source_filter == "Books Only":
        selected = [entry for entry in selected if entry.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        selected = [entry for entry in selected if entry.get("source_type") == "academic"]

    if not selected:
        return "No items found for this selection."

    books = [entry for entry in selected if entry.get("source_type") == "literary"]
    papers = [entry for entry in selected if entry.get("source_type") == "academic"]

    # Collect the output as pieces and join once at the end.
    chunks = [f"Showing **{len(selected)}** results"]
    if not every_topic:
        chunks.append(f" in **{topic}**")
    chunks.append(f" -- {len(books)} books, {len(papers)} papers\n\n---\n\n")

    # Section headings are only emitted in the combined view.
    with_headings = source_filter == "All"

    if books and source_filter != "Papers Only":
        if with_headings:
            chunks.append(f"## Books ({len(books)})\n\n")
        chunks.extend(_format_book_card(book) for book in books[:ITEMS_PER_PAGE])

    if papers and source_filter != "Books Only":
        if with_headings:
            chunks.append(f"## Research Papers ({len(papers)})\n\n")
        chunks.extend(_format_paper_card(paper) for paper in papers[:ITEMS_PER_PAGE])

    return "".join(chunks)


def browse_by_emotion(emotion: str, source_filter: str) -> str:
    """Return Markdown listing the items for a tone and source selection.

    Args:
        emotion: A tone label as shown in the UI (matched case-insensitively),
            or ``"All Tones"`` / ``"All Emotions"`` for every item that has a
            non-neutral tone. An empty or ``None`` selection is treated like
            ``"All Tones"``.
        source_filter: ``"Books Only"`` or ``"Papers Only"`` to restrict the
            source type; with ``"All"`` both sections get a heading.

    Returns:
        A summary line followed by up to ``ITEMS_PER_PAGE`` cards per section,
        or a short notice when nothing matches.
    """
    any_tone = not emotion or emotion in ("All Emotions", "All Tones")
    if any_tone:
        # Items with a missing or empty emotion have no detected tone, so they
        # are excluded together with "neutral" -- the same rule that builds
        # the EMOTIONS list.
        items = [
            i for i in ALL_ITEMS if i.get("emotion") and i.get("emotion") != "neutral"
        ]
    else:
        # The UI shows title-cased labels, so compare case-insensitively on
        # both sides instead of assuming the stored labels are lowercase.
        wanted = emotion.lower()
        items = [i for i in ALL_ITEMS if str(i.get("emotion") or "").lower() == wanted]

    if source_filter == "Books Only":
        items = [i for i in items if i.get("source_type") == "literary"]
    elif source_filter == "Papers Only":
        items = [i for i in items if i.get("source_type") == "academic"]

    if not items:
        return (
            "No items found for this selection.\n\n"
            "Try a different tone or select 'All Tones' to see "
            "all items with a detected tone."
        )

    books = [i for i in items if i.get("source_type") == "literary"]
    papers = [i for i in items if i.get("source_type") == "academic"]

    header = "any detected tone" if any_tone else emotion
    chunks = [f"Showing **{len(items)}** results with **{header}**\n\n---\n\n"]

    if source_filter != "Papers Only" and books:
        if source_filter == "All":
            chunks.append(f"## Books ({len(books)})\n\n")
        chunks.extend(_format_book_card(item) for item in books[:ITEMS_PER_PAGE])

    if source_filter != "Books Only" and papers:
        if source_filter == "All":
            chunks.append(f"## Research Papers ({len(papers)})\n\n")
        chunks.extend(_format_paper_card(item) for item in papers[:ITEMS_PER_PAGE])

    return "".join(chunks)


def search_items(query: str) -> str:
    """Search all items for *query* as a whole word or phrase, ignoring case.

    The fields ``text``, ``reference_summary``, ``generated_summary`` and
    ``title`` of every item are searched.

    Args:
        query: The search text. Surrounding whitespace is ignored; fewer than
            two remaining characters (or ``None``) yields a hint instead of
            a search.

    Returns:
        Markdown with a summary line and up to ``ITEMS_PER_PAGE`` cards per
        section, or a short notice when nothing matches.
    """
    query = (query or "").strip()
    if len(query) < 2:
        return "Enter at least 2 characters to search."

    # Lookarounds instead of \b: \b only exists next to a word character, so a
    # query that starts or ends with punctuation (e.g. "C++") could never
    # match. For plain-word queries the two forms are equivalent.
    pattern = re.compile(r"(?<!\w)" + re.escape(query) + r"(?!\w)", re.IGNORECASE)
    fields = ("text", "reference_summary", "generated_summary", "title")
    matches = [
        item
        for item in ALL_ITEMS
        # `or ""` covers fields that are present but null in the data.
        if any(pattern.search(str(item.get(field) or "")) for field in fields)
    ]

    if not matches:
        return f'No results found for "{query}".'

    books = [i for i in matches if i.get("source_type") == "literary"]
    papers = [i for i in matches if i.get("source_type") == "academic"]

    chunks = [f'Found **{len(matches)}** results for **"{query}"**\n\n---\n\n']

    if books:
        chunks.append(f"## Books ({len(books)})\n\n")
        chunks.extend(_format_book_card(item) for item in books[:ITEMS_PER_PAGE])

    if papers:
        chunks.append(f"## Research Papers ({len(papers)})\n\n")
        chunks.extend(_format_paper_card(item) for item in papers[:ITEMS_PER_PAGE])

    return "".join(chunks)


# --------------- Gradio Interface ---------------

# Build the UI once at import time; `demo` is the application object that is
# launched at the bottom of the file.
with gr.Blocks(
    title="LexiMind -- Discover Books & Papers",
    theme=gr.themes.Soft(),
    css="""
    * { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto,
        'Helvetica Neue', Arial, sans-serif !important; }
    .result-box { max-height: 800px; overflow-y: auto; }
    h3 { margin-top: 0.5em !important; margin-bottom: 0.2em !important; }
    blockquote {
        border-left: 3px solid #6366f1 !important;
        padding-left: 1em !important;
        color: #374151 !important;
    }
    """,
) as demo:
    # Page header. The counts are interpolated with an f-string so the
    # substitution sits next to the text it belongs to.
    gr.Markdown(
        "# LexiMind\n"
        "### Discover Your Next Read\n\n"
        f"Browse **{len(BOOKS)} books** and **{len(PAPERS)} research papers** "
        "analyzed by a multi-task AI model. Each item has an AI-generated "
        "summary, a topic classification, and an emotion label.\n\n"
        "Use the tabs below to filter by topic or emotion, or search by keyword."
    )

    with gr.Tabs():
        # -- Browse by Topic --
        with gr.Tab("By Topic"):
            gr.Markdown("Select a topic to explore related books and papers.")
            with gr.Row():
                topic_dropdown = gr.Dropdown(
                    choices=["All Topics"] + TOPICS,
                    value="All Topics",
                    label="Topic",
                    interactive=True,
                    scale=2,
                )
                source_filter_topic = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )

            # Pre-rendered with the default selection so the tab is not empty
            # on first load.
            topic_results = gr.Markdown(
                value=browse_by_topic("All Topics", "All"),
                elem_classes=["result-box"],
            )

            # Either control re-runs the same query with both current values.
            topic_dropdown.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )
            source_filter_topic.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown, source_filter_topic],
                outputs=[topic_results],
            )

        # -- Browse by Tone --
        with gr.Tab("By Tone"):
            gr.Markdown(
                "Find books and papers by the dominant emotional tone detected by the model."
            )
            with gr.Row():
                # Labels are title-cased for display only.
                emotion_dropdown = gr.Dropdown(
                    choices=["All Tones"] + [e.title() for e in EMOTIONS],
                    value="All Tones",
                    label="Tone",
                    interactive=True,
                    scale=2,
                )
                source_filter_emotion = gr.Radio(
                    choices=["All", "Books Only", "Papers Only"],
                    value="All",
                    label="Show",
                    interactive=True,
                    scale=1,
                )

            emotion_results = gr.Markdown(
                value=browse_by_emotion("All Tones", "All"),
                elem_classes=["result-box"],
            )

            # The callback is passed directly, exactly like the topic tab; a
            # lambda that only forwards its arguments adds nothing.
            emotion_dropdown.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )
            source_filter_emotion.change(
                fn=browse_by_emotion,
                inputs=[emotion_dropdown, source_filter_emotion],
                outputs=[emotion_results],
            )

        # -- Search --
        with gr.Tab("Search"):
            gr.Markdown("Search across all books and papers by keyword.")

            search_input = gr.Textbox(
                placeholder="e.g. quantum, Shakespeare, neural network, gravity...",
                label="Search",
                interactive=True,
            )

            search_results = gr.Markdown(
                value="Enter at least 2 characters to search.",
                elem_classes=["result-box"],
            )

            # Results are refreshed on every change of the text box.
            search_input.change(
                fn=search_items,
                inputs=[search_input],
                outputs=[search_results],
            )

        # -- Metrics --
        # Each section renders a table when its key is present in METRICS and
        # a placeholder sentence otherwise. Alternative key names are tried
        # for several scores, with 0 as the last fallback.
        with gr.Tab("Metrics"):
            gr.Markdown("### Model Evaluation\n\nComputed on held-out validation data.")

            gr.Markdown("#### Summarization")

            if METRICS.get("summarization"):
                summ = METRICS["summarization"]
                summ_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| ROUGE-1 | {rouge1:.4f} |\n"
                    "| ROUGE-2 | {rouge2:.4f} |\n"
                    "| ROUGE-L | {rougeL:.4f} |\n"
                    "| BLEU-4 | {bleu4:.4f} |\n"
                ).format(
                    rouge1=summ.get("rouge_rouge1", summ.get("rouge1", 0)),
                    rouge2=summ.get("rouge_rouge2", summ.get("rouge2", 0)),
                    rougeL=summ.get("rouge_rougeL", summ.get("rougeL", 0)),
                    bleu4=summ.get("bleu4", 0),
                )
                gr.Markdown(summ_md)
            else:
                gr.Markdown("Summarization metrics not available. Run the evaluation script.")

            gr.Markdown("#### Topic Classification")

            if METRICS.get("topic"):
                topic_m = METRICS["topic"]
                topic_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Accuracy | {accuracy:.2%} |\n"
                    "| Macro F1 | {f1:.4f} |\n"
                ).format(
                    accuracy=topic_m.get("accuracy", 0),
                    f1=topic_m.get("f1", topic_m.get("macro_f1", 0)),
                )
                gr.Markdown(topic_md)
            else:
                gr.Markdown("Topic classification metrics not available.")

            gr.Markdown("#### Emotion Detection")

            if METRICS.get("emotion"):
                emotion_m = METRICS["emotion"]
                emotion_md = (
                    "| Metric | Score |\n"
                    "|--------|-------|\n"
                    "| Sample-avg F1 | {sample_f1:.4f} |\n"
                    "| Macro F1 | {macro_f1:.4f} |\n"
                    "| Micro F1 | {micro_f1:.4f} |\n\n"
                    "28-label multi-label classification trained on GoEmotions."
                ).format(
                    sample_f1=emotion_m.get(
                        "sample_avg_f1", emotion_m.get("f1", emotion_m.get("multilabel_f1", 0))
                    ),
                    macro_f1=emotion_m.get("macro_f1", 0),
                    micro_f1=emotion_m.get("micro_f1", 0),
                )
                gr.Markdown(emotion_md)
            else:
                gr.Markdown("Emotion detection metrics not available.")

            gr.Markdown("#### Discovery Dataset")

            # Counts come straight from the collections built at import time.
            gr.Markdown(
                "| Content | Count |\n"
                "|---------|-------|\n"
                f"| Literary Works | {len(BOOKS)} |\n"
                f"| Research Papers | {len(PAPERS)} |\n"
                f"| **Total** | **{len(ALL_ITEMS)}** |\n"
                f"| Unique Topics | {len(TOPICS)} |\n"
                f"| Unique Tones | {len(EMOTIONS)} |"
            )

        # -- About --
        # Static project description; nothing here is computed at runtime.
        with gr.Tab("About"):
            gr.Markdown(
                "### About LexiMind\n\n"
                "LexiMind is a **272M parameter encoder-decoder transformer** "
                "(FLAN-T5-base) trained jointly on three tasks:\n\n"
                "| Task | What it does | Training data |\n"
                "|------|-------------|---------------|\n"
                "| **Summarization** | Generates abstracts for research papers | "
                "~49K pairs (arXiv + Project Gutenberg/Goodreads) |\n"
                "| **Topic Classification** | Assigns one of 7 topics | 3.4K samples |\n"
                "| **Emotion Detection** | Detects up to 28 emotions | "
                "43K GoEmotions samples |\n\n"
                "**How to read the results:**\n\n"
                "- **Research papers** show AI-generated summaries that condense the "
                "paper's content. These are generated by the model and are generally "
                "accurate.\n"
                "- **Books** show the Goodreads description as the primary text. "
                "The model was trained primarily on academic text (~45K academic vs ~4K literary), "
                "so book summaries are not shown.\n"
                "- **Tone labels** indicate the dominant emotional tone detected by the model. "
                "Since the emotion detector was trained on social media (GoEmotions), "
                "it captures general sentiment better than specific emotions for "
                "formal text.\n\n"
                "#### Architecture\n\n"
                "- Custom from-scratch Transformer (not HuggingFace wrappers)\n"
                "- Shared encoder with task-specific heads: decoder for summarization, "
                "attention pooling for emotion, mean pooling for topic\n"
                "- Trained in ~9 hours on a single RTX 4070 12GB\n\n"
                "[GitHub](https://github.com/OliverPerrin/LexiMind) | "
                "[Model](https://huggingface.co/OliverPerrin/LexiMind-Model) | "
                "[Dataset](https://huggingface.co/datasets/OliverPerrin/LexiMind-Discovery) | "
                "[Paper](https://github.com/OliverPerrin/LexiMind/blob/main/docs/research_paper.tex)"
                "\n\n*Oliver Perrin -- Appalachian State University -- 2025-2026*"
            )


# Start the web server only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    # "0.0.0.0" listens on all network interfaces; the port is fixed at 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)