Spaces:

nuojohnchen
/

Kahneman4Review

Sleeping

App Files Files Community

nuocuhz Claude Sonnet 4.6 committed on Mar 13

Commit

0ab9a26

1 Parent(s): f90a391

Add Analytics tab: interactive Plotly figures + findings for ICLR/ICML/NeurIPS 2025

Browse files

Files changed (6) hide show

analytics.py +289 -0
app.py +14 -0
iclr2025_v2_results.jsonl +0 -0
icml2025_v3_results.jsonl +0 -0
neurips2025_v3_results.jsonl +0 -0
requirements.txt +1 -0

analytics.py ADDED Viewed

	@@ -0,0 +1,289 @@

+"""analytics.py — Load sample results and build Plotly figures for the Analytics tab."""
+import json
+import os
+from collections import Counter, defaultdict
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# ── Data loading ──────────────────────────────────────────────────────────────
+_DIR = os.path.dirname(__file__)  # directory of this module; result files are resolved against it
+DATASETS = {  # conference display name -> JSONL results file name
+    "ICLR 2025": "iclr2025_v2_results.jsonl",
+    "ICML 2025": "icml2025_v3_results.jsonl",
+    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
+}
+LABEL_COLORS = {  # review label -> hex color used for that label's bars and markers
+    "System 1":       "#ef4444",
+    "Mixed":          "#f59e0b",
+    "System 2":       "#22c55e",
+    "Non-evaluative": "#94a3b8",
+}
+CONF_COLORS = {  # conference display name -> hex color used for that conference's traces
+    "ICLR 2025":    "#6366f1",
+    "ICML 2025":    "#f59e0b",
+    "NeurIPS 2025": "#22c55e",
+}
+def _load_results(fname: str) -> list:
+    path = os.path.join(_DIR, fname)
+    if not os.path.exists(path):
+        return []
+    out = []
+    for line in open(path):
+        line = line.strip()
+        if line:
+            try:
+                out.append(json.loads(line))
+            except Exception:
+                pass
+    return out
+def load_all() -> dict:
+    """Returns {conf: {"papers": [...], "reviews": [...], "metas": [...]}}"""
+    data = {}
+    for conf, fname in DATASETS.items():
+        papers = _load_results(fname)
+        reviews = []
+        for p in papers:
+            for r in p.get("review_ratings", []):
+                if r.get("label"):
+                    reviews.append({**r, "_decision": p.get("decision", ""), "_conf": conf})
+        metas = []
+        for p in papers:
+            m = p.get("metareview_rating")
+            if m and m.get("label"):
+                metas.append({**m, "_decision": p.get("decision", ""), "_conf": conf})
+        data[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
+    return data
+# ── Figure builders ───────────────────────────────────────────────────────────
+def fig_label_distribution(data: dict) -> go.Figure:
+    """Grouped bar: label distribution per conference."""
+    labels_order = ["System 1", "Mixed", "System 2", "Non-evaluative"]
+    confs = list(data.keys())
+    fig = go.Figure()
+    for lbl in labels_order:
+        y_vals = []
+        for conf in confs:
+            reviews = data[conf]["reviews"]
+            if not reviews:
+                y_vals.append(0)
+                continue
+            cnt = sum(1 for r in reviews if r["label"] == lbl)
+            y_vals.append(round(cnt / len(reviews) * 100, 1))
+        fig.add_trace(go.Bar(
+            name=lbl,
+            x=confs,
+            y=y_vals,
+            marker_color=LABEL_COLORS.get(lbl, "#888"),
+            text=[f"{v}%" for v in y_vals],
+            textposition="outside",
+        ))
+    fig.update_layout(
+        title="Review Label Distribution by Conference",
+        barmode="group",
+        yaxis=dict(title="% of reviews", range=[0, 75]),
+        legend=dict(orientation="h", y=-0.2),
+        height=420,
+        margin=dict(t=50, b=80),
+    )
+    return fig
+def fig_rqs_by_decision(data: dict) -> go.Figure:
+    """Grouped bar: mean RQS per decision tier per conference."""
+    decision_map = {
+        "Accept (Oral)":             "Oral",
+        "Accept (oral)":             "Oral",
+        "Accept (Spotlight)":        "Spotlight",
+        "Accept (spotlight)":        "Spotlight",
+        "Accept (spotlight poster)": "Spotlight",
+        "Accept (Poster)":           "Poster",
+        "Accept (poster)":           "Poster",
+    }
+    tiers = ["Oral", "Spotlight", "Poster"]
+    confs = list(data.keys())
+    fig = go.Figure()
+    for conf in confs:
+        by_tier = defaultdict(list)
+        for r in data[conf]["reviews"]:
+            tier = decision_map.get(r["_decision"])
+            rqs = r.get("overall_reasoning_quality_score")
+            if tier and rqs:
+                by_tier[tier].append(float(rqs))
+        y_vals = [round(sum(by_tier[t]) / len(by_tier[t]), 2) if by_tier[t] else None for t in tiers]
+        counts = [len(by_tier[t]) for t in tiers]
+        fig.add_trace(go.Bar(
+            name=conf,
+            x=tiers,
+            y=y_vals,
+            marker_color=CONF_COLORS[conf],
+            text=[f"{v:.2f}<br>(n={c})" if v else "" for v, c in zip(y_vals, counts)],
+            textposition="outside",
+        ))
+    fig.update_layout(
+        title="Mean Reasoning Quality Score by Decision Tier",
+        barmode="group",
+        yaxis=dict(title="RQS (1–5)", range=[0, 4]),
+        legend=dict(orientation="h", y=-0.2),
+        height=420,
+        margin=dict(t=50, b=80),
+    )
+    return fig
+def fig_s1_s2_scatter(data: dict) -> go.Figure:
+    """Scatter: S1 score vs S2 score, colored by label, one trace per conf."""
+    fig = go.Figure()
+    for conf in data:
+        reviews = data[conf]["reviews"]
+        for lbl in ["System 1", "Mixed", "System 2", "Non-evaluative"]:
+            subset = [r for r in reviews if r.get("label") == lbl
+                      and r.get("system1_score") and r.get("system2_score")]
+            if not subset:
+                continue
+            fig.add_trace(go.Scatter(
+                x=[r["system1_score"] for r in subset],
+                y=[r["system2_score"] for r in subset],
+                mode="markers",
+                name=f"{conf} — {lbl}",
+                marker=dict(color=LABEL_COLORS.get(lbl, "#888"), size=5, opacity=0.6),
+                legendgroup=lbl,
+                showlegend=True,
+            ))
+    # diagonal reference line
+    fig.add_shape(type="line", x0=1, y0=1, x1=5, y1=5,
+                  line=dict(color="gray", dash="dash", width=1))
+    fig.update_layout(
+        title="System 1 vs System 2 Score (all reviews)",
+        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
+        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
+        height=480,
+        margin=dict(t=50, b=40),
+    )
+    return fig
+def fig_bias_heatmap(data: dict) -> go.Figure:
+    """Heatmap: bias frequency (% of reviews) per conference."""
+    bias_order = [
+        "Checklist Inflation",
+        "Representativeness Heuristic",
+        "Question Substitution",
+        "Conclusion-First Justification",
+        "Overconfidence",
+        "Narrative Fallacy",
+        "Authority Substitution",
+        "Confirmation Bias",
+    ]
+    confs = list(data.keys())
+    z = []
+    text = []
+    for conf in confs:
+        reviews = data[conf]["reviews"]
+        n = len(reviews) or 1
+        row = []
+        trow = []
+        for b in bias_order:
+            cnt = sum(1 for r in reviews if b in r.get("bias_diagnostics", []))
+            pct = round(cnt / n * 100, 1)
+            row.append(pct)
+            trow.append(f"{pct}%<br>({cnt})")
+        z.append(row)
+        text.append(trow)
+    fig = go.Figure(go.Heatmap(
+        z=z,
+        x=bias_order,
+        y=confs,
+        text=text,
+        texttemplate="%{text}",
+        colorscale="YlOrRd",
+        showscale=True,
+        colorbar=dict(title="% reviews"),
+    ))
+    fig.update_layout(
+        title="Bias Diagnostics Frequency (% of reviews per conference)",
+        xaxis=dict(tickangle=-30),
+        height=320,
+        margin=dict(t=50, b=120),
+    )
+    return fig
+def fig_rqs_distribution(data: dict) -> go.Figure:
+    """Violin: RQS distribution per conference."""
+    fig = go.Figure()
+    for conf in data:
+        rqs_vals = [float(r["overall_reasoning_quality_score"])
+                    for r in data[conf]["reviews"]
+                    if r.get("overall_reasoning_quality_score")]
+        fig.add_trace(go.Violin(
+            y=rqs_vals,
+            name=conf,
+            box_visible=True,
+            meanline_visible=True,
+            fillcolor=CONF_COLORS[conf],
+            opacity=0.7,
+            line_color="white",
+        ))
+    fig.update_layout(
+        title="RQS Distribution by Conference",
+        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
+        height=400,
+        margin=dict(t=50, b=40),
+    )
+    return fig
+# ── Summary text ──────────────────────────────────────────────────────────────
+def build_summary(data: dict) -> str:
+    lines = []
+    for conf in data:
+        reviews = data[conf]["reviews"]
+        if not reviews:
+            continue
+        n = len(reviews)
+        lc = Counter(r["label"] for r in reviews)
+        rqs = [float(r["overall_reasoning_quality_score"]) for r in reviews if r.get("overall_reasoning_quality_score")]
+        mean_rqs = sum(rqs) / len(rqs) if rqs else 0
+        lines.append(f"**{conf}** — {n} reviews · RQS mean {mean_rqs:.2f} · "
+                     f"Mixed {lc.get('Mixed',0)/n*100:.0f}% · "
+                     f"S1 {lc.get('System 1',0)/n*100:.0f}% · "
+                     f"S2 {lc.get('System 2',0)/n*100:.0f}%")
+    return "\n\n".join(lines)
+FINDINGS = """
+### Key Findings (100 papers × 3 conferences, ~1,150 reviews)
+1. **Mixed reasoning dominates across all venues (49–57%).** Pure System 1 or System 2 reviews are the minority — most reviewers blend intuitive and analytical modes rather than operating at either extreme.
+2. **ICLR reviewers show more System 1 tendency (35%) than ICML/NeurIPS (~21%).** This may reflect ICLR's open-ended review format, which imposes less structural scaffolding than ICML's field-by-field template — less structure → more impression-driven writing.
+3. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.
+4. **Reasoning quality (RQS) is nearly identical across venues (2.80–2.94 / 5).** Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform — suggesting a field-wide ceiling rather than venue-specific culture.
+5. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.
+6. **Checklist Inflation is the dominant bias in all three venues** (50–58% of reviews). Reviewers frequently enumerate specific concerns without analytical linkage, prioritization, or core-claim relevance — mistaking list length for reasoning depth.
+7. **Representativeness Heuristic is more prevalent at NeurIPS (27%) than ICLR/ICML (~17–21%).** NeurIPS reviewers more often judge papers by surface similarity to known strong work rather than explicit criteria.
+"""  # static markdown: the figures quoted above are hard-coded here, not recomputed from load_all()

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ from rater import (
     rate_review, format_result_markdown,
     rate_metareview, format_metareview_result_markdown,
 )
 _paper_cache: dict = {}
 _last_result: dict = {}
@@ -461,6 +463,18 @@ This perspective reframes peer review as a **reasoning process** rather than mer
                     manual_meta_btn = gr.Button("AI Rate This Meta-Review", variant="primary")
                     manual_meta_result = gr.Markdown("")
     # ── Wire events ────────────────────────────────────────────────────────────
     provider_dd.change(update_provider, [provider_dd], [model_dd, api_key_box])

     rate_review, format_result_markdown,
     rate_metareview, format_metareview_result_markdown,
 )
+from analytics import load_all, fig_label_distribution, fig_rqs_by_decision, \
+    fig_s1_s2_scatter, fig_bias_heatmap, fig_rqs_distribution, FINDINGS
 _paper_cache: dict = {}
 _last_result: dict = {}
                     manual_meta_btn = gr.Button("AI Rate This Meta-Review", variant="primary")
                     manual_meta_result = gr.Markdown("")
+        with gr.Tab("📊 Analytics"):
+            gr.Markdown(FINDINGS)
+            gr.Markdown("---")
+            _adata = load_all()
+            with gr.Row():
+                gr.Plot(value=fig_label_distribution(_adata))
+                gr.Plot(value=fig_rqs_by_decision(_adata))
+            with gr.Row():
+                gr.Plot(value=fig_rqs_distribution(_adata))
+                gr.Plot(value=fig_bias_heatmap(_adata))
+            gr.Plot(value=fig_s1_s2_scatter(_adata))
     # ── Wire events ────────────────────────────────────────────────────────────
     provider_dd.change(update_provider, [provider_dd], [model_dd, api_key_box])

iclr2025_v2_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

icml2025_v3_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

neurips2025_v3_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ openai
 requests
 huggingface_hub
 openreview-py

 requests
 huggingface_hub
 openreview-py
+plotly