nuocuhz Claude Sonnet 4.6 committed on
Commit
0ab9a26
·
1 Parent(s): f90a391

Add Analytics tab: interactive Plotly figures + findings for ICLR/ICML/NeurIPS 2025

Browse files
analytics.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """analytics.py — Load sample results and build Plotly figures for the Analytics tab."""
2
+
3
+ import json
4
+ import os
5
+ from collections import Counter, defaultdict
6
+
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+
10
+ # ── Data loading ──────────────────────────────────────────────────────────────
11
+
12
# Directory holding this module; result files are resolved relative to it.
_DIR = os.path.dirname(__file__)

# Conference display name -> JSONL results file name (joined onto _DIR).
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}

# Review label -> hex colour used for that label's traces.
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
    "Non-evaluative": "#94a3b8",
}

# Conference display name -> hex colour used for that conference's traces.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}
32
+
33
+
34
def _load_results(fname: str) -> list:
    """Load one JSONL results file located next to this module.

    Args:
        fname: File name, joined onto ``_DIR``.

    Returns:
        The decoded JSON value of every non-blank line, in file order.
        An empty list when the file does not exist.  Lines that are not
        valid JSON are skipped so one corrupt record does not discard the
        rest of the file.
    """
    path = os.path.join(_DIR, fname)
    records = []
    try:
        # `with` guarantees the handle is closed; the explicit encoding keeps
        # decoding independent of the host's locale default.
        with open(path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    records.append(json.loads(raw))
                except ValueError:
                    # json.JSONDecodeError subclasses ValueError; malformed
                    # lines are deliberately best-effort skipped.
                    continue
    except FileNotFoundError:
        # A missing dataset simply yields no records.
        return []
    return records
47
+
48
+
49
def load_all() -> dict:
    """Load every dataset in ``DATASETS`` and flatten its labelled ratings.

    Returns:
        ``{conf: {"papers": [...], "reviews": [...], "metas": [...]}}`` where
        ``papers`` is the raw list from ``_load_results``, ``reviews`` holds
        each entry of a paper's ``"review_ratings"`` that has a truthy
        ``"label"``, and ``metas`` holds each paper's ``"metareview_rating"``
        that has a truthy ``"label"``.  Every review/meta dict is a copy
        extended with ``"_decision"`` (the paper's ``"decision"``, ``""`` when
        absent) and ``"_conf"`` (the conference name).
    """
    data = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews = []
        metas = []
        for paper in papers:
            extra = {"_decision": paper.get("decision", ""), "_conf": conf}

            # `or []` also covers a JSON null, which `.get(key, [])` would
            # return as None and then fail to iterate.
            for rating in paper.get("review_ratings") or []:
                if rating.get("label"):
                    reviews.append({**rating, **extra})

            meta = paper.get("metareview_rating")
            if meta and meta.get("label"):
                metas.append({**meta, **extra})
        data[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return data
66
+
67
+
68
+ # ── Figure builders ───────────────────────────────────────────────────────────
69
+
70
def fig_label_distribution(data: dict) -> go.Figure:
    """Build a grouped bar chart of review-label shares per conference.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts, each with a ``"label"`` key.

    Returns:
        A figure with one bar trace per label and one bar per conference.
        Heights are the percentage of that conference's reviews carrying the
        label, rounded to one decimal; a conference with no reviews plots 0.
    """
    labels_order = ["System 1", "Mixed", "System 2", "Non-evaluative"]
    confs = list(data)

    # Tally each conference once instead of rescanning its reviews per label.
    shares = {}
    for conf in confs:
        reviews = data[conf]["reviews"]
        total = len(reviews)
        tally = Counter(r["label"] for r in reviews)
        shares[conf] = {
            lbl: round(tally[lbl] / total * 100, 1) if total else 0
            for lbl in labels_order
        }

    fig = go.Figure()
    for lbl in labels_order:
        y_vals = [shares[conf][lbl] for conf in confs]
        fig.add_trace(go.Bar(
            name=lbl,
            x=confs,
            y=y_vals,
            marker_color=LABEL_COLORS.get(lbl, "#888"),
            text=[f"{v}%" for v in y_vals],
            textposition="outside",
        ))

    # Keep the 0-75 window for typical data, but grow it when a bar (plus its
    # outside text label) would otherwise be clipped.
    peak = max(
        (v for per_conf in shares.values() for v in per_conf.values()),
        default=0,
    )
    y_top = max(75, peak + 10)

    fig.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, y_top]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
103
+
104
+
105
def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Build a grouped bar chart of mean RQS per decision tier per conference.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts.  ``"_decision"`` selects the tier and
            ``"overall_reasoning_quality_score"`` supplies the score.

    Returns:
        A figure with one bar trace per conference over the tiers Oral,
        Spotlight and Poster.  A tier with no scored reviews has no bar.
        Reviews whose decision is not a recognised accept tier, or whose
        score is missing/falsy, are ignored.
    """
    # Keys are lower-cased so case variants of one decision share an entry.
    tier_by_decision = {
        "accept (oral)": "Oral",
        "accept (spotlight)": "Spotlight",
        "accept (spotlight poster)": "Spotlight",
        "accept (poster)": "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]

    fig = go.Figure()
    peak = 0.0
    for conf in data:
        by_tier = defaultdict(list)
        for r in data[conf]["reviews"]:
            decision = str(r.get("_decision") or "").strip().lower()
            tier = tier_by_decision.get(decision)
            rqs = r.get("overall_reasoning_quality_score")
            if tier and rqs:
                by_tier[tier].append(float(rqs))

        y_vals = []
        counts = []
        for t in tiers:
            scores = by_tier.get(t, [])
            counts.append(len(scores))
            y_vals.append(round(sum(scores) / len(scores), 2) if scores else None)
        peak = max([peak] + [v for v in y_vals if v is not None])

        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            # Fall back to a neutral colour instead of raising on a conference
            # that has no CONF_COLORS entry.
            marker_color=CONF_COLORS.get(conf, "#888"),
            text=[
                f"{v:.2f}<br>(n={c})" if v is not None else ""
                for v, c in zip(y_vals, counts)
            ],
            textposition="outside",
        ))

    # Keep the 0-4 window for typical means, but leave headroom for the
    # two-line outside label when a mean approaches the top of the 1–5 scale.
    y_top = max(4, peak + 1)

    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1–5)", range=[0, y_top]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
147
+
148
+
149
def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Build a scatter plot of System 1 score against System 2 score.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts with ``"label"``, ``"system1_score"``
            and ``"system2_score"`` keys.

    Returns:
        A figure with one marker trace per (conference, label) pair that has
        at least one review with both scores truthy, coloured by label, plus
        a dashed diagonal reference line from (1, 1) to (5, 5).
    """
    labels = ("System 1", "Mixed", "System 2", "Non-evaluative")

    fig = go.Figure()
    for conf in data:
        reviews = data[conf]["reviews"]
        for lbl in labels:
            subset = [
                r for r in reviews
                if r.get("label") == lbl
                and r.get("system1_score")
                and r.get("system2_score")
            ]
            if not subset:
                continue
            fig.add_trace(go.Scatter(
                x=[r["system1_score"] for r in subset],
                y=[r["system2_score"] for r in subset],
                mode="markers",
                name=f"{conf} — {lbl}",
                marker=dict(color=LABEL_COLORS.get(lbl, "#888"), size=5, opacity=0.6),
                # Same-label traces from different conferences share a legend group.
                legendgroup=lbl,
                showlegend=True,
            ))

    # diagonal reference line
    fig.add_shape(type="line", x0=1, y0=1, x1=5, y1=5,
                  line=dict(color="gray", dash="dash", width=1))
    fig.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return fig
180
+
181
+
182
def fig_bias_heatmap(data: dict) -> go.Figure:
    """Build a heatmap of bias frequency (% of reviews) per conference.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts; ``"bias_diagnostics"`` is the
            collection of bias names flagged for that review.

    Returns:
        A figure with one row per conference and one column per bias.  Each
        cell is the percentage of that conference's reviews containing the
        bias, rounded to one decimal, annotated with the percentage and the
        raw count.  A conference with no reviews shows 0% throughout.
    """
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    confs = list(data)
    z = []
    text = []
    for conf in confs:
        reviews = data[conf]["reviews"]
        denom = len(reviews) or 1  # avoid dividing by zero for an empty conference
        # `or ()` treats a missing key and a JSON null alike; a bare None
        # would make the `in` test below raise TypeError.
        flagged = [r.get("bias_diagnostics") or () for r in reviews]

        row = []
        trow = []
        for bias in bias_order:
            cnt = sum(1 for biases in flagged if bias in biases)
            pct = round(cnt / denom * 100, 1)
            row.append(pct)
            trow.append(f"{pct}%<br>({cnt})")
        z.append(row)
        text.append(trow)

    fig = go.Figure(go.Heatmap(
        z=z,
        x=bias_order,
        y=confs,
        text=text,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig
227
+
228
+
229
def fig_rqs_distribution(data: dict) -> go.Figure:
    """Build a violin plot of the RQS distribution per conference.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts; reviews with a missing/falsy
            ``"overall_reasoning_quality_score"`` are left out.

    Returns:
        A figure with one violin trace per conference, each showing its box
        and mean line.
    """
    score_key = "overall_reasoning_quality_score"

    fig = go.Figure()
    for conf in data:
        rqs_vals = [
            float(r[score_key])
            for r in data[conf]["reviews"]
            if r.get(score_key)
        ]
        fig.add_trace(go.Violin(
            y=rqs_vals,
            name=conf,
            box_visible=True,
            meanline_visible=True,
            # Fall back to a neutral colour instead of raising on a conference
            # that has no CONF_COLORS entry.
            fillcolor=CONF_COLORS.get(conf, "#888"),
            opacity=0.7,
            line_color="white",
        ))
    fig.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return fig
252
+
253
+
254
+ # ── Summary text ──────────────────────────────────────────────────────────────
255
+
256
def build_summary(data: dict) -> str:
    """Render a one-line Markdown summary for each conference with reviews.

    Args:
        data: Mapping of conference name to a dict whose ``"reviews"`` entry
            is a list of review dicts, each with a ``"label"`` key and
            optionally an ``"overall_reasoning_quality_score"``.

    Returns:
        One line per conference with at least one review, joined by blank
        lines.  Each line gives the review count, the mean RQS over reviews
        with a truthy score (0 when none have one), and the share of Mixed,
        System 1 and System 2 labels.  An empty string when no conference
        has reviews.
    """
    score_key = "overall_reasoning_quality_score"
    paragraphs = []
    for conf, bundle in data.items():
        reviews = bundle["reviews"]
        if not reviews:
            continue
        total = len(reviews)
        label_counts = Counter(r["label"] for r in reviews)
        scores = [float(r[score_key]) for r in reviews if r.get(score_key)]
        mean_rqs = sum(scores) / len(scores) if scores else 0

        # Counter returns 0 for labels that never occur.
        mixed_pct = label_counts["Mixed"] / total * 100
        s1_pct = label_counts["System 1"] / total * 100
        s2_pct = label_counts["System 2"] / total * 100

        paragraphs.append(
            f"**{conf}** — {total} reviews · RQS mean {mean_rqs:.2f} · "
            f"Mixed {mixed_pct:.0f}% · "
            f"S1 {s1_pct:.0f}% · "
            f"S2 {s2_pct:.0f}%"
        )
    return "\n\n".join(paragraphs)
271
+
272
+
273
# Static Markdown shown at the top of the Analytics tab.
FINDINGS = """
### Key Findings (100 papers × 3 conferences, ~1,150 reviews)

1. **Mixed reasoning dominates across all venues (49–57%).** Pure System 1 or System 2 reviews are the minority — most reviewers blend intuitive and analytical modes rather than operating at either extreme.

2. **ICLR reviewers show more System 1 tendency (35%) than ICML/NeurIPS (~21%).** This may reflect ICLR's open-ended review format, which imposes less structural scaffolding than ICML's field-by-field template — less structure → more impression-driven writing.

3. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.

4. **Reasoning quality (RQS) is nearly identical across venues (2.80–2.94 / 5).** Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform — suggesting a field-wide ceiling rather than venue-specific culture.

5. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.

6. **Checklist Inflation is the dominant bias in all three venues** (50–58% of reviews). Reviewers frequently enumerate specific concerns without analytical linkage, prioritization, or core-claim relevance — mistaking list length for reasoning depth.

7. **Representativeness Heuristic is more prevalent at NeurIPS (27%) than ICLR/ICML (~17–21%).** NeurIPS reviewers more often judge papers by surface similarity to known strong work rather than explicit criteria.
"""
app.py CHANGED
@@ -9,6 +9,8 @@ from rater import (
9
  rate_review, format_result_markdown,
10
  rate_metareview, format_metareview_result_markdown,
11
  )
 
 
12
 
13
  _paper_cache: dict = {}
14
  _last_result: dict = {}
@@ -461,6 +463,18 @@ This perspective reframes peer review as a **reasoning process** rather than mer
461
  manual_meta_btn = gr.Button("AI Rate This Meta-Review", variant="primary")
462
  manual_meta_result = gr.Markdown("")
463
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  # ── Wire events ────────────────────────────────────────────────────────────
465
  provider_dd.change(update_provider, [provider_dd], [model_dd, api_key_box])
466
 
 
9
  rate_review, format_result_markdown,
10
  rate_metareview, format_metareview_result_markdown,
11
  )
12
+ from analytics import load_all, fig_label_distribution, fig_rqs_by_decision, \
13
+ fig_s1_s2_scatter, fig_bias_heatmap, fig_rqs_distribution, FINDINGS
14
 
15
  _paper_cache: dict = {}
16
  _last_result: dict = {}
 
463
  manual_meta_btn = gr.Button("AI Rate This Meta-Review", variant="primary")
464
  manual_meta_result = gr.Markdown("")
465
 
466
+ with gr.Tab("📊 Analytics"):
467
+ gr.Markdown(FINDINGS)
468
+ gr.Markdown("---")
469
+ _adata = load_all()
470
+ with gr.Row():
471
+ gr.Plot(value=fig_label_distribution(_adata))
472
+ gr.Plot(value=fig_rqs_by_decision(_adata))
473
+ with gr.Row():
474
+ gr.Plot(value=fig_rqs_distribution(_adata))
475
+ gr.Plot(value=fig_bias_heatmap(_adata))
476
+ gr.Plot(value=fig_s1_s2_scatter(_adata))
477
+
478
  # ── Wire events ────────────────────────────────────────────────────────────
479
  provider_dd.change(update_provider, [provider_dd], [model_dd, api_key_box])
480
 
iclr2025_v2_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
icml2025_v3_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
neurips2025_v3_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -4,3 +4,4 @@ openai
4
  requests
5
  huggingface_hub
6
  openreview-py
 
 
4
  requests
5
  huggingface_hub
6
  openreview-py
7
+ plotly