county committed on
Commit
caf6918
·
1 Parent(s): 8e76971

Show OE_acc and OE_scr side by side in cross-judge table

Browse files
Files changed (2) hide show
  1. script.js +53 -37
  2. styles.css +90 -6
script.js CHANGED
@@ -340,10 +340,6 @@ function renderSamples(active = carouselState.activeDimension, index = 0) {
340
  }
341
  }
342
 
343
- function getMetricLabel(metric) {
344
- return metric === "oeAcc" ? "OE_acc" : "OE_scr";
345
- }
346
-
347
  function getMetricValue(item, metric) {
348
  return metric === "oeAcc" ? item.oeAcc : item.oeScr;
349
  }
@@ -385,10 +381,11 @@ function createToggleButton(label, isActive, onClick) {
385
  }
386
 
387
  function renderControls() {
388
- if (!taskToggleRoot || !judgeToggleRoot || !metricToggleRoot) {
389
  return;
390
  }
391
  const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
 
392
 
393
  const taskItems = [
394
  { key: "oe", label: "OE" },
@@ -407,13 +404,18 @@ function renderControls() {
407
  });
408
 
409
  judgeToggleRoot.innerHTML = "";
410
- metricToggleRoot.innerHTML = "";
 
 
411
 
412
  if (crossJudgeState.activeTask === "oe") {
413
  judgeToggleRoot.classList.remove("is-hidden");
414
  if (judgeToggleRow) {
415
  judgeToggleRow.classList.remove("is-hidden");
416
  }
 
 
 
417
 
418
  const judgeItems = [
419
  { key: "gpt4o", label: "GPT-4o (full set)" },
@@ -425,17 +427,6 @@ function renderControls() {
425
  });
426
  judgeToggleRoot.appendChild(button);
427
  });
428
-
429
- const metricItems = [
430
- { key: "oeAcc", label: "OE_acc" },
431
- { key: "oeScr", label: "OE_scr" }
432
- ];
433
- metricItems.forEach((item) => {
434
- const button = createToggleButton(item.label, item.key === crossJudgeState.activeMetric, () => {
435
- renderResultsPanel(crossJudgeState.activeTask, crossJudgeState.activeJudge, item.key);
436
- });
437
- metricToggleRoot.appendChild(button);
438
- });
439
  return;
440
  }
441
 
@@ -443,17 +434,21 @@ function renderControls() {
443
  if (judgeToggleRow) {
444
  judgeToggleRow.classList.add("is-hidden");
445
  }
446
- const mcButton = createToggleButton("MC_acc", true, () => {});
447
- mcButton.disabled = true;
448
- metricToggleRoot.appendChild(mcButton);
449
  }
450
 
451
- function renderOeResults(activeJudge, activeMetric) {
452
  const otherJudge = getOtherJudge(activeJudge);
453
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
454
- const valueDiff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
455
- if (Math.abs(valueDiff) > 1e-6) {
456
- return valueDiff;
 
 
 
 
457
  }
458
  return a.rank - b.rank;
459
  });
@@ -461,21 +456,26 @@ function renderOeResults(activeJudge, activeMetric) {
461
 
462
  if (resultsCaptionRoot) {
463
  resultsCaptionRoot.textContent =
464
- `${judgeResults[activeJudge].label} · Sorted by ${getMetricLabel(activeMetric)} · Δ vs ${judgeResults[activeJudge].deltaLabel}`;
465
  }
466
 
467
  resultsRoot.innerHTML = "";
 
 
 
 
 
 
 
 
 
 
468
  currentRows.forEach((item, index) => {
469
  const row = document.createElement("div");
470
- row.className = `result-row${item.isOurs ? " ours" : ""}`;
471
-
472
- const score = getMetricValue(item, activeMetric);
473
  const other = otherMap.get(item.model);
474
- const delta = other ? score - getMetricValue(other, activeMetric) : 0;
475
- const deltaClass = getDeltaClass(delta);
476
- const deltaText = other
477
- ? `${formatDeltaValue(delta, activeMetric)} vs ${judgeResults[activeJudge].deltaLabel}`
478
- : "N/A";
479
 
480
  row.innerHTML = `
481
  <div class="result-rank">#${index + 1}</div>
@@ -483,8 +483,14 @@ function renderOeResults(activeJudge, activeMetric) {
483
  <div class="result-model">${item.model}</div>
484
  ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
485
  </div>
486
- <div class="result-score">${formatMetricValue(score, activeMetric)}</div>
487
- <div class="result-delta ${deltaClass}">${deltaText}</div>
 
 
 
 
 
 
488
  `;
489
  resultsRoot.appendChild(row);
490
  });
@@ -497,9 +503,19 @@ function renderMcResults() {
497
  }
498
 
499
  resultsRoot.innerHTML = "";
 
 
 
 
 
 
 
 
 
 
500
  ranked.forEach((item, index) => {
501
  const row = document.createElement("div");
502
- row.className = `result-row${item.isOurs ? " ours" : ""}`;
503
  row.innerHTML = `
504
  <div class="result-rank">#${index + 1}</div>
505
  <div class="result-model-wrap">
@@ -529,7 +545,7 @@ function renderResultsPanel(
529
  if (crossJudgeState.activeTask === "mc") {
530
  renderMcResults();
531
  } else {
532
- renderOeResults(crossJudgeState.activeJudge, crossJudgeState.activeMetric);
533
  }
534
  }
535
 
 
340
  }
341
  }
342
 
 
 
 
 
343
  function getMetricValue(item, metric) {
344
  return metric === "oeAcc" ? item.oeAcc : item.oeScr;
345
  }
 
381
  }
382
 
383
  function renderControls() {
384
+ if (!taskToggleRoot || !judgeToggleRoot) {
385
  return;
386
  }
387
  const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
388
+ const metricToggleRow = metricToggleRoot ? metricToggleRoot.closest(".toolbar-row") : null;
389
 
390
  const taskItems = [
391
  { key: "oe", label: "OE" },
 
404
  });
405
 
406
  judgeToggleRoot.innerHTML = "";
407
+ if (metricToggleRoot) {
408
+ metricToggleRoot.innerHTML = "";
409
+ }
410
 
411
  if (crossJudgeState.activeTask === "oe") {
412
  judgeToggleRoot.classList.remove("is-hidden");
413
  if (judgeToggleRow) {
414
  judgeToggleRow.classList.remove("is-hidden");
415
  }
416
+ if (metricToggleRow) {
417
+ metricToggleRow.classList.add("is-hidden");
418
+ }
419
 
420
  const judgeItems = [
421
  { key: "gpt4o", label: "GPT-4o (full set)" },
 
427
  });
428
  judgeToggleRoot.appendChild(button);
429
  });
 
 
 
 
 
 
 
 
 
 
 
430
  return;
431
  }
432
 
 
434
  if (judgeToggleRow) {
435
  judgeToggleRow.classList.add("is-hidden");
436
  }
437
+ if (metricToggleRow) {
438
+ metricToggleRow.classList.add("is-hidden");
439
+ }
440
  }
441
 
442
+ function renderOeResults(activeJudge) {
443
  const otherJudge = getOtherJudge(activeJudge);
444
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
445
+ const accDiff = b.oeAcc - a.oeAcc;
446
+ if (Math.abs(accDiff) > 1e-6) {
447
+ return accDiff;
448
+ }
449
+ const scrDiff = b.oeScr - a.oeScr;
450
+ if (Math.abs(scrDiff) > 1e-6) {
451
+ return scrDiff;
452
  }
453
  return a.rank - b.rank;
454
  });
 
456
 
457
  if (resultsCaptionRoot) {
458
  resultsCaptionRoot.textContent =
459
+ `${judgeResults[activeJudge].label} · Sorted by OE_acc · per-metric deltas shown vs ${judgeResults[activeJudge].deltaLabel}`;
460
  }
461
 
462
  resultsRoot.innerHTML = "";
463
+ const header = document.createElement("div");
464
+ header.className = "results-head results-head-oe";
465
+ header.innerHTML = `
466
+ <div></div>
467
+ <div class="results-head-cell">Model</div>
468
+ <div class="results-head-cell results-head-cell-right">OE_acc</div>
469
+ <div class="results-head-cell results-head-cell-right">OE_scr</div>
470
+ `;
471
+ resultsRoot.appendChild(header);
472
+
473
  currentRows.forEach((item, index) => {
474
  const row = document.createElement("div");
475
+ row.className = `result-row result-row-oe${item.isOurs ? " ours" : ""}`;
 
 
476
  const other = otherMap.get(item.model);
477
+ const accDelta = other ? item.oeAcc - other.oeAcc : 0;
478
+ const scrDelta = other ? item.oeScr - other.oeScr : 0;
 
 
 
479
 
480
  row.innerHTML = `
481
  <div class="result-rank">#${index + 1}</div>
 
483
  <div class="result-model">${item.model}</div>
484
  ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
485
  </div>
486
+ <div class="result-metric-col acc" data-label="OE_acc">
487
+ <div class="result-score">${formatMetricValue(item.oeAcc, "oeAcc")}</div>
488
+ <div class="result-delta ${getDeltaClass(accDelta)}">${formatDeltaValue(accDelta, "oeAcc")}</div>
489
+ </div>
490
+ <div class="result-metric-col scr" data-label="OE_scr">
491
+ <div class="result-score">${formatMetricValue(item.oeScr, "oeScr")}</div>
492
+ <div class="result-delta ${getDeltaClass(scrDelta)}">${formatDeltaValue(scrDelta, "oeScr")}</div>
493
+ </div>
494
  `;
495
  resultsRoot.appendChild(row);
496
  });
 
503
  }
504
 
505
  resultsRoot.innerHTML = "";
506
+ const header = document.createElement("div");
507
+ header.className = "results-head results-head-mc";
508
+ header.innerHTML = `
509
+ <div></div>
510
+ <div class="results-head-cell">Model</div>
511
+ <div class="results-head-cell results-head-cell-right">MC_acc</div>
512
+ <div class="results-head-cell results-head-cell-right">Setting</div>
513
+ `;
514
+ resultsRoot.appendChild(header);
515
+
516
  ranked.forEach((item, index) => {
517
  const row = document.createElement("div");
518
+ row.className = `result-row result-row-mc${item.isOurs ? " ours" : ""}`;
519
  row.innerHTML = `
520
  <div class="result-rank">#${index + 1}</div>
521
  <div class="result-model-wrap">
 
545
  if (crossJudgeState.activeTask === "mc") {
546
  renderMcResults();
547
  } else {
548
+ renderOeResults(crossJudgeState.activeJudge);
549
  }
550
  }
551
 
styles.css CHANGED
@@ -585,10 +585,33 @@ code {
585
  gap: 8px;
586
  }
587
 
588
- .result-row {
589
  display: grid;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
591
  gap: 10px;
 
 
 
 
 
 
 
 
592
  align-items: center;
593
  padding: 8px 10px;
594
  border: 1px solid #ddd8cc;
@@ -596,6 +619,16 @@ code {
596
  background: #fcfbf8;
597
  }
598
 
 
 
 
 
 
 
 
 
 
 
599
  .result-row.ours {
600
  border-color: #d89984;
601
  background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
@@ -645,10 +678,16 @@ code {
645
  font-weight: 700;
646
  }
647
 
 
 
 
 
 
 
648
  .result-delta {
649
  min-width: 0;
650
  text-align: right;
651
- font-size: 0.8rem;
652
  font-weight: 600;
653
  color: #55514a;
654
  }
@@ -1106,27 +1145,72 @@ code {
1106
  gap: 5px;
1107
  }
1108
 
1109
- .result-row {
 
 
 
 
1110
  grid-template-columns: 46px minmax(0, 1fr) auto;
1111
  gap: 6px 8px;
1112
  align-items: start;
1113
  padding: 8px;
1114
  }
1115
 
1116
- .result-model-wrap {
1117
  grid-column: 2;
1118
  grid-row: 1;
1119
  }
1120
 
1121
- .result-score {
1122
  grid-column: 3;
1123
  grid-row: 1;
1124
  }
1125
 
1126
- .result-delta {
1127
  grid-column: 2 / 4;
1128
  grid-row: 2;
1129
  text-align: left;
1130
  }
1131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1132
  }
 
585
  gap: 8px;
586
  }
587
 
588
+ .results-head {
589
  display: grid;
590
+ align-items: center;
591
+ padding: 0 10px 2px;
592
+ color: #756f65;
593
+ font-size: 0.76rem;
594
+ font-weight: 700;
595
+ letter-spacing: 0.03em;
596
+ text-transform: uppercase;
597
+ }
598
+
599
+ .results-head-oe {
600
+ grid-template-columns: 54px minmax(0, 1fr) 94px 94px;
601
+ gap: 10px;
602
+ }
603
+
604
+ .results-head-mc {
605
  grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
606
  gap: 10px;
607
+ }
608
+
609
+ .results-head-cell-right {
610
+ text-align: right;
611
+ }
612
+
613
+ .result-row {
614
+ gap: 10px;
615
  align-items: center;
616
  padding: 8px 10px;
617
  border: 1px solid #ddd8cc;
 
619
  background: #fcfbf8;
620
  }
621
 
622
+ .result-row-oe {
623
+ display: grid;
624
+ grid-template-columns: 54px minmax(0, 1fr) 94px 94px;
625
+ }
626
+
627
+ .result-row-mc {
628
+ display: grid;
629
+ grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
630
+ }
631
+
632
  .result-row.ours {
633
  border-color: #d89984;
634
  background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
 
678
  font-weight: 700;
679
  }
680
 
681
+ .result-metric-col {
682
+ display: grid;
683
+ gap: 2px;
684
+ min-width: 0;
685
+ }
686
+
687
  .result-delta {
688
  min-width: 0;
689
  text-align: right;
690
+ font-size: 0.75rem;
691
  font-weight: 600;
692
  color: #55514a;
693
  }
 
1145
  gap: 5px;
1146
  }
1147
 
1148
+ .results-head {
1149
+ display: none;
1150
+ }
1151
+
1152
+ .result-row-mc {
1153
  grid-template-columns: 46px minmax(0, 1fr) auto;
1154
  gap: 6px 8px;
1155
  align-items: start;
1156
  padding: 8px;
1157
  }
1158
 
1159
+ .result-row-mc .result-model-wrap {
1160
  grid-column: 2;
1161
  grid-row: 1;
1162
  }
1163
 
1164
+ .result-row-mc .result-score {
1165
  grid-column: 3;
1166
  grid-row: 1;
1167
  }
1168
 
1169
+ .result-row-mc .result-delta {
1170
  grid-column: 2 / 4;
1171
  grid-row: 2;
1172
  text-align: left;
1173
  }
1174
 
1175
+ .result-row-oe {
1176
+ grid-template-columns: 46px minmax(0, 1fr) minmax(72px, auto) minmax(72px, auto);
1177
+ gap: 7px 8px;
1178
+ align-items: start;
1179
+ padding: 8px;
1180
+ }
1181
+
1182
+ .result-row-oe .result-rank {
1183
+ grid-column: 1;
1184
+ grid-row: 1 / 3;
1185
+ }
1186
+
1187
+ .result-row-oe .result-model-wrap {
1188
+ grid-column: 2 / 5;
1189
+ grid-row: 1;
1190
+ }
1191
+
1192
+ .result-row-oe .result-metric-col {
1193
+ display: grid;
1194
+ gap: 1px;
1195
+ justify-items: end;
1196
+ }
1197
+
1198
+ .result-row-oe .result-metric-col.acc {
1199
+ grid-column: 3;
1200
+ grid-row: 2;
1201
+ }
1202
+
1203
+ .result-row-oe .result-metric-col.scr {
1204
+ grid-column: 4;
1205
+ grid-row: 2;
1206
+ }
1207
+
1208
+ .result-row-oe .result-metric-col::before {
1209
+ content: attr(data-label);
1210
+ font-size: 0.65rem;
1211
+ font-weight: 700;
1212
+ color: #7a7368;
1213
+ letter-spacing: 0.03em;
1214
+ }
1215
+
1216
  }