county committed on
Commit ·
caf6918
1
Parent(s): 8e76971
Show OE_acc and OE_scr side by side in cross-judge table
Browse files
- script.js +53 -37
- styles.css +90 -6
script.js
CHANGED
|
@@ -340,10 +340,6 @@ function renderSamples(active = carouselState.activeDimension, index = 0) {
|
|
| 340 |
}
|
| 341 |
}
|
| 342 |
|
| 343 |
-
function getMetricLabel(metric) {
|
| 344 |
-
return metric === "oeAcc" ? "OE_acc" : "OE_scr";
|
| 345 |
-
}
|
| 346 |
-
|
| 347 |
function getMetricValue(item, metric) {
|
| 348 |
return metric === "oeAcc" ? item.oeAcc : item.oeScr;
|
| 349 |
}
|
|
@@ -385,10 +381,11 @@ function createToggleButton(label, isActive, onClick) {
|
|
| 385 |
}
|
| 386 |
|
| 387 |
function renderControls() {
|
| 388 |
-
if (!taskToggleRoot || !judgeToggleRoot
|
| 389 |
return;
|
| 390 |
}
|
| 391 |
const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
|
|
|
|
| 392 |
|
| 393 |
const taskItems = [
|
| 394 |
{ key: "oe", label: "OE" },
|
|
@@ -407,13 +404,18 @@ function renderControls() {
|
|
| 407 |
});
|
| 408 |
|
| 409 |
judgeToggleRoot.innerHTML = "";
|
| 410 |
-
metricToggleRoot
|
|
|
|
|
|
|
| 411 |
|
| 412 |
if (crossJudgeState.activeTask === "oe") {
|
| 413 |
judgeToggleRoot.classList.remove("is-hidden");
|
| 414 |
if (judgeToggleRow) {
|
| 415 |
judgeToggleRow.classList.remove("is-hidden");
|
| 416 |
}
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
const judgeItems = [
|
| 419 |
{ key: "gpt4o", label: "GPT-4o (full set)" },
|
|
@@ -425,17 +427,6 @@ function renderControls() {
|
|
| 425 |
});
|
| 426 |
judgeToggleRoot.appendChild(button);
|
| 427 |
});
|
| 428 |
-
|
| 429 |
-
const metricItems = [
|
| 430 |
-
{ key: "oeAcc", label: "OE_acc" },
|
| 431 |
-
{ key: "oeScr", label: "OE_scr" }
|
| 432 |
-
];
|
| 433 |
-
metricItems.forEach((item) => {
|
| 434 |
-
const button = createToggleButton(item.label, item.key === crossJudgeState.activeMetric, () => {
|
| 435 |
-
renderResultsPanel(crossJudgeState.activeTask, crossJudgeState.activeJudge, item.key);
|
| 436 |
-
});
|
| 437 |
-
metricToggleRoot.appendChild(button);
|
| 438 |
-
});
|
| 439 |
return;
|
| 440 |
}
|
| 441 |
|
|
@@ -443,17 +434,21 @@ function renderControls() {
|
|
| 443 |
if (judgeToggleRow) {
|
| 444 |
judgeToggleRow.classList.add("is-hidden");
|
| 445 |
}
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
}
|
| 450 |
|
| 451 |
-
function renderOeResults(activeJudge
|
| 452 |
const otherJudge = getOtherJudge(activeJudge);
|
| 453 |
const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
|
| 454 |
-
const
|
| 455 |
-
if (Math.abs(
|
| 456 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
}
|
| 458 |
return a.rank - b.rank;
|
| 459 |
});
|
|
@@ -461,21 +456,26 @@ function renderOeResults(activeJudge, activeMetric) {
|
|
| 461 |
|
| 462 |
if (resultsCaptionRoot) {
|
| 463 |
resultsCaptionRoot.textContent =
|
| 464 |
-
`${judgeResults[activeJudge].label} · Sorted by
|
| 465 |
}
|
| 466 |
|
| 467 |
resultsRoot.innerHTML = "";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
currentRows.forEach((item, index) => {
|
| 469 |
const row = document.createElement("div");
|
| 470 |
-
row.className = `result-row${item.isOurs ? " ours" : ""}`;
|
| 471 |
-
|
| 472 |
-
const score = getMetricValue(item, activeMetric);
|
| 473 |
const other = otherMap.get(item.model);
|
| 474 |
-
const
|
| 475 |
-
const
|
| 476 |
-
const deltaText = other
|
| 477 |
-
? `${formatDeltaValue(delta, activeMetric)} vs ${judgeResults[activeJudge].deltaLabel}`
|
| 478 |
-
: "N/A";
|
| 479 |
|
| 480 |
row.innerHTML = `
|
| 481 |
<div class="result-rank">#${index + 1}</div>
|
|
@@ -483,8 +483,14 @@ function renderOeResults(activeJudge, activeMetric) {
|
|
| 483 |
<div class="result-model">${item.model}</div>
|
| 484 |
${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
|
| 485 |
</div>
|
| 486 |
-
<div class="result-
|
| 487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
`;
|
| 489 |
resultsRoot.appendChild(row);
|
| 490 |
});
|
|
@@ -497,9 +503,19 @@ function renderMcResults() {
|
|
| 497 |
}
|
| 498 |
|
| 499 |
resultsRoot.innerHTML = "";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
ranked.forEach((item, index) => {
|
| 501 |
const row = document.createElement("div");
|
| 502 |
-
row.className = `result-row${item.isOurs ? " ours" : ""}`;
|
| 503 |
row.innerHTML = `
|
| 504 |
<div class="result-rank">#${index + 1}</div>
|
| 505 |
<div class="result-model-wrap">
|
|
@@ -529,7 +545,7 @@ function renderResultsPanel(
|
|
| 529 |
if (crossJudgeState.activeTask === "mc") {
|
| 530 |
renderMcResults();
|
| 531 |
} else {
|
| 532 |
-
renderOeResults(crossJudgeState.activeJudge
|
| 533 |
}
|
| 534 |
}
|
| 535 |
|
|
|
|
| 340 |
}
|
| 341 |
}
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
function getMetricValue(item, metric) {
|
| 344 |
return metric === "oeAcc" ? item.oeAcc : item.oeScr;
|
| 345 |
}
|
|
|
|
| 381 |
}
|
| 382 |
|
| 383 |
function renderControls() {
|
| 384 |
+
if (!taskToggleRoot || !judgeToggleRoot) {
|
| 385 |
return;
|
| 386 |
}
|
| 387 |
const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
|
| 388 |
+
const metricToggleRow = metricToggleRoot ? metricToggleRoot.closest(".toolbar-row") : null;
|
| 389 |
|
| 390 |
const taskItems = [
|
| 391 |
{ key: "oe", label: "OE" },
|
|
|
|
| 404 |
});
|
| 405 |
|
| 406 |
judgeToggleRoot.innerHTML = "";
|
| 407 |
+
if (metricToggleRoot) {
|
| 408 |
+
metricToggleRoot.innerHTML = "";
|
| 409 |
+
}
|
| 410 |
|
| 411 |
if (crossJudgeState.activeTask === "oe") {
|
| 412 |
judgeToggleRoot.classList.remove("is-hidden");
|
| 413 |
if (judgeToggleRow) {
|
| 414 |
judgeToggleRow.classList.remove("is-hidden");
|
| 415 |
}
|
| 416 |
+
if (metricToggleRow) {
|
| 417 |
+
metricToggleRow.classList.add("is-hidden");
|
| 418 |
+
}
|
| 419 |
|
| 420 |
const judgeItems = [
|
| 421 |
{ key: "gpt4o", label: "GPT-4o (full set)" },
|
|
|
|
| 427 |
});
|
| 428 |
judgeToggleRoot.appendChild(button);
|
| 429 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
return;
|
| 431 |
}
|
| 432 |
|
|
|
|
| 434 |
if (judgeToggleRow) {
|
| 435 |
judgeToggleRow.classList.add("is-hidden");
|
| 436 |
}
|
| 437 |
+
if (metricToggleRow) {
|
| 438 |
+
metricToggleRow.classList.add("is-hidden");
|
| 439 |
+
}
|
| 440 |
}
|
| 441 |
|
| 442 |
+
function renderOeResults(activeJudge) {
|
| 443 |
const otherJudge = getOtherJudge(activeJudge);
|
| 444 |
const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
|
| 445 |
+
const accDiff = b.oeAcc - a.oeAcc;
|
| 446 |
+
if (Math.abs(accDiff) > 1e-6) {
|
| 447 |
+
return accDiff;
|
| 448 |
+
}
|
| 449 |
+
const scrDiff = b.oeScr - a.oeScr;
|
| 450 |
+
if (Math.abs(scrDiff) > 1e-6) {
|
| 451 |
+
return scrDiff;
|
| 452 |
}
|
| 453 |
return a.rank - b.rank;
|
| 454 |
});
|
|
|
|
| 456 |
|
| 457 |
if (resultsCaptionRoot) {
|
| 458 |
resultsCaptionRoot.textContent =
|
| 459 |
+
`${judgeResults[activeJudge].label} · Sorted by OE_acc · per-metric deltas shown vs ${judgeResults[activeJudge].deltaLabel}`;
|
| 460 |
}
|
| 461 |
|
| 462 |
resultsRoot.innerHTML = "";
|
| 463 |
+
const header = document.createElement("div");
|
| 464 |
+
header.className = "results-head results-head-oe";
|
| 465 |
+
header.innerHTML = `
|
| 466 |
+
<div></div>
|
| 467 |
+
<div class="results-head-cell">Model</div>
|
| 468 |
+
<div class="results-head-cell results-head-cell-right">OE_acc</div>
|
| 469 |
+
<div class="results-head-cell results-head-cell-right">OE_scr</div>
|
| 470 |
+
`;
|
| 471 |
+
resultsRoot.appendChild(header);
|
| 472 |
+
|
| 473 |
currentRows.forEach((item, index) => {
|
| 474 |
const row = document.createElement("div");
|
| 475 |
+
row.className = `result-row result-row-oe${item.isOurs ? " ours" : ""}`;
|
|
|
|
|
|
|
| 476 |
const other = otherMap.get(item.model);
|
| 477 |
+
const accDelta = other ? item.oeAcc - other.oeAcc : 0;
|
| 478 |
+
const scrDelta = other ? item.oeScr - other.oeScr : 0;
|
|
|
|
|
|
|
|
|
|
| 479 |
|
| 480 |
row.innerHTML = `
|
| 481 |
<div class="result-rank">#${index + 1}</div>
|
|
|
|
| 483 |
<div class="result-model">${item.model}</div>
|
| 484 |
${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
|
| 485 |
</div>
|
| 486 |
+
<div class="result-metric-col acc" data-label="OE_acc">
|
| 487 |
+
<div class="result-score">${formatMetricValue(item.oeAcc, "oeAcc")}</div>
|
| 488 |
+
<div class="result-delta ${getDeltaClass(accDelta)}">${formatDeltaValue(accDelta, "oeAcc")}</div>
|
| 489 |
+
</div>
|
| 490 |
+
<div class="result-metric-col scr" data-label="OE_scr">
|
| 491 |
+
<div class="result-score">${formatMetricValue(item.oeScr, "oeScr")}</div>
|
| 492 |
+
<div class="result-delta ${getDeltaClass(scrDelta)}">${formatDeltaValue(scrDelta, "oeScr")}</div>
|
| 493 |
+
</div>
|
| 494 |
`;
|
| 495 |
resultsRoot.appendChild(row);
|
| 496 |
});
|
|
|
|
| 503 |
}
|
| 504 |
|
| 505 |
resultsRoot.innerHTML = "";
|
| 506 |
+
const header = document.createElement("div");
|
| 507 |
+
header.className = "results-head results-head-mc";
|
| 508 |
+
header.innerHTML = `
|
| 509 |
+
<div></div>
|
| 510 |
+
<div class="results-head-cell">Model</div>
|
| 511 |
+
<div class="results-head-cell results-head-cell-right">MC_acc</div>
|
| 512 |
+
<div class="results-head-cell results-head-cell-right">Setting</div>
|
| 513 |
+
`;
|
| 514 |
+
resultsRoot.appendChild(header);
|
| 515 |
+
|
| 516 |
ranked.forEach((item, index) => {
|
| 517 |
const row = document.createElement("div");
|
| 518 |
+
row.className = `result-row result-row-mc${item.isOurs ? " ours" : ""}`;
|
| 519 |
row.innerHTML = `
|
| 520 |
<div class="result-rank">#${index + 1}</div>
|
| 521 |
<div class="result-model-wrap">
|
|
|
|
| 545 |
if (crossJudgeState.activeTask === "mc") {
|
| 546 |
renderMcResults();
|
| 547 |
} else {
|
| 548 |
+
renderOeResults(crossJudgeState.activeJudge);
|
| 549 |
}
|
| 550 |
}
|
| 551 |
|
styles.css
CHANGED
|
@@ -585,10 +585,33 @@ code {
|
|
| 585 |
gap: 8px;
|
| 586 |
}
|
| 587 |
|
| 588 |
-
.
|
| 589 |
display: grid;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
|
| 591 |
gap: 10px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
align-items: center;
|
| 593 |
padding: 8px 10px;
|
| 594 |
border: 1px solid #ddd8cc;
|
|
@@ -596,6 +619,16 @@ code {
|
|
| 596 |
background: #fcfbf8;
|
| 597 |
}
|
| 598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
.result-row.ours {
|
| 600 |
border-color: #d89984;
|
| 601 |
background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
|
|
@@ -645,10 +678,16 @@ code {
|
|
| 645 |
font-weight: 700;
|
| 646 |
}
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
.result-delta {
|
| 649 |
min-width: 0;
|
| 650 |
text-align: right;
|
| 651 |
-
font-size: 0.
|
| 652 |
font-weight: 600;
|
| 653 |
color: #55514a;
|
| 654 |
}
|
|
@@ -1106,27 +1145,72 @@ code {
|
|
| 1106 |
gap: 5px;
|
| 1107 |
}
|
| 1108 |
|
| 1109 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1110 |
grid-template-columns: 46px minmax(0, 1fr) auto;
|
| 1111 |
gap: 6px 8px;
|
| 1112 |
align-items: start;
|
| 1113 |
padding: 8px;
|
| 1114 |
}
|
| 1115 |
|
| 1116 |
-
.result-model-wrap {
|
| 1117 |
grid-column: 2;
|
| 1118 |
grid-row: 1;
|
| 1119 |
}
|
| 1120 |
|
| 1121 |
-
.result-score {
|
| 1122 |
grid-column: 3;
|
| 1123 |
grid-row: 1;
|
| 1124 |
}
|
| 1125 |
|
| 1126 |
-
.result-delta {
|
| 1127 |
grid-column: 2 / 4;
|
| 1128 |
grid-row: 2;
|
| 1129 |
text-align: left;
|
| 1130 |
}
|
| 1131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1132 |
}
|
|
|
|
| 585 |
gap: 8px;
|
| 586 |
}
|
| 587 |
|
| 588 |
+
.results-head {
|
| 589 |
display: grid;
|
| 590 |
+
align-items: center;
|
| 591 |
+
padding: 0 10px 2px;
|
| 592 |
+
color: #756f65;
|
| 593 |
+
font-size: 0.76rem;
|
| 594 |
+
font-weight: 700;
|
| 595 |
+
letter-spacing: 0.03em;
|
| 596 |
+
text-transform: uppercase;
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
.results-head-oe {
|
| 600 |
+
grid-template-columns: 54px minmax(0, 1fr) 94px 94px;
|
| 601 |
+
gap: 10px;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
.results-head-mc {
|
| 605 |
grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
|
| 606 |
gap: 10px;
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
.results-head-cell-right {
|
| 610 |
+
text-align: right;
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
+
.result-row {
|
| 614 |
+
gap: 10px;
|
| 615 |
align-items: center;
|
| 616 |
padding: 8px 10px;
|
| 617 |
border: 1px solid #ddd8cc;
|
|
|
|
| 619 |
background: #fcfbf8;
|
| 620 |
}
|
| 621 |
|
| 622 |
+
.result-row-oe {
|
| 623 |
+
display: grid;
|
| 624 |
+
grid-template-columns: 54px minmax(0, 1fr) 94px 94px;
|
| 625 |
+
}
|
| 626 |
+
|
| 627 |
+
.result-row-mc {
|
| 628 |
+
display: grid;
|
| 629 |
+
grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
|
| 630 |
+
}
|
| 631 |
+
|
| 632 |
.result-row.ours {
|
| 633 |
border-color: #d89984;
|
| 634 |
background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
|
|
|
|
| 678 |
font-weight: 700;
|
| 679 |
}
|
| 680 |
|
| 681 |
+
.result-metric-col {
|
| 682 |
+
display: grid;
|
| 683 |
+
gap: 2px;
|
| 684 |
+
min-width: 0;
|
| 685 |
+
}
|
| 686 |
+
|
| 687 |
.result-delta {
|
| 688 |
min-width: 0;
|
| 689 |
text-align: right;
|
| 690 |
+
font-size: 0.75rem;
|
| 691 |
font-weight: 600;
|
| 692 |
color: #55514a;
|
| 693 |
}
|
|
|
|
| 1145 |
gap: 5px;
|
| 1146 |
}
|
| 1147 |
|
| 1148 |
+
.results-head {
|
| 1149 |
+
display: none;
|
| 1150 |
+
}
|
| 1151 |
+
|
| 1152 |
+
.result-row-mc {
|
| 1153 |
grid-template-columns: 46px minmax(0, 1fr) auto;
|
| 1154 |
gap: 6px 8px;
|
| 1155 |
align-items: start;
|
| 1156 |
padding: 8px;
|
| 1157 |
}
|
| 1158 |
|
| 1159 |
+
.result-row-mc .result-model-wrap {
|
| 1160 |
grid-column: 2;
|
| 1161 |
grid-row: 1;
|
| 1162 |
}
|
| 1163 |
|
| 1164 |
+
.result-row-mc .result-score {
|
| 1165 |
grid-column: 3;
|
| 1166 |
grid-row: 1;
|
| 1167 |
}
|
| 1168 |
|
| 1169 |
+
.result-row-mc .result-delta {
|
| 1170 |
grid-column: 2 / 4;
|
| 1171 |
grid-row: 2;
|
| 1172 |
text-align: left;
|
| 1173 |
}
|
| 1174 |
|
| 1175 |
+
.result-row-oe {
|
| 1176 |
+
grid-template-columns: 46px minmax(0, 1fr) minmax(72px, auto) minmax(72px, auto);
|
| 1177 |
+
gap: 7px 8px;
|
| 1178 |
+
align-items: start;
|
| 1179 |
+
padding: 8px;
|
| 1180 |
+
}
|
| 1181 |
+
|
| 1182 |
+
.result-row-oe .result-rank {
|
| 1183 |
+
grid-column: 1;
|
| 1184 |
+
grid-row: 1 / 3;
|
| 1185 |
+
}
|
| 1186 |
+
|
| 1187 |
+
.result-row-oe .result-model-wrap {
|
| 1188 |
+
grid-column: 2 / 5;
|
| 1189 |
+
grid-row: 1;
|
| 1190 |
+
}
|
| 1191 |
+
|
| 1192 |
+
.result-row-oe .result-metric-col {
|
| 1193 |
+
display: grid;
|
| 1194 |
+
gap: 1px;
|
| 1195 |
+
justify-items: end;
|
| 1196 |
+
}
|
| 1197 |
+
|
| 1198 |
+
.result-row-oe .result-metric-col.acc {
|
| 1199 |
+
grid-column: 3;
|
| 1200 |
+
grid-row: 2;
|
| 1201 |
+
}
|
| 1202 |
+
|
| 1203 |
+
.result-row-oe .result-metric-col.scr {
|
| 1204 |
+
grid-column: 4;
|
| 1205 |
+
grid-row: 2;
|
| 1206 |
+
}
|
| 1207 |
+
|
| 1208 |
+
.result-row-oe .result-metric-col::before {
|
| 1209 |
+
content: attr(data-label);
|
| 1210 |
+
font-size: 0.65rem;
|
| 1211 |
+
font-weight: 700;
|
| 1212 |
+
color: #7a7368;
|
| 1213 |
+
letter-spacing: 0.03em;
|
| 1214 |
+
}
|
| 1215 |
+
|
| 1216 |
}
|