File size: 22,820 Bytes
edcaa7f fb3df2e edcaa7f fb3df2e edcaa7f fb3df2e edcaa7f fb3df2e edcaa7f bcb2aa6 fdc1e03 bcb2aa6 28569b7 bcb2aa6 edcaa7f 0da3d51 edcaa7f 0da3d51 bcb2aa6 edcaa7f bcb2aa6 0da3d51 bcb2aa6 edcaa7f fb3df2e edcaa7f fb3df2e edcaa7f cfb3156 edcaa7f fb3df2e edcaa7f bcb2aa6 96eb17e 0da3d51 caf6918 bcb2aa6 6421da1 bcb2aa6 0da3d51 bcb2aa6 0da3d51 96eb17e 0da3d51 bcb2aa6 0da3d51 6421da1 0da3d51 fdc1e03 28569b7 0da3d51 96eb17e 0da3d51 bcb2aa6 0da3d51 6421da1 0da3d51 bcb2aa6 caf6918 bcb2aa6 caf6918 bcb2aa6 caf6918 bcb2aa6 edcaa7f caf6918 bcb2aa6 edcaa7f caf6918 bcb2aa6 caf6918 bcb2aa6 edcaa7f bcb2aa6 edcaa7f caf6918 edcaa7f 0da3d51 caf6918 0da3d51 caf6918 0da3d51 caf6918 0da3d51 edcaa7f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | const sampleData = [
{
qaId: "0KoG04_heLE@1@character_2@belief@first",
id: "0KoG04_heLE",
dimension: "Belief",
order: "First-order",
target: "Man",
question: "What does the Woman believe about the Man's memory of their breakup at the moment he claims not to remember?",
openEndedAnswer:
"The Woman perceives the Man's claim of forgetfulness as a deliberate act to avoid taking responsibility for their breakup. His neutral expression and evasive body language suggest he is not genuinely confused but rather using forgetfulness as a tactic.",
options: [
{ label: "A", text: "She believes he has blocked out the breakup details because facing them would be too painful for him." },
{ label: "B", text: "She believes he is mixing up the reasons for their breakup because so much time has passed and details have gotten fuzzy." },
{ label: "C", text: "She believes he wants to move forward and avoid revisiting their breakup because he thinks it will help them start fresh." },
{ label: "D", text: "She believes he is deliberately pretending to forget to avoid blame.", correct: true }
]
},
{
qaId: "-2KGPYEFnsU@2@character_1@belief@second",
id: "-2KGPYEFnsU",
dimension: "Belief",
order: "Second-order",
target: "Dark-Haired Woman",
question: "What does the Blonde Woman think the Dark-Haired Woman believes about her situation?",
openEndedAnswer:
"The Blonde Woman perceives the Dark-Haired Woman's apologetic tone and slight smile as signs of empathy, suggesting she sees her as a victim needing sympathy. However, the Blonde Woman might also sense that the Dark-Haired Woman feels awkward, possibly viewing her as someone who dwells on the negative.",
options: [
{ label: "A", text: "She thinks the Dark-Haired Woman is trying to be supportive on the surface, but actually feels detached and uninterested in what she's going through." },
{ label: "B", text: "She thinks the Dark-Haired Woman sees her as a victim needing sympathy but maybe also as someone who dwells on the negative.", correct: true },
{ label: "C", text: "She thinks the Dark-Haired Woman assumes she made poor choices that led to the failed date, putting the responsibility solely on her actions." },
{ label: "D", text: "She thinks the Dark-Haired Woman suspects she is making her problems seem worse than they are, possibly to get comfort or reassurance from people around her." }
]
},
{
qaId: "A-I3dc0Gct8@3@Willa@desire@first",
id: "A-I3dc0Gct8",
dimension: "Desire",
order: "First-order",
target: "Willa",
question: "What does Willa desire when she says Will is supposed to be with someone like Becca?",
openEndedAnswer:
"Willa desires reassurance from Will that she is worthy despite her doubts. Her vulnerable tone suggests she wants him to prove her fears wrong.",
options: [
{ label: "A", text: "She desires Will to acknowledge that Becca is a better match for him, hoping he will choose Becca and end their relationship." },
{ label: "B", text: "She desires Will to openly discuss their future together, wanting clarity and communication about where their relationship is headed." },
{ label: "C", text: "She desires Will to prove her wrong and reassure her worth, wanting him to show that she is valuable to him deeply.", correct: true },
{ label: "D", text: "She desires evidence that her insecurities are justified, seeking affirmation that she is unworthy of Will and that her doubts are correct." }
]
},
{
qaId: "07YuuA_2O9w@3@Gina@desire@second",
id: "07YuuA_2O9w",
dimension: "Desire",
order: "Second-order",
target: "Jorge",
question: "What does Gina think Jorge desires by giving this lecture?",
openEndedAnswer:
"Gina perceives Jorge's aggressive tone and gestures as an attempt to intimidate her into submission, reinforcing his control over the workplace dynamics.",
options: [
{ label: "A", text: "Jorge desires Gina to demonstrate more initiative in managing the salon's daily operations." },
{ label: "B", text: "Jorge desires to repair his relationship with Gina by fostering open and respectful communication." },
{ label: "C", text: "Jorge desires to express vulnerability about his recent struggles that are not connected to their professional relationship." },
{ label: "D", text: "Jorge desires to reassert control and ensure Gina's obedience through intimidation.", correct: true }
]
},
{
qaId: "0-HM2VCdrC0@1@Jane@intention@first",
id: "0-HM2VCdrC0",
dimension: "Intention",
order: "First-order",
target: "Kevin",
question: "What does Jane intend to achieve by talking to Kevin?",
openEndedAnswer:
"Jane's sharp questioning and defensive posture indicate she wants to maintain emotional distance from Kevin, protecting herself from further manipulation.",
options: [
{ label: "A", text: "To push Kevin away and assert emotional distance because she wants to protect herself", correct: true },
{ label: "B", text: "To question Kevin's motives and see if he will admit to using her, while remaining emotionally detached" },
{ label: "C", text: "To give Kevin an opportunity to defend his actions, hoping he will take responsibility and show genuine remorse" },
{ label: "D", text: "To keep Kevin uncertain about her true feelings by alternating between confrontation and brief moments of listening" }
]
},
{
qaId: "HgQDAW28DsA@1@character_1@intention@second",
id: "HgQDAW28DsA",
dimension: "Intention",
order: "Second-order",
target: "Woman",
question: "What does Man think the woman intends by saying 'We'll see'?",
openEndedAnswer:
"The woman's challenging tone suggests she is skeptical of the man's claim about his emotional state. Her response 'We'll see' implies she intends to test whether he is truly emotionally detached or if he is masking deeper feelings.",
options: [
{ label: "A", text: "She wants to distance herself from the conversation, signaling that she prefers to avoid discussing their issues further." },
{ label: "B", text: "She uses 'We'll see' to subtly mock Man's statement, indicating she doesn't believe he means what he says." },
{ label: "C", text: "She hopes to prompt Man to express more about his feelings, inviting honest emotional conversation." },
{ label: "D", text: "She intends to test or challenge Man's claim, doubting his emotional detachment", correct: true }
]
},
{
qaId: "B2CEGhwMjkQ@5@Juno@emotion@first",
id: "B2CEGhwMjkQ",
dimension: "Emotion",
order: "First-order",
target: "Juno",
question: "What complex emotion is Juno experiencing while talking?",
openEndedAnswer:
"Juno's sarcastic tone masks a complex mix of hurt, disappointment, and defiance. Her gestures and gaze indicate she is struggling with these layered emotions.",
options: [
{ label: "A", text: "Detachment and mild irritation about the conversation" },
{ label: "B", text: "Overwhelming frustration with no underlying vulnerability" },
{ label: "C", text: "Amusement concealed beneath a layer of wit and irony" },
{ label: "D", text: "A mix of hurt, disappointment, and defiance", correct: true }
]
},
{
qaId: "h9Zol4P5u9Q@4@character_2@emotion@second",
id: "h9Zol4P5u9Q",
dimension: "Emotion",
order: "Second-order",
target: "Man",
question: "What does Woman think the man believes about her emotional state during her outburst?",
openEndedAnswer:
"The woman believes the man thinks she is overreacting because his lack of visible concern or engagement suggests he dismisses her emotional display as exaggerated.",
options: [
{ label: "A", text: "She thinks he believes she is overreacting and being overly emotional without cause.", correct: true },
{ label: "B", text: "She thinks he believes she is sincerely upset because of Sylvia's death and that he should comfort her." },
{ label: "C", text: "She thinks he believes she is exaggerating for effect and not actually serious about her accusations." },
{ label: "D", text: "She thinks he believes she is emotionally detached and unaffected by the situation with Sylvia." }
]
},
{
qaId: "89yPHsT8AKg@74@knowledge@first_order",
id: "89yPHsT8AKg",
dimension: "Knowledge",
order: "First-order",
target: "Male protagonist",
question: "Does the male protagonist know who replaced the female protagonist?",
openEndedAnswer:
"Yes. He knows, despite saying 'None,' because he consistently avoids eye contact with the female protagonist, keeps his head down, and appears tense.",
options: [
{ label: "A", text: "Yes", correct: true },
{ label: "B", text: "No" }
]
},
{
qaId: "ARD5AUnl7_M@33@knowledge@tom_second_order",
id: "ARD5AUnl7_M",
dimension: "Knowledge",
order: "Second-order",
target: "The man",
question: "Does the man know that his wife feels both comforted and sad by what he said, and is crying with her face covered?",
openEndedAnswer:
"No. He does not know because he has his back turned to the woman and cannot know the specifics, even if he might guess some of it.",
options: [
{ label: "A", text: "Yes" },
{ label: "B", text: "No", correct: true }
]
}
];
// Dimension names rendered as filter chips, in display order. Each name is
// compared against a sample's `dimension` field to pick the visible samples.
const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
// Open-ended (OE) results, keyed by judge id ("gpt4o" / "gemini").
//   label      - display name of the judge, shown in the results caption.
//   deltaLabel - name shown after "vs" in the caption, i.e. the judge the
//                per-metric deltas are computed against.
//   rows       - one entry per model: `oeAcc` is rendered as a percentage,
//                `oeScr` with two decimals, `rank` is the final sort
//                tie-breaker, and `isOurs` marks the highlighted row.
const judgeResults = {
gpt4o: {
label: "GPT-4o",
deltaLabel: "Gemini",
rows: [
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
{ model: "GPT-4o-mini", rank: 2, oeAcc: 32.2, oeScr: 2.6 },
{ model: "Qwen3-Omni", rank: 3, oeAcc: 29.1, oeScr: 2.3 },
{ model: "Qwen3-VL", rank: 4, oeAcc: 28.6, oeScr: 2.2 },
{ model: "GPT-5-mini", rank: 5, oeAcc: 28.0, oeScr: 2.11 },
{ model: "GLM-4.6V", rank: 6, oeAcc: 16.6, oeScr: 1.9 },
{ model: "MiniCPM-V-4.5", rank: 7, oeAcc: 14.6, oeScr: 1.7 },
{ model: "Emotion-Qwen", rank: 8, oeAcc: 12.1, oeScr: 1.6 },
{ model: "Video-LLaVA", rank: 9, oeAcc: 8.4, oeScr: 1.7 },
{ model: "AffectGPT", rank: 10, oeAcc: 4.8, oeScr: 1.1 }
]
},
gemini: {
label: "Gemini-2.5-Flash",
deltaLabel: "GPT-4o",
rows: [
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 47.5, oeScr: 2.45, isOurs: true },
{ model: "Qwen3-VL", rank: 2, oeAcc: 40.0, oeScr: 2.3 },
{ model: "GPT-4o-mini", rank: 3, oeAcc: 37.5, oeScr: 2.23 },
{ model: "GLM-4.6V", rank: 4, oeAcc: 35.0, oeScr: 2.02 },
{ model: "GPT-5-mini", rank: 5, oeAcc: 25.0, oeScr: 1.4 },
{ model: "Qwen3-Omni", rank: 6, oeAcc: 22.5, oeScr: 1.68 },
{ model: "Emotion-Qwen", rank: 7, oeAcc: 20.0, oeScr: 1.43 },
{ model: "MiniCPM-V-4.5", rank: 8, oeAcc: 17.5, oeScr: 1.35 },
{ model: "AffectGPT", rank: 9, oeAcc: 15.0, oeScr: 1.35 },
{ model: "Video-LLaVA", rank: 10, oeAcc: 12.5, oeScr: 1.43 }
]
}
};
// Multiple-choice (MC) results: one entry per model, with `mcAcc` rendered as
// a percentage and `isOurs` marking the highlighted row. The list is re-sorted
// by `mcAcc` at render time, so the order here is not significant.
// NOTE(review): GPT-5-mini appears in judgeResults but has no entry here —
// confirm the omission is intentional.
const mcResults = [
{ model: "WildToM-Reasoner", mcAcc: 72.7, isOurs: true },
{ model: "Qwen3-VL", mcAcc: 62.1 },
{ model: "Qwen3-Omni", mcAcc: 61.8 },
{ model: "GPT-4o-mini", mcAcc: 57.2 },
{ model: "Emotion-Qwen", mcAcc: 54.2 },
{ model: "GLM-4.6V", mcAcc: 51.2 },
{ model: "MiniCPM-V-4.5", mcAcc: 46.8 },
{ model: "AffectGPT", mcAcc: 35.9 },
{ model: "Video-LLaVA", mcAcc: 25.8 }
];
// Container elements looked up once at load time. getElementById returns null
// when an id is absent from the page, so each of these may be null.
// Holds the dimension filter chips.
const filterRoot = document.getElementById("showcase-filters");
// Holds the currently displayed sample card.
const showcaseRoot = document.getElementById("showcase-browser");
// Holds the results header and one row per model.
const resultsRoot = document.getElementById("results-table");
// Holds the OE / MC task toggle buttons.
const taskToggleRoot = document.getElementById("task-toggle");
// Holds the judge toggle buttons (only populated for the OE task).
const judgeToggleRoot = document.getElementById("judge-toggle");
// Receives the one-line caption describing the current results view.
const resultsCaptionRoot = document.getElementById("results-caption");
// Current position of the sample showcase; renderSamples writes both fields
// on every render.
const carouselState = {
// Dimension whose samples are currently shown.
activeDimension: "Belief",
// Zero-based index of the shown sample within that dimension.
activeIndex: 0
};
// Current selection of the results panel; renderResultsPanel overwrites all
// three fields on every call.
const crossJudgeState = {
// "oe" (open-ended, cross-judge) or "mc" (multiple choice).
activeTask: "oe",
// Key into judgeResults: "gpt4o" or "gemini".
activeJudge: "gpt4o",
// Stored and passed through by the toggle handlers; no renderer in this file
// branches on it.
activeMetric: "oeAcc"
};
/**
 * Rebuilds the row of dimension filter chips.
 *
 * The container is cleared and one button is created per entry in `filters`.
 * Clicking a chip re-renders the showcase for that dimension (starting at its
 * first sample).
 *
 * @param {string} active - Dimension whose chip should be marked as active.
 * @returns {void}
 */
function renderFilters(active) {
  // Pages without the filter container are skipped instead of throwing,
  // matching the null guards in renderControls and renderResultsPanel.
  if (!filterRoot) {
    return;
  }
  filterRoot.innerHTML = "";
  filters.forEach((filter) => {
    const isActive = filter === active;
    const button = document.createElement("button");
    button.type = "button";
    button.className = `filter-chip${isActive ? " active" : ""}`;
    button.textContent = filter;
    // Expose the selected state to assistive technology, the same way
    // createToggleButton does for the results toggles.
    button.setAttribute("aria-pressed", String(isActive));
    button.addEventListener("click", () => renderSamples(filter));
    filterRoot.appendChild(button);
  });
}
/**
 * Collects the samples that belong to one dimension.
 *
 * @param {string} active - Dimension name to match against `item.dimension`.
 * @returns {Array<object>} Matching entries of `sampleData`, in source order.
 */
function getVisibleSamples(active) {
  const matches = [];
  for (const sample of sampleData) {
    if (sample.dimension === active) {
      matches.push(sample);
    }
  }
  return matches;
}
/**
 * Renders one sample card of the showcase carousel.
 *
 * Updates `carouselState`, refreshes the filter chips, and replaces the
 * contents of the showcase container with a card for the selected sample
 * (video, metadata, question, options and an optional collapsible open-ended
 * answer). The previous/next buttons re-render with the neighbouring index.
 *
 * @param {string} [active=carouselState.activeDimension] - Dimension to show.
 * @param {number} [index=0] - Requested sample index; wrapped into
 *   `[0, visible.length)` so stepping past either end cycles around.
 * @returns {void}
 */
function renderSamples(active = carouselState.activeDimension, index = 0) {
  // All sample fields are inserted through innerHTML, so they are escaped
  // first: a stray "<" or "&" in a question must show up as text rather than
  // being parsed as markup.
  const htmlEscapes = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

  carouselState.activeDimension = active;
  renderFilters(active);

  // Pages without the showcase container are skipped instead of throwing,
  // matching the null guards in renderControls and renderResultsPanel.
  if (!showcaseRoot) {
    return;
  }

  const visible = getVisibleSamples(active);
  if (visible.length === 0) {
    showcaseRoot.innerHTML = "";
    return;
  }

  // Double modulo keeps negative indices (from "previous") in range.
  const safeIndex = ((index % visible.length) + visible.length) % visible.length;
  carouselState.activeIndex = safeIndex;
  const item = visible[safeIndex];
  const gold = item.options.find((option) => option.correct)?.label ?? "N/A";

  const qaId = escapeHtml(item.qaId);
  const order = escapeHtml(item.order);

  const optionsHtml = item.options
    .map(
      (option) => `
        <li class="option-item${option.correct ? " correct" : ""}">
          <span class="option-key">${escapeHtml(option.label)}</span>
          <span class="option-text">${escapeHtml(option.text)}</span>
        </li>
      `
    )
    .join("");

  // The open-ended answer starts collapsed; the block is omitted entirely
  // when a sample has no answer text.
  const detailBlocks = item.openEndedAnswer
    ? `
      <div class="oe-card">
        <button type="button" class="oe-head oe-toggle" aria-expanded="false">
          <span class="oe-badge">Open-Ended Answer</span>
          <span class="oe-caret" aria-hidden="true">▾</span>
        </button>
        <div class="oe-body" hidden>
          <p class="oe-text">${escapeHtml(item.openEndedAnswer)}</p>
        </div>
      </div>
    `
    : "";

  showcaseRoot.innerHTML = "";
  const card = document.createElement("article");
  card.className = "sample-card";
  card.innerHTML = `
    <div class="sample-media">
      <div class="sample-video-wrap">
        <video class="sample-video" controls preload="metadata">
          <source src="./videos/${escapeHtml(encodeURIComponent(item.id))}.mp4" type="video/mp4">
        </video>
      </div>
      <div class="sample-video-note">
        <div class="sample-video-note-title">Case Metadata</div>
        <div class="sample-video-note-line"><span>QA ID</span><strong>${qaId}</strong></div>
        <div class="sample-video-note-line"><span>Target</span><strong>${escapeHtml(item.target || "N/A")}</strong></div>
        <div class="sample-video-note-line"><span>Reasoning</span><strong>${order}</strong></div>
      </div>
    </div>
    <div class="sample-content">
      <div class="sample-header">
        <div class="sample-counter">${escapeHtml(active)} case ${safeIndex + 1} / ${visible.length}</div>
        <div class="sample-nav">
          <button type="button" aria-label="Previous sample" data-nav="prev">←</button>
          <button type="button" aria-label="Next sample" data-nav="next">→</button>
        </div>
      </div>
      <div class="sample-meta">
        <span class="sample-chip">${escapeHtml(item.dimension)}</span>
        <span class="sample-chip">${order}</span>
        ${item.target ? `<span class="sample-chip">${escapeHtml(item.target)}</span>` : ""}
        <span class="sample-chip qa">QA ${qaId}</span>
        <span class="sample-chip gold">Gold ${escapeHtml(gold)}</span>
      </div>
      <h3 class="sample-question">${escapeHtml(item.question)}</h3>
      <div class="option-block">
        <strong>Options</strong>
        <ol class="option-list">
          ${optionsHtml}
        </ol>
      </div>
      ${detailBlocks}
    </div>
  `;
  showcaseRoot.appendChild(card);

  card.querySelector('[data-nav="prev"]').addEventListener("click", () => {
    renderSamples(active, safeIndex - 1);
  });
  card.querySelector('[data-nav="next"]').addEventListener("click", () => {
    renderSamples(active, safeIndex + 1);
  });

  // Wire the collapsible answer only when it was rendered.
  const oeToggle = card.querySelector(".oe-toggle");
  const oeBody = card.querySelector(".oe-body");
  if (oeToggle && oeBody) {
    oeToggle.addEventListener("click", () => {
      const expanded = oeToggle.getAttribute("aria-expanded") === "true";
      oeToggle.setAttribute("aria-expanded", String(!expanded));
      oeBody.hidden = expanded;
    });
  }
}
/**
 * Reads one open-ended metric from a result row.
 *
 * @param {{oeAcc: number, oeScr: number}} item - Result row.
 * @param {string} metric - "oeAcc" selects accuracy; any other value selects
 *   the score.
 * @returns {number} The selected metric value.
 */
function getMetricValue(item, metric) {
  const field = metric === "oeAcc" ? "oeAcc" : "oeScr";
  return item[field];
}
/**
 * Returns the key of the judge that is not currently selected.
 *
 * @param {string} activeJudge - Currently selected judge key.
 * @returns {string} "gemini" when `activeJudge` is "gpt4o", otherwise "gpt4o".
 */
function getOtherJudge(activeJudge) {
  if (activeJudge !== "gpt4o") {
    return "gpt4o";
  }
  return "gemini";
}
/**
 * Formats a metric value for display.
 *
 * @param {number} value - Raw metric value.
 * @param {string} metric - "oeAcc" formats as a percentage with one decimal;
 *   any other value formats as a plain number with two decimals.
 * @returns {string} The formatted value.
 */
function formatMetricValue(value, metric) {
  if (metric !== "oeAcc") {
    return value.toFixed(2);
  }
  const percent = value.toFixed(1);
  return `${percent}%`;
}
/**
 * Formats a signed difference between two judges' values for one metric.
 *
 * Non-negative deltas get an explicit "+" prefix; negative deltas keep the
 * "-" produced by `toFixed`. A delta that rounds to zero at the displayed
 * precision is always shown as "+0.0 pp" / "+0.00", never as "-0.0 pp" /
 * "-0.00".
 *
 * @param {number} delta - Difference to display.
 * @param {string} metric - "oeAcc" formats with one decimal and a " pp"
 *   (percentage points) suffix; any other value formats with two decimals.
 * @returns {string} The formatted, signed delta.
 */
function formatDeltaValue(delta, metric) {
  const isAccuracy = metric === "oeAcc";
  const fixed = delta.toFixed(isAccuracy ? 1 : 2);
  // toFixed keeps the minus sign for small negatives that round to zero
  // (e.g. (-0.04).toFixed(1) === "-0.0"); treat those as an unsigned zero.
  const roundsToZero = Number(fixed) === 0;
  const digits = roundsToZero ? fixed.replace("-", "") : fixed;
  const sign = delta >= 0 || roundsToZero ? "+" : "";
  return isAccuracy ? `${sign}${digits} pp` : `${sign}${digits}`;
}
/**
 * Maps a delta to the CSS modifier class used for its colouring.
 *
 * @param {number} delta - Difference between two judges' values.
 * @returns {string} "neutral" when the magnitude is below 0.05, "positive"
 *   for larger positive deltas, and "negative" otherwise.
 */
function getDeltaClass(delta) {
  const magnitude = Math.abs(delta);
  if (magnitude < 0.05) {
    return "neutral";
  }
  if (delta > 0) {
    return "positive";
  }
  return "negative";
}
/**
 * Builds one button for a toggle group.
 *
 * @param {string} label - Text shown on the button.
 * @param {boolean} isActive - Whether this button is the selected one; adds
 *   the "active" class, sets `aria-pressed`, and adds `aria-current="true"`
 *   when selected.
 * @param {Function} onClick - Click handler attached to the button.
 * @returns {HTMLButtonElement} The configured, not yet attached, button.
 */
function createToggleButton(label, isActive, onClick) {
  const toggle = Object.assign(document.createElement("button"), {
    type: "button",
    className: isActive ? "toggle-btn active" : "toggle-btn",
    textContent: label,
  });

  const ariaState = [["aria-pressed", String(isActive)]];
  if (isActive) {
    ariaState.push(["aria-current", "true"]);
  }
  for (const [name, value] of ariaState) {
    toggle.setAttribute(name, value);
  }

  toggle.addEventListener("click", onClick);
  return toggle;
}
/**
 * Rebuilds the task toggle (OE / MC) and, for the OE task, the judge toggle.
 *
 * Does nothing when either toggle container is missing. For the MC task the
 * judge container, and its enclosing ".toolbar-row" if there is one, are
 * hidden with the "is-hidden" class and left empty.
 *
 * @returns {void}
 */
function renderControls() {
  if (!(taskToggleRoot && judgeToggleRoot)) {
    return;
  }

  const judgeRow = judgeToggleRoot.closest(".toolbar-row");

  // Task toggle: always shown, one button per task.
  const tasks = [
    { key: "oe", caption: "OE (cross-judge)" },
    { key: "mc", caption: "MC (main benchmark)" },
  ];
  taskToggleRoot.innerHTML = "";
  for (const { key, caption } of tasks) {
    const selectTask = () => {
      renderResultsPanel(key, crossJudgeState.activeJudge, crossJudgeState.activeMetric);
    };
    taskToggleRoot.appendChild(
      createToggleButton(caption, key === crossJudgeState.activeTask, selectTask)
    );
  }

  // Judge toggle: only meaningful for the open-ended task.
  const hideJudges = crossJudgeState.activeTask !== "oe";
  judgeToggleRoot.innerHTML = "";
  judgeToggleRoot.classList.toggle("is-hidden", hideJudges);
  judgeRow?.classList.toggle("is-hidden", hideJudges);
  if (hideJudges) {
    return;
  }

  const judges = [
    { key: "gpt4o", label: "GPT-4o" },
    { key: "gemini", label: "Gemini-2.5-Flash" },
  ];
  for (const { key, label } of judges) {
    const selectJudge = () => {
      renderResultsPanel(crossJudgeState.activeTask, key, crossJudgeState.activeMetric);
    };
    judgeToggleRoot.appendChild(
      createToggleButton(label, key === crossJudgeState.activeJudge, selectJudge)
    );
  }
}
/**
 * Renders the open-ended (OE) results table for one judge.
 *
 * Rows are sorted by OE accuracy, then OE score (both descending), then by
 * the stored rank. Each metric cell also shows the difference to the same
 * model's value under the other judge; a model missing from the other
 * judge's rows gets a delta of 0.
 *
 * @param {string} activeJudge - Key into `judgeResults` ("gpt4o" or "gemini").
 * @returns {void}
 */
function renderOeResults(activeJudge) {
  const otherJudge = getOtherJudge(activeJudge);
  // Copy before sorting so the source data keeps its original order.
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
    const accDiff = b.oeAcc - a.oeAcc;
    if (Math.abs(accDiff) > 1e-6) {
      return accDiff;
    }
    const scrDiff = b.oeScr - a.oeScr;
    if (Math.abs(scrDiff) > 1e-6) {
      return scrDiff;
    }
    return a.rank - b.rank;
  });
  const otherMap = new Map(judgeResults[otherJudge].rows.map((item) => [item.model, item]));

  if (resultsCaptionRoot) {
    // "\u00B7" is the middle dot separator. It is written as an escape so the
    // caption survives the file being re-saved in another encoding (the
    // literal had previously been corrupted into a CJK character).
    resultsCaptionRoot.textContent =
      `${judgeResults[activeJudge].label} \u00B7 Sorted by OE_acc \u00B7 per-metric deltas shown vs ${judgeResults[activeJudge].deltaLabel}`;
  }

  resultsRoot.innerHTML = "";
  const header = document.createElement("div");
  header.className = "results-head results-head-oe";
  header.innerHTML = `
    <div></div>
    <div class="results-head-cell">Model</div>
    <div class="results-head-cell results-head-cell-right">OE_acc</div>
    <div class="results-head-cell results-head-cell-right">OE_scr</div>
  `;
  resultsRoot.appendChild(header);

  currentRows.forEach((item, index) => {
    const row = document.createElement("div");
    row.className = `result-row result-row-oe${item.isOurs ? " ours" : ""}`;
    const other = otherMap.get(item.model);
    const accDelta = other ? item.oeAcc - other.oeAcc : 0;
    const scrDelta = other ? item.oeScr - other.oeScr : 0;
    // The displayed position comes from the sort above, not from item.rank.
    row.innerHTML = `
      <div class="result-rank">#${index + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${item.model}</div>
        ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-metric-col acc" data-label="OE_acc">
        <div class="result-score">${formatMetricValue(item.oeAcc, "oeAcc")}</div>
        <div class="result-delta ${getDeltaClass(accDelta)}">${formatDeltaValue(accDelta, "oeAcc")}</div>
      </div>
      <div class="result-metric-col scr" data-label="OE_scr">
        <div class="result-score">${formatMetricValue(item.oeScr, "oeScr")}</div>
        <div class="result-delta ${getDeltaClass(scrDelta)}">${formatDeltaValue(scrDelta, "oeScr")}</div>
      </div>
    `;
    resultsRoot.appendChild(row);
  });
}
/**
 * Renders the multiple-choice (MC) results table.
 *
 * Rows are sorted by MC accuracy, descending. The last column carries a fixed
 * "single-judge MC setting" note instead of a cross-judge delta.
 *
 * @returns {void}
 */
function renderMcResults() {
  // Copy before sorting so the source data keeps its original order.
  const ranked = [...mcResults].sort((a, b) => b.mcAcc - a.mcAcc);

  if (resultsCaptionRoot) {
    // "\u00B7" is the middle dot separator. It is written as an escape so the
    // caption survives the file being re-saved in another encoding (the
    // literal had previously been corrupted into a CJK character).
    resultsCaptionRoot.textContent = "Main benchmark (MC) \u00B7 Sorted by MC_acc";
  }

  resultsRoot.innerHTML = "";
  const header = document.createElement("div");
  header.className = "results-head results-head-mc";
  header.innerHTML = `
    <div></div>
    <div class="results-head-cell">Model</div>
    <div class="results-head-cell results-head-cell-right">MC_acc</div>
    <div class="results-head-cell results-head-cell-right">Setting</div>
  `;
  resultsRoot.appendChild(header);

  ranked.forEach((item, index) => {
    const row = document.createElement("div");
    row.className = `result-row result-row-mc${item.isOurs ? " ours" : ""}`;
    row.innerHTML = `
      <div class="result-rank">#${index + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${item.model}</div>
        ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-score">${item.mcAcc.toFixed(1)}%</div>
      <div class="result-delta neutral">single-judge MC setting</div>
    `;
    resultsRoot.appendChild(row);
  });
}
/**
 * Entry point for the results panel: records the selection, then redraws the
 * toggle controls and the matching results table.
 *
 * Nothing is recorded or drawn when the results container is missing.
 *
 * @param {string} [activeTask=crossJudgeState.activeTask] - "mc" renders the
 *   multiple-choice table; any other value renders the open-ended table.
 * @param {string} [activeJudge=crossJudgeState.activeJudge] - Judge key used
 *   for the open-ended table.
 * @param {string} [activeMetric=crossJudgeState.activeMetric] - Stored in the
 *   shared state.
 * @returns {void}
 */
function renderResultsPanel(
  activeTask = crossJudgeState.activeTask,
  activeJudge = crossJudgeState.activeJudge,
  activeMetric = crossJudgeState.activeMetric
) {
  if (!resultsRoot) {
    return;
  }

  Object.assign(crossJudgeState, { activeTask, activeJudge, activeMetric });
  renderControls();

  if (crossJudgeState.activeTask !== "mc") {
    renderOeResults(crossJudgeState.activeJudge);
    return;
  }
  renderMcResults();
}
// Initial paint: open-ended results judged by GPT-4o, then the first Belief
// sample in the showcase.
renderResultsPanel("oe", "gpt4o", "oeAcc");
renderSamples("Belief", 0);
|