// WildToM / script.js
// county
// Align cross-judge layout and remove full-set label
// fdc1e03
/**
 * Hard-coded showcase cases displayed by the sample browser (see renderSamples).
 *
 * Fields read by the renderer for each entry:
 *  - qaId: shown in the "Case Metadata" panel and in the "QA ..." chip.
 *  - id: used to build the video path `./videos/<id>.mp4`.
 *  - dimension: compared against the active filter to decide visibility.
 *  - order, target: shown as chips and in the metadata panel.
 *  - question: card heading.
 *  - openEndedAnswer: body of the collapsible "Open-Ended Answer" section.
 *  - options: list of { label, text, correct? }; the option flagged
 *    `correct: true` is highlighted and its label is shown as the gold answer.
 */
const sampleData = [
{
qaId: "0KoG04_heLE@1@character_2@belief@first",
id: "0KoG04_heLE",
dimension: "Belief",
order: "First-order",
target: "Man",
question: "What does the Woman believe about the Man's memory of their breakup at the moment he claims not to remember?",
openEndedAnswer:
"The Woman perceives the Man's claim of forgetfulness as a deliberate act to avoid taking responsibility for their breakup. His neutral expression and evasive body language suggest he is not genuinely confused but rather using forgetfulness as a tactic.",
options: [
{ label: "A", text: "She believes he has blocked out the breakup details because facing them would be too painful for him." },
{ label: "B", text: "She believes he is mixing up the reasons for their breakup because so much time has passed and details have gotten fuzzy." },
{ label: "C", text: "She believes he wants to move forward and avoid revisiting their breakup because he thinks it will help them start fresh." },
{ label: "D", text: "She believes he is deliberately pretending to forget to avoid blame.", correct: true }
]
},
{
qaId: "-2KGPYEFnsU@2@character_1@belief@second",
id: "-2KGPYEFnsU",
dimension: "Belief",
order: "Second-order",
target: "Dark-Haired Woman",
question: "What does the Blonde Woman think the Dark-Haired Woman believes about her situation?",
openEndedAnswer:
"The Blonde Woman perceives the Dark-Haired Woman's apologetic tone and slight smile as signs of empathy, suggesting she sees her as a victim needing sympathy. However, the Blonde Woman might also sense that the Dark-Haired Woman feels awkward, possibly viewing her as someone who dwells on the negative.",
options: [
{ label: "A", text: "She thinks the Dark-Haired Woman is trying to be supportive on the surface, but actually feels detached and uninterested in what she's going through." },
{ label: "B", text: "She thinks the Dark-Haired Woman sees her as a victim needing sympathy but maybe also as someone who dwells on the negative.", correct: true },
{ label: "C", text: "She thinks the Dark-Haired Woman assumes she made poor choices that led to the failed date, putting the responsibility solely on her actions." },
{ label: "D", text: "She thinks the Dark-Haired Woman suspects she is making her problems seem worse than they are, possibly to get comfort or reassurance from people around her." }
]
},
{
qaId: "A-I3dc0Gct8@3@Willa@desire@first",
id: "A-I3dc0Gct8",
dimension: "Desire",
order: "First-order",
target: "Willa",
question: "What does Willa desire when she says Will is supposed to be with someone like Becca?",
openEndedAnswer:
"Willa desires reassurance from Will that she is worthy despite her doubts. Her vulnerable tone suggests she wants him to prove her fears wrong.",
options: [
{ label: "A", text: "She desires Will to acknowledge that Becca is a better match for him, hoping he will choose Becca and end their relationship." },
{ label: "B", text: "She desires Will to openly discuss their future together, wanting clarity and communication about where their relationship is headed." },
{ label: "C", text: "She desires Will to prove her wrong and reassure her worth, wanting him to show that she is valuable to him deeply.", correct: true },
{ label: "D", text: "She desires evidence that her insecurities are justified, seeking affirmation that she is unworthy of Will and that her doubts are correct." }
]
},
{
qaId: "07YuuA_2O9w@3@Gina@desire@second",
id: "07YuuA_2O9w",
dimension: "Desire",
order: "Second-order",
target: "Jorge",
question: "What does Gina think Jorge desires by giving this lecture?",
openEndedAnswer:
"Gina perceives Jorge's aggressive tone and gestures as an attempt to intimidate her into submission, reinforcing his control over the workplace dynamics.",
options: [
{ label: "A", text: "Jorge desires Gina to demonstrate more initiative in managing the salon's daily operations." },
{ label: "B", text: "Jorge desires to repair his relationship with Gina by fostering open and respectful communication." },
{ label: "C", text: "Jorge desires to express vulnerability about his recent struggles that are not connected to their professional relationship." },
{ label: "D", text: "Jorge desires to reassert control and ensure Gina's obedience through intimidation.", correct: true }
]
},
{
qaId: "0-HM2VCdrC0@1@Jane@intention@first",
id: "0-HM2VCdrC0",
dimension: "Intention",
order: "First-order",
target: "Kevin",
question: "What does Jane intend to achieve by talking to Kevin?",
openEndedAnswer:
"Jane's sharp questioning and defensive posture indicate she wants to maintain emotional distance from Kevin, protecting herself from further manipulation.",
options: [
{ label: "A", text: "To push Kevin away and assert emotional distance because she wants to protect herself", correct: true },
{ label: "B", text: "To question Kevin's motives and see if he will admit to using her, while remaining emotionally detached" },
{ label: "C", text: "To give Kevin an opportunity to defend his actions, hoping he will take responsibility and show genuine remorse" },
{ label: "D", text: "To keep Kevin uncertain about her true feelings by alternating between confrontation and brief moments of listening" }
]
},
{
qaId: "HgQDAW28DsA@1@character_1@intention@second",
id: "HgQDAW28DsA",
dimension: "Intention",
order: "Second-order",
target: "Woman",
question: "What does Man think the woman intends by saying 'We'll see'?",
openEndedAnswer:
"The woman's challenging tone suggests she is skeptical of the man's claim about his emotional state. Her response 'We'll see' implies she intends to test whether he is truly emotionally detached or if he is masking deeper feelings.",
options: [
{ label: "A", text: "She wants to distance herself from the conversation, signaling that she prefers to avoid discussing their issues further." },
{ label: "B", text: "She uses 'We'll see' to subtly mock Man's statement, indicating she doesn't believe he means what he says." },
{ label: "C", text: "She hopes to prompt Man to express more about his feelings, inviting honest emotional conversation." },
{ label: "D", text: "She intends to test or challenge Man's claim, doubting his emotional detachment", correct: true }
]
},
{
qaId: "B2CEGhwMjkQ@5@Juno@emotion@first",
id: "B2CEGhwMjkQ",
dimension: "Emotion",
order: "First-order",
target: "Juno",
question: "What complex emotion is Juno experiencing while talking?",
openEndedAnswer:
"Juno's sarcastic tone masks a complex mix of hurt, disappointment, and defiance. Her gestures and gaze indicate she is struggling with these layered emotions.",
options: [
{ label: "A", text: "Detachment and mild irritation about the conversation" },
{ label: "B", text: "Overwhelming frustration with no underlying vulnerability" },
{ label: "C", text: "Amusement concealed beneath a layer of wit and irony" },
{ label: "D", text: "A mix of hurt, disappointment, and defiance", correct: true }
]
},
{
qaId: "h9Zol4P5u9Q@4@character_2@emotion@second",
id: "h9Zol4P5u9Q",
dimension: "Emotion",
order: "Second-order",
target: "Man",
question: "What does Woman think the man believes about her emotional state during her outburst?",
openEndedAnswer:
"The woman believes the man thinks she is overreacting because his lack of visible concern or engagement suggests he dismisses her emotional display as exaggerated.",
options: [
{ label: "A", text: "She thinks he believes she is overreacting and being overly emotional without cause.", correct: true },
{ label: "B", text: "She thinks he believes she is sincerely upset because of Sylvia's death and that he should comfort her." },
{ label: "C", text: "She thinks he believes she is exaggerating for effect and not actually serious about her accusations." },
{ label: "D", text: "She thinks he believes she is emotionally detached and unaffected by the situation with Sylvia." }
]
},
{
qaId: "89yPHsT8AKg@74@knowledge@first_order",
id: "89yPHsT8AKg",
dimension: "Knowledge",
order: "First-order",
target: "Male protagonist",
question: "Does the male protagonist know who replaced the female protagonist?",
openEndedAnswer:
"Yes. He knows, despite saying 'None,' because he consistently avoids eye contact with the female protagonist, keeps his head down, and appears tense.",
options: [
{ label: "A", text: "Yes", correct: true },
{ label: "B", text: "No" }
]
},
{
qaId: "ARD5AUnl7_M@33@knowledge@tom_second_order",
id: "ARD5AUnl7_M",
dimension: "Knowledge",
order: "Second-order",
target: "The man",
question: "Does the man know that his wife feels both comforted and sad by what he said, and is crying with her face covered?",
openEndedAnswer:
"No. He does not know because he has his back turned to the woman and cannot know the specifics, even if he might guess some of it.",
options: [
{ label: "A", text: "Yes" },
{ label: "B", text: "No", correct: true }
]
}
];
// Dimension names rendered as filter chips, in display order. Each value is
// compared against `dimension` on the sampleData entries to select visible cases.
const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
/**
 * Open-ended (OE) results, keyed by judge ("gpt4o" / "gemini").
 *
 * Per judge:
 *  - label: judge name shown at the start of the results caption.
 *  - deltaLabel: name of the other judge, shown in the caption as the
 *    baseline the per-metric deltas are computed against.
 *  - rows: one entry per model with `oeAcc` (rendered as a percentage) and
 *    `oeScr` (rendered with two decimals). Rows are re-sorted at render time
 *    by oeAcc, then oeScr; `rank` is only used as the final tie-break.
 *    `isOurs` marks the row that receives the "Ours" tag and styling.
 *
 * Deltas are computed by matching rows across judges on `model`.
 */
const judgeResults = {
gpt4o: {
label: "GPT-4o",
deltaLabel: "Gemini",
rows: [
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
{ model: "GPT-4o-mini", rank: 2, oeAcc: 32.2, oeScr: 2.6 },
{ model: "Qwen3-Omni", rank: 3, oeAcc: 29.1, oeScr: 2.3 },
{ model: "Qwen3-VL", rank: 4, oeAcc: 28.6, oeScr: 2.2 },
{ model: "GPT-5-mini", rank: 5, oeAcc: 28.0, oeScr: 2.11 },
{ model: "GLM-4.6V", rank: 6, oeAcc: 16.6, oeScr: 1.9 },
{ model: "MiniCPM-V-4.5", rank: 7, oeAcc: 14.6, oeScr: 1.7 },
{ model: "Emotion-Qwen", rank: 8, oeAcc: 12.1, oeScr: 1.6 },
{ model: "Video-LLaVA", rank: 9, oeAcc: 8.4, oeScr: 1.7 },
{ model: "AffectGPT", rank: 10, oeAcc: 4.8, oeScr: 1.1 }
]
},
gemini: {
label: "Gemini-2.5-Flash",
deltaLabel: "GPT-4o",
rows: [
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 47.5, oeScr: 2.45, isOurs: true },
{ model: "Qwen3-VL", rank: 2, oeAcc: 40.0, oeScr: 2.3 },
{ model: "GPT-4o-mini", rank: 3, oeAcc: 37.5, oeScr: 2.23 },
{ model: "GLM-4.6V", rank: 4, oeAcc: 35.0, oeScr: 2.02 },
{ model: "GPT-5-mini", rank: 5, oeAcc: 25.0, oeScr: 1.4 },
{ model: "Qwen3-Omni", rank: 6, oeAcc: 22.5, oeScr: 1.68 },
{ model: "Emotion-Qwen", rank: 7, oeAcc: 20.0, oeScr: 1.43 },
{ model: "MiniCPM-V-4.5", rank: 8, oeAcc: 17.5, oeScr: 1.35 },
{ model: "AffectGPT", rank: 9, oeAcc: 15.0, oeScr: 1.35 },
{ model: "Video-LLaVA", rank: 10, oeAcc: 12.5, oeScr: 1.43 }
]
}
};
/**
 * Multiple-choice (MC) results, one entry per model. `mcAcc` is rendered as a
 * percentage and rows are sorted by it at render time; `isOurs` marks the row
 * that receives the "Ours" tag.
 *
 * NOTE(review): GPT-5-mini appears in judgeResults but has no entry here —
 * confirm whether that omission is intentional.
 */
const mcResults = [
{ model: "WildToM-Reasoner", mcAcc: 72.7, isOurs: true },
{ model: "Qwen3-VL", mcAcc: 62.1 },
{ model: "Qwen3-Omni", mcAcc: 61.8 },
{ model: "GPT-4o-mini", mcAcc: 57.2 },
{ model: "Emotion-Qwen", mcAcc: 54.2 },
{ model: "GLM-4.6V", mcAcc: 51.2 },
{ model: "MiniCPM-V-4.5", mcAcc: 46.8 },
{ model: "AffectGPT", mcAcc: 35.9 },
{ model: "Video-LLaVA", mcAcc: 25.8 }
];
// Mount points, looked up once when the script is evaluated. getElementById
// returns null for an id that is not in the document, so any of these may be
// null on a page that does not include the corresponding section.
const filterRoot = document.getElementById("showcase-filters"); // filter chips
const showcaseRoot = document.getElementById("showcase-browser"); // sample card
const resultsRoot = document.getElementById("results-table"); // results rows
const taskToggleRoot = document.getElementById("task-toggle"); // OE / MC toggle
const judgeToggleRoot = document.getElementById("judge-toggle"); // judge toggle (OE only)
const resultsCaptionRoot = document.getElementById("results-caption"); // caption above results
// Mutable state of the sample browser; written by renderSamples on every render.
const carouselState = {
// Dimension whose cases are currently shown.
activeDimension: "Belief",
// Zero-based index of the shown case within the active dimension.
activeIndex: 0
};
// Mutable state of the results panel; written by renderResultsPanel.
const crossJudgeState = {
// "oe" (open-ended, cross-judge view) or "mc" (multiple-choice view).
activeTask: "oe",
// Key into judgeResults: "gpt4o" or "gemini".
activeJudge: "gpt4o",
// Stored and passed through the toggle callbacks; the visible render
// functions do not read it.
activeMetric: "oeAcc"
};
/**
 * Rebuilds the dimension filter chips and marks one of them as active.
 *
 * Does nothing when the `showcase-filters` mount point is absent, matching
 * how the results-panel renderers tolerate missing roots. Each chip exposes
 * its state through `aria-pressed`, as the results toggle buttons do.
 *
 * @param {string} active - Dimension name whose chip gets the `active` class.
 * @returns {void}
 */
function renderFilters(active) {
  if (!filterRoot) {
    return;
  }

  filterRoot.innerHTML = "";
  for (const filter of filters) {
    const isActive = filter === active;
    const button = document.createElement("button");
    button.type = "button";
    button.className = `filter-chip${isActive ? " active" : ""}`;
    button.textContent = filter;
    button.setAttribute("aria-pressed", String(isActive));
    // Selecting a dimension always restarts its carousel at the first case.
    button.addEventListener("click", () => renderSamples(filter));
    filterRoot.appendChild(button);
  }
}
/**
 * Collects the showcase cases that belong to one dimension.
 *
 * @param {string} active - Dimension name to match exactly.
 * @returns {Array<object>} New array of matching sampleData entries, in their original order.
 */
function getVisibleSamples(active) {
  const matches = [];
  for (const sample of sampleData) {
    if (sample.dimension === active) {
      matches.push(sample);
    }
  }
  return matches;
}
/**
 * Escapes the five HTML-significant characters so a value can be interpolated
 * into markup (element text or a double-quoted attribute) as literal text.
 *
 * @param {*} value - Value to escape; coerced with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(value) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Renders one showcase case for a dimension into the sample browser.
 *
 * Updates carouselState, re-renders the filter chips, then replaces the
 * contents of the showcase root with a card for the selected case and wires
 * its previous/next buttons and the collapsible open-ended answer.
 *
 * Does nothing when the `showcase-browser` mount point is absent. All data
 * fields are HTML-escaped before being placed into the card markup, and the
 * video id is URL-encoded, so case text containing `<`, `&` or quotes is
 * displayed literally instead of being parsed as markup.
 *
 * @param {string} [active] - Dimension to show; defaults to the current one.
 * @param {number} [index=0] - Case index within the dimension; wraps around
 *   in both directions (e.g. -1 selects the last case).
 * @returns {void}
 */
function renderSamples(active = carouselState.activeDimension, index = 0) {
  if (!showcaseRoot) {
    return;
  }

  carouselState.activeDimension = active;
  renderFilters(active);

  const visible = getVisibleSamples(active);
  if (visible.length === 0) {
    showcaseRoot.innerHTML = "";
    return;
  }

  // Double modulo keeps the result in [0, length) for negative indices too.
  const safeIndex = ((index % visible.length) + visible.length) % visible.length;
  carouselState.activeIndex = safeIndex;
  const item = visible[safeIndex];
  const gold = item.options.find((option) => option.correct)?.label ?? "N/A";

  const optionsHtml = item.options
    .map(
      (option) => `
        <li class="option-item${option.correct ? " correct" : ""}">
          <span class="option-key">${escapeHtml(option.label)}</span>
          <span class="option-text">${escapeHtml(option.text)}</span>
        </li>
      `
    )
    .join("");

  // The answer section starts collapsed; it is omitted when there is no answer text.
  const detailBlocks = item.openEndedAnswer
    ? `
      <div class="oe-card">
        <button type="button" class="oe-head oe-toggle" aria-expanded="false">
          <span class="oe-badge">Open-Ended Answer</span>
          <span class="oe-caret" aria-hidden="true">&#9662;</span>
        </button>
        <div class="oe-body" hidden>
          <p class="oe-text">${escapeHtml(item.openEndedAnswer)}</p>
        </div>
      </div>
    `
    : "";

  showcaseRoot.innerHTML = "";
  const card = document.createElement("article");
  card.className = "sample-card";
  card.innerHTML = `
    <div class="sample-media">
      <div class="sample-video-wrap">
        <video class="sample-video" controls preload="metadata">
          <source src="./videos/${encodeURIComponent(item.id)}.mp4" type="video/mp4">
        </video>
      </div>
      <div class="sample-video-note">
        <div class="sample-video-note-title">Case Metadata</div>
        <div class="sample-video-note-line"><span>QA ID</span><strong>${escapeHtml(item.qaId)}</strong></div>
        <div class="sample-video-note-line"><span>Target</span><strong>${escapeHtml(item.target || "N/A")}</strong></div>
        <div class="sample-video-note-line"><span>Reasoning</span><strong>${escapeHtml(item.order)}</strong></div>
      </div>
    </div>
    <div class="sample-content">
      <div class="sample-header">
        <div class="sample-counter">${escapeHtml(active)} case ${safeIndex + 1} / ${visible.length}</div>
        <div class="sample-nav">
          <button type="button" aria-label="Previous sample" data-nav="prev">&#8592;</button>
          <button type="button" aria-label="Next sample" data-nav="next">&#8594;</button>
        </div>
      </div>
      <div class="sample-meta">
        <span class="sample-chip">${escapeHtml(item.dimension)}</span>
        <span class="sample-chip">${escapeHtml(item.order)}</span>
        ${item.target ? `<span class="sample-chip">${escapeHtml(item.target)}</span>` : ""}
        <span class="sample-chip qa">QA ${escapeHtml(item.qaId)}</span>
        <span class="sample-chip gold">Gold ${escapeHtml(gold)}</span>
      </div>
      <h3 class="sample-question">${escapeHtml(item.question)}</h3>
      <div class="option-block">
        <strong>Options</strong>
        <ol class="option-list">
          ${optionsHtml}
        </ol>
      </div>
      ${detailBlocks}
    </div>
  `;
  showcaseRoot.appendChild(card);

  // Navigation re-renders the whole card; wrap-around is handled by safeIndex.
  card.querySelector('[data-nav="prev"]').addEventListener("click", () => {
    renderSamples(active, safeIndex - 1);
  });
  card.querySelector('[data-nav="next"]').addEventListener("click", () => {
    renderSamples(active, safeIndex + 1);
  });

  const oeToggle = card.querySelector(".oe-toggle");
  const oeBody = card.querySelector(".oe-body");
  if (oeToggle && oeBody) {
    oeToggle.addEventListener("click", () => {
      const expanded = oeToggle.getAttribute("aria-expanded") === "true";
      oeToggle.setAttribute("aria-expanded", String(!expanded));
      // `hidden` is the inverse of the new expanded state.
      oeBody.hidden = expanded;
    });
  }
}
/**
 * Reads one open-ended metric from a result row.
 *
 * @param {{oeAcc: number, oeScr: number}} item - Result row.
 * @param {string} metric - "oeAcc" selects accuracy; any other value selects the score.
 * @returns {number} The selected metric value.
 */
function getMetricValue(item, metric) {
  if (metric === "oeAcc") {
    return item.oeAcc;
  }
  return item.oeScr;
}
/**
 * Returns the key of the judge that is not currently active.
 *
 * @param {string} activeJudge - Active judge key.
 * @returns {string} "gemini" when the active judge is "gpt4o"; otherwise "gpt4o".
 */
function getOtherJudge(activeJudge) {
  if (activeJudge === "gpt4o") {
    return "gemini";
  }
  return "gpt4o";
}
/**
 * Formats a metric value for display.
 *
 * @param {number} value - Metric value.
 * @param {string} metric - "oeAcc" yields one decimal followed by "%"; any
 *   other value yields two decimals with no suffix.
 * @returns {string} The formatted value.
 */
function formatMetricValue(value, metric) {
  if (metric !== "oeAcc") {
    return value.toFixed(2);
  }
  const percent = value.toFixed(1);
  return `${percent}%`;
}
/**
 * Formats a cross-judge difference with an explicit sign.
 *
 * The sign is derived from the value after rounding to the displayed
 * precision, so a difference that rounds to zero is always shown as "+0.0 pp"
 * / "+0.00" rather than "-0.0 pp" / "-0.00" for tiny negative inputs.
 *
 * @param {number} delta - Difference between the active and the other judge.
 * @param {string} metric - "oeAcc" yields one decimal followed by " pp"; any
 *   other value yields two decimals with no suffix.
 * @returns {string} The signed, formatted difference.
 */
function formatDeltaValue(delta, metric) {
  const isAccuracy = metric === "oeAcc";
  const digits = isAccuracy ? 1 : 2;
  // Round first; Number("-0.0") is -0, which compares >= 0 and prints as "0.0".
  const rounded = Number(delta.toFixed(digits));
  const sign = rounded >= 0 ? "+" : "";
  const text = `${sign}${rounded.toFixed(digits)}`;
  return isAccuracy ? `${text} pp` : text;
}
/**
 * Maps a difference to the CSS modifier used for its delta label.
 *
 * @param {number} delta - Difference to classify.
 * @returns {string} "neutral" when the magnitude is below 0.05, otherwise
 *   "positive" for values above zero and "negative" for everything else.
 */
function getDeltaClass(delta) {
  const magnitude = Math.abs(delta);
  if (magnitude < 0.05) {
    return "neutral";
  }
  if (delta > 0) {
    return "positive";
  }
  return "negative";
}
/**
 * Builds one button for a toggle group.
 *
 * The active button gets the `active` class, `aria-pressed="true"` and
 * `aria-current="true"`; inactive buttons get `aria-pressed="false"` only.
 *
 * @param {string} label - Button text.
 * @param {boolean} isActive - Whether this button represents the current selection.
 * @param {Function} onClick - Click handler.
 * @returns {HTMLButtonElement} The configured, unattached button.
 */
function createToggleButton(label, isActive, onClick) {
  const toggle = Object.assign(document.createElement("button"), {
    type: "button",
    className: isActive ? "toggle-btn active" : "toggle-btn",
    textContent: label
  });

  const ariaState = { "aria-pressed": String(isActive) };
  if (isActive) {
    ariaState["aria-current"] = "true";
  }
  for (const [attribute, value] of Object.entries(ariaState)) {
    toggle.setAttribute(attribute, value);
  }

  toggle.addEventListener("click", onClick);
  return toggle;
}
/**
 * Rebuilds the task toggle (OE / MC) and the judge toggle from crossJudgeState.
 *
 * The judge toggle and its enclosing `.toolbar-row` (if any) are shown only
 * while the open-ended task is active; in the MC view they are emptied and
 * given the `is-hidden` class. Does nothing when either toggle root is absent.
 *
 * @returns {void}
 */
function renderControls() {
  if (!(taskToggleRoot && judgeToggleRoot)) {
    return;
  }
  const judgeRow = judgeToggleRoot.closest(".toolbar-row");

  // Task toggle: [state key, visible caption].
  const taskChoices = [
    ["oe", "OE (cross-judge)"],
    ["mc", "MC (main benchmark)"]
  ];
  taskToggleRoot.innerHTML = "";
  for (const [taskKey, caption] of taskChoices) {
    // Judge and metric are read at click time so the latest selection is kept.
    const selectTask = () => {
      renderResultsPanel(taskKey, crossJudgeState.activeJudge, crossJudgeState.activeMetric);
    };
    taskToggleRoot.appendChild(
      createToggleButton(caption, taskKey === crossJudgeState.activeTask, selectTask)
    );
  }

  // Judge toggle: only meaningful for the open-ended view.
  const judgesVisible = crossJudgeState.activeTask === "oe";
  judgeToggleRoot.innerHTML = "";
  judgeToggleRoot.classList.toggle("is-hidden", !judgesVisible);
  judgeRow?.classList.toggle("is-hidden", !judgesVisible);
  if (!judgesVisible) {
    return;
  }

  const judgeChoices = [
    ["gpt4o", "GPT-4o"],
    ["gemini", "Gemini-2.5-Flash"]
  ];
  for (const [judgeKey, caption] of judgeChoices) {
    const selectJudge = () => {
      renderResultsPanel(crossJudgeState.activeTask, judgeKey, crossJudgeState.activeMetric);
    };
    judgeToggleRoot.appendChild(
      createToggleButton(caption, judgeKey === crossJudgeState.activeJudge, selectJudge)
    );
  }
}
/**
 * Renders the open-ended results table for one judge, with per-metric deltas
 * against the other judge.
 *
 * Rows are sorted by OE accuracy (descending), then OE score (descending),
 * then the stored `rank`; the displayed position is the sorted index, not the
 * stored rank. The caption uses a middle dot (U+00B7) as separator.
 *
 * @param {string} activeJudge - Key into judgeResults ("gpt4o" or "gemini").
 * @returns {void}
 */
function renderOeResults(activeJudge) {
  const otherJudge = getOtherJudge(activeJudge);

  // Sort a copy so the shared data keeps its authored order.
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
    const accDiff = b.oeAcc - a.oeAcc;
    if (Math.abs(accDiff) > 1e-6) {
      return accDiff;
    }
    const scrDiff = b.oeScr - a.oeScr;
    if (Math.abs(scrDiff) > 1e-6) {
      return scrDiff;
    }
    return a.rank - b.rank;
  });

  // Index the other judge's rows by model name for delta lookup.
  const otherMap = new Map(judgeResults[otherJudge].rows.map((item) => [item.model, item]));

  if (resultsCaptionRoot) {
    resultsCaptionRoot.textContent =
      `${judgeResults[activeJudge].label} \u00b7 Sorted by OE_acc \u00b7 per-metric deltas shown vs ${judgeResults[activeJudge].deltaLabel}`;
  }

  resultsRoot.innerHTML = "";
  const header = document.createElement("div");
  header.className = "results-head results-head-oe";
  header.innerHTML = `
    <div></div>
    <div class="results-head-cell">Model</div>
    <div class="results-head-cell results-head-cell-right">OE_acc</div>
    <div class="results-head-cell results-head-cell-right">OE_scr</div>
  `;
  resultsRoot.appendChild(header);

  currentRows.forEach((item, index) => {
    const row = document.createElement("div");
    row.className = `result-row result-row-oe${item.isOurs ? " ours" : ""}`;
    // A model with no counterpart under the other judge is shown with a zero delta.
    const other = otherMap.get(item.model);
    const accDelta = other ? item.oeAcc - other.oeAcc : 0;
    const scrDelta = other ? item.oeScr - other.oeScr : 0;
    row.innerHTML = `
      <div class="result-rank">#${index + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${item.model}</div>
        ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-metric-col acc" data-label="OE_acc">
        <div class="result-score">${formatMetricValue(item.oeAcc, "oeAcc")}</div>
        <div class="result-delta ${getDeltaClass(accDelta)}">${formatDeltaValue(accDelta, "oeAcc")}</div>
      </div>
      <div class="result-metric-col scr" data-label="OE_scr">
        <div class="result-score">${formatMetricValue(item.oeScr, "oeScr")}</div>
        <div class="result-delta ${getDeltaClass(scrDelta)}">${formatDeltaValue(scrDelta, "oeScr")}</div>
      </div>
    `;
    resultsRoot.appendChild(row);
  });
}
/**
 * Renders the multiple-choice results table, sorted by MC accuracy (descending).
 *
 * The displayed position is the sorted index. The caption uses a middle dot
 * (U+00B7) as separator.
 *
 * @returns {void}
 */
function renderMcResults() {
  // Sort a copy so the shared data keeps its authored order.
  const ranked = [...mcResults].sort((a, b) => b.mcAcc - a.mcAcc);

  if (resultsCaptionRoot) {
    resultsCaptionRoot.textContent = "Main benchmark (MC) \u00b7 Sorted by MC_acc";
  }

  resultsRoot.innerHTML = "";
  const header = document.createElement("div");
  header.className = "results-head results-head-mc";
  header.innerHTML = `
    <div></div>
    <div class="results-head-cell">Model</div>
    <div class="results-head-cell results-head-cell-right">MC_acc</div>
    <div class="results-head-cell results-head-cell-right">Setting</div>
  `;
  resultsRoot.appendChild(header);

  ranked.forEach((item, index) => {
    const row = document.createElement("div");
    row.className = `result-row result-row-mc${item.isOurs ? " ours" : ""}`;
    row.innerHTML = `
      <div class="result-rank">#${index + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${item.model}</div>
        ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-score">${item.mcAcc.toFixed(1)}%</div>
      <div class="result-delta neutral">single-judge MC setting</div>
    `;
    resultsRoot.appendChild(row);
  });
}
/**
 * Entry point for the results panel: stores the selection in crossJudgeState,
 * rebuilds the toggles, then renders the MC table or the OE table for the
 * selected judge.
 *
 * Does nothing (and leaves the state untouched) when the results root is absent.
 *
 * @param {string} [activeTask] - "mc" selects the MC table; any other value the OE table.
 * @param {string} [activeJudge] - Judge key used for the OE table.
 * @param {string} [activeMetric] - Stored in the state; not read by the visible renderers.
 * @returns {void}
 */
function renderResultsPanel(
  activeTask = crossJudgeState.activeTask,
  activeJudge = crossJudgeState.activeJudge,
  activeMetric = crossJudgeState.activeMetric
) {
  if (resultsRoot) {
    Object.assign(crossJudgeState, { activeTask, activeJudge, activeMetric });
    renderControls();

    const showsMainBenchmark = crossJudgeState.activeTask === "mc";
    if (showsMainBenchmark) {
      renderMcResults();
      return;
    }
    renderOeResults(crossJudgeState.activeJudge);
  }
}
// Initial render, run as soon as the script is evaluated: results panel in the
// open-ended view with GPT-4o as judge, then the first "Belief" showcase case.
renderResultsPanel("oe", "gpt4o", "oeAcc");
renderSamples("Belief", 0);