import React, { useState } from 'react';
// Per-model headline rows: display label, `syc` count, `total` count, and the CSS gradient used as `bg`.
// Entries are listed in ascending `syc` order (10, 12, 36, 42, 44).
// Presumably `syc` is the number of sycophantic conversations out of `total` — confirm against the render code.
// NOTE(review): 'Gemini Flash' has total 199 while every other model has 200.
const rateModels = [ { label: 'GPT-5.4 Mini', syc: 10, total: 200, bg: 'linear-gradient(to right, #0a5c3a, #10a37f)' }, { label: 'GPT-5.4', syc: 12, total: 200, bg: 'linear-gradient(to right, #0d6b45, #15c896)' }, { label: 'Gemini 3.1 Pro', syc: 36, total: 200, bg: 'linear-gradient(to right, #1a56b0, #4285f4)' }, { label: 'Gemini Flash', syc: 42, total: 199, bg: 'linear-gradient(to right, #2563a8, #5b9ef5)' }, { label: 'GPT-4o Mini', syc: 44, total: 200, bg: 'linear-gradient(to right, #074a2e, #0d8a5f)' }, ];
// Per-model totals keyed by the same short keys used in `categories` and `catModels`.
// The values duplicate the `syc` numbers in `rateModels` for the corresponding models, so the two must be kept in sync by hand.
// Presumably the denominator for the per-category percentages — confirm against the render code.
const totals = { mini54: 10, gpt54: 12, gpt4omini: 44, flash: 42, gemPro: 36 };
// Per-category counts: one row per behavior category, one numeric field per model key.
// NOTE(review): the per-model column sums (mini54 9, gpt54 11, gpt4omini 38, flash 40, gemPro 34)
// are lower than the matching `totals` values, so category shares will not add up to 100% — confirm this is intended.
const categories = [ { label: 'Capitulation under pushback', mini54: 3, gpt54: 3, gpt4omini: 10, flash: 13, gemPro: 6 }, { label: 'Validating one-sided narratives', mini54: 4, gpt54: 4, gpt4omini: 9, flash: 11, gemPro: 12 }, { label: 'Endorsing delusional beliefs', mini54: 2, gpt54: 2, gpt4omini: 15, flash: 9, gemPro: 5 }, { label: 'Excessive praise / flattery', mini54: 0, gpt54: 0, gpt4omini: 0, flash: 3, gemPro: 3 }, { label: 'Abandoning AI identity boundaries', mini54: 0, gpt54: 2, gpt4omini: 4, flash: 4, gemPro: 8 }, ];
// Display metadata for each model key used in `categories`/`totals`: gradient `bg`, solid `dot` color, and label.
// Each `bg` matches the gradient of the same-labelled entry in `rateModels`; each `dot` is that gradient's end color.
const catModels = [ { key: 'mini54', bg: 'linear-gradient(to right, #0a5c3a, #10a37f)', dot: '#10a37f', label: 'GPT-5.4 Mini' }, { key: 'gpt54', bg: 'linear-gradient(to right, #0d6b45, #15c896)', dot: '#15c896', label: 'GPT-5.4' }, { key: 'gpt4omini', bg: 'linear-gradient(to right, #074a2e, #0d8a5f)', dot: '#0d8a5f', label: 'GPT-4o Mini' }, { key: 'flash', bg: 'linear-gradient(to right, #2563a8, #5b9ef5)', dot: '#5b9ef5', label: 'Gemini Flash' }, { key: 'gemPro', bg: 'linear-gradient(to right, #1a56b0, #4285f4)', dot: '#4285f4', label: 'Gemini 3.1 Pro' }, ];
// Chart component. Holds a single boolean state, `showCategories`, initialised to false;
// presumably it switches between the overall-rate view and the per-category view — confirm against the JSX below.
const BenchmarkChart = () => { const [showCategories, setShowCategories] = useState(false); return (
Percentage of conversations where each model exhibited sycophantic behavior
* Percentage of conversations (out of 200) where the model exhibited sycophantic behavior.
* Percentages represent the share of each model's sycophantic conversations that fall into a given category.
JulyAI Sycophancy Benchmark: {rateModels.length} SOTA models tested across 200 conversations each