Skip to content

Instantly share code, notes, and snippets.

@karminski
Created October 14, 2025 23:07
Show Gist options
  • Select an option

  • Save karminski/aa56b5b73e2f3070a7530dfe11c2ddb3 to your computer and use it in GitHub Desktop.

Select an option

Save karminski/aa56b5b73e2f3070a7530dfe11c2ddb3 to your computer and use it in GitHub Desktop.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Qwen3-VL Performance Benchmark</title>
<style>
/* Global reset: remove default margins/padding and size every box by its border edge. */
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
/* Page shell: system font stack on a white background, at least one viewport tall. */
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
background: #ffffff;
min-height: 100vh;
padding: 40px 20px;
color: #1d1d1f;
}
/* Horizontally centred content column, capped at 1400px wide. */
.container {
max-width: 1400px;
margin: 0 auto;
}
/* Main card that wraps the header, rankings, table and notes.
   overflow: hidden clips the children to the rounded corners. */
.card {
  background: rgba(255, 255, 255, 0.95);
  /* Prefixed fallback for WebKit builds that only recognise the prefixed
     property; kept before the standard declaration so the latter wins
     wherever it is supported. */
  -webkit-backdrop-filter: blur(10px);
  backdrop-filter: blur(10px);
  border-radius: 20px;
  box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
  overflow: hidden;
}
/* Card header band: grey gradient with centred white text. */
.header {
background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
color: white;
padding: 40px;
text-align: center;
}
/* Page title. */
.header h1 {
font-size: 42px;
font-weight: 700;
margin-bottom: 10px;
letter-spacing: -0.5px;
}
/* Subtitle under the title, slightly faded. */
.header p {
font-size: 18px;
opacity: 0.9;
font-weight: 400;
}
/* Rankings area shown between the header and the table. */
.stats-section {
padding: 40px;
background: #f5f5f7;
border-bottom: 1px solid #d2d2d7;
}
/* Heading text of the rankings area. */
.stats-title {
font-size: 24px;
font-weight: 600;
margin-bottom: 24px;
color: #1d1d1f;
}
/* Responsive grid: as many columns of at least 250px as fit the row. */
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 20px;
}
/* One card per model; transform and shadow are animated for the hover state below. */
.stat-card {
background: white;
padding: 24px;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
transition: transform 0.3s ease, box-shadow 0.3s ease;
}
/* Hover: lift the card by 4px and deepen its shadow. */
.stat-card:hover {
transform: translateY(-4px);
box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12);
}
.stat-card .model-name {
font-size: 16px;
font-weight: 600;
color: #1d1d1f;
margin-bottom: 8px;
}
/* Large win counter drawn as gradient text: the background is clipped to the
   glyphs and the text fill is made transparent so the gradient shows through. */
.stat-card .win-count {
font-size: 36px;
font-weight: 700;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.stat-card .win-label {
font-size: 14px;
color: #86868b;
margin-top: 4px;
}
/* Elo value, separated from the win counter by a thin rule. */
.stat-card .elo-score {
font-size: 20px;
font-weight: 600;
color: #667eea;
margin-top: 12px;
padding-top: 12px;
border-top: 1px solid #e5e5e7;
}
.stat-card .elo-label {
font-size: 12px;
color: #86868b;
margin-top: 4px;
}
/* Small italic line for the Elo comparison text. */
.stat-card .elo-diff {
font-size: 11px;
color: #86868b;
margin-top: 6px;
font-style: italic;
}
/* Table wrapper; scrolls horizontally when the table is wider than the card. */
.table-section {
padding: 40px;
overflow-x: auto;
}
table {
width: 100%;
border-collapse: collapse;
font-size: 14px;
}
/* Header row styled like the card header.
   NOTE(review): .table-section (overflow-x: auto) and .card (overflow: hidden)
   are overflow ancestors, so this sticky header probably never pins while the
   page itself scrolls. Confirm in a browser. */
thead {
background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
color: white;
position: sticky;
top: 0;
z-index: 10;
}
thead th {
padding: 16px 12px;
text-align: left;
font-weight: 600;
font-size: 13px;
letter-spacing: 0.5px;
white-space: nowrap;
}
/* Body rows: hairline separator plus an animated hover highlight. */
tbody tr {
border-bottom: 1px solid #e5e5e7;
transition: background-color 0.2s ease;
}
tbody tr:hover {
background-color: #f5f5f7;
}
tbody td {
padding: 14px 12px;
color: #1d1d1f;
}
/* First column (category label). */
.category-cell {
font-weight: 600;
color: #6d7a86;
background: #f9f9fb;
}
/* Second column (benchmark name). */
.benchmark-cell {
font-weight: 500;
}
/* Numeric cells: centred, with tabular figures so digits line up. */
.score-cell {
text-align: center;
font-weight: 500;
font-variant-numeric: tabular-nums;
}
/* Pill drawn around the highest score of a row.
   NOTE(review): white text over the light (#ffd89b) end of the gradient may
   have low contrast. Check against your contrast target. */
.best-score {
background: linear-gradient(135deg, #ffd89b 0%, #19547b 100%);
color: white;
font-weight: 700;
border-radius: 6px;
padding: 6px 8px;
display: inline-block;
min-width: 60px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
}
/* Placeholder cell for a missing score. */
.na-cell {
color: #d2d2d7;
text-align: center;
}
/* Notes area at the bottom of the card. */
.notes-section {
padding: 40px;
background: #f9f9fb;
font-size: 14px;
color: #6e6e73;
line-height: 1.8;
}
.notes-section h3 {
font-size: 18px;
color: #1d1d1f;
margin-bottom: 16px;
font-weight: 600;
}
/* Native list markers are removed; a custom bullet is drawn by the li pseudo-element rule. */
.notes-section ul {
list-style: none;
padding-left: 0;
}
/* position: relative anchors the absolutely positioned custom bullet;
   the left padding leaves room for it. */
.notes-section li {
padding-left: 24px;
position: relative;
margin-bottom: 8px;
}
/* Custom coloured bullet for each note. Uses the standard double-colon
   pseudo-element syntax; it is positioned inside the left padding of the li. */
.notes-section li::before {
  content: "•";
  position: absolute;
  left: 8px;
  color: #667eea;
  font-weight: bold;
}
/* Viewports up to 768px wide: smaller header type, tighter section padding
   and a denser table. */
@media (max-width: 768px) {
.header h1 {
font-size: 28px;
}
.header p {
font-size: 16px;
}
.stats-section,
.table-section,
.notes-section {
padding: 24px;
}
table {
font-size: 12px;
}
thead th,
tbody td {
padding: 10px 8px;
}
}
/* Pill shown next to a model name in the rankings; the script adds it to the
   first three entries together with one of the rank-N colour classes below. */
.rank-badge {
display: inline-block;
padding: 4px 12px;
border-radius: 20px;
font-size: 12px;
font-weight: 600;
margin-left: 8px;
}
/* Badge colour for position #1. */
.rank-1 {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
color: white;
}
/* Badge colour for position #2. */
.rank-2 {
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
color: white;
}
/* Badge colour for position #3. */
.rank-3 {
background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
color: white;
}
/* Floating language toggle pinned to the top-left corner of the viewport. */
.lang-switcher {
  position: fixed;
  top: 20px;
  left: 20px;
  z-index: 1000;
  background: white;
  border-radius: 30px;
  box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
  padding: 8px 12px;
  display: flex;
  align-items: center;
  gap: 8px;
  cursor: pointer;
  /* Only the two properties changed by the hover rule are animated
     (same pattern as .stat-card) instead of "all". */
  transition: box-shadow 0.3s ease, transform 0.3s ease;
  /* Prefixed form first for WebKit browsers that need it; the label must not
     get text-selected on repeated clicks. */
  -webkit-user-select: none;
  user-select: none;
}
/* Hover: stronger shadow and a 2px lift. */
.lang-switcher:hover {
  box-shadow: 0 6px 30px rgba(0, 0, 0, 0.25);
  transform: translateY(-2px);
}
/* Globe icon. */
.lang-switcher .lang-icon {
  font-size: 20px;
}
/* Label text of the toggle. */
.lang-switcher .lang-text {
  font-size: 14px;
  font-weight: 600;
  color: #1d1d1f;
}
/* Logo strip rendered below the card. */
.footer-logo {
padding: 40px;
margin-top: 40px;
text-align: center;
background: transparent;
}
/* Logo scales down with its container, capped at 300px; shown faded until hovered. */
.footer-logo img {
max-width: 300px;
width: 100%;
height: auto;
opacity: 0.6;
transition: opacity 0.3s ease;
}
.footer-logo img:hover {
opacity: 0.9;
}
/* Viewports up to 768px wide: less spacing and a smaller logo cap. */
@media (max-width: 768px) {
.footer-logo {
padding: 30px 20px;
margin-top: 30px;
}
.footer-logo img {
max-width: 200px;
}
}
</style>
</head>
<body>
<!-- Language toggle. The element stays a div because the existing
     .lang-switcher styling targets it; role/tabindex plus the key handler give
     it the keyboard behaviour of a button (Enter and Space activate it). -->
<div class="lang-switcher" role="button" tabindex="0" onclick="toggleLanguage()" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); toggleLanguage(); }">
  <span class="lang-icon" aria-hidden="true">🌐</span>
  <span class="lang-text" id="langText">中文</span>
</div>
<div class="container">
<div class="card">
<!-- Page header. Uses the header landmark element; the class is kept so the
     existing .header rules still apply. Both texts are replaced by
     updateLanguage() through their ids. -->
<header class="header">
  <h1 id="mainTitle">Qwen3-VL Performance Benchmark</h1>
  <p id="mainSubtitle">Comprehensive evaluation across multiple vision-language tasks</p>
</header>
<!-- Rankings. The title is a real heading (sized by .stats-title) and names the
     section; the grid is filled by renderStatsGrid(). -->
<section class="stats-section" aria-labelledby="statsTitle">
  <h2 class="stats-title" id="statsTitle">🏆 Model Performance Rankings</h2>
  <div class="stats-grid" id="statsGrid"></div>
</section>
<!-- Benchmark table. Column headers carry scope="col" so assistive technology
     can associate each score with its model. The body rows are appended to
     #tableBody by the script; the first two header texts are translated by
     updateLanguage(). -->
<div class="table-section">
  <table id="benchmarkTable">
    <thead>
      <tr>
        <th id="thCategory" scope="col">Category</th>
        <th id="thBenchmark" scope="col">Benchmark</th>
        <th scope="col">Qwen3-VL 4B</th>
        <th scope="col">Qwen3-VL 8B</th>
        <th scope="col">Qwen3-VL 235B</th>
        <th scope="col">Gemini2.5</th>
        <th scope="col">GPT5-Nano</th>
      </tr>
    </thead>
    <tbody id="tableBody"></tbody>
  </table>
</div>
<!-- Notes. The heading and every list item are looked up by id in
     updateLanguage(), which overwrites their text when the language is toggled;
     the English strings here duplicate the "en" entries of the i18n table. -->
<div class="notes-section">
<h3 id="notesTitle">📝 Notes</h3>
<ul id="notesList">
<li id="note1">Highlighted scores indicate the best performance among all models for that benchmark</li>
<li id="note2">The ranking section shows how many times each model achieved the highest score</li>
<li id="note3">Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version</li>
<li id="note4">Results on video understanding are measured using a 256k-token context, handling up to 2048 frames</li>
<li id="note5">"—" indicates data not available for that benchmark</li>
</ul>
</div>
</div>
<!-- Footer logo. It sits below the card, so the image is lazy-loaded and
     decoded asynchronously; the class is kept for the existing .footer-logo
     rules. -->
<footer class="footer-logo">
  <img src="./assets/images/kcores-llm-arena-logo-black.png" alt="KCORES LLM Arena Logo" loading="lazy" decoding="async">
</footer>
</div>
<script>
// Language data
// currentLang is the key of the active entry in i18n ('en' or 'zh').
let currentLang = 'en';
// UI strings per language.
// - langText ... note5 share their names with element ids; updateLanguage()
//   writes each one into the element with the same id.
// - langText is the label of the switcher, i.e. the name of the *other*
//   language ('中文' while English is shown, 'English' while Chinese is shown).
// - firstPlaceWins, eloRating, topAmongQwen and vs are read while rendering
//   the stats cards (renderStatsGrid / getEloDiff).
const i18n = {
en: {
langText: '中文',
mainTitle: 'Qwen3-VL Performance Benchmark',
mainSubtitle: 'Comprehensive evaluation across multiple vision-language tasks',
statsTitle: '🏆 Model Performance Rankings',
thCategory: 'Category',
thBenchmark: 'Benchmark',
notesTitle: '📝 Notes',
note1: 'Highlighted scores indicate the best performance among all models for that benchmark',
note2: 'The ranking section shows how many times each model achieved the highest score',
note3: 'Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version',
note4: 'Results on video understanding are measured using a 256k-token context, handling up to 2048 frames',
note5: '"—" indicates data not available for that benchmark',
firstPlaceWins: 'First Place Wins',
eloRating: 'Elo Rating',
topAmongQwen: 'Top among Qwen models',
vs: 'vs'
},
zh: {
langText: 'English',
mainTitle: 'Qwen3-VL 性能基准测试',
mainSubtitle: '跨多个视觉语言任务的综合评估',
statsTitle: '🏆 模型性能排名',
thCategory: '分类',
thBenchmark: '基准测试',
notesTitle: '📝 说明',
note1: '高亮显示的分数表示该基准测试中所有模型的最佳性能',
note2: '排名部分显示每个模型获得最高分的次数',
note3: 'Qwen3-VL 235B-A22B Thinking 是大规模思维模型版本',
note4: '视频理解结果使用 256k token 上下文进行测量,可处理多达 2048 帧',
note5: '"—" 表示该基准测试的数据不可用',
firstPlaceWins: '第一名次数',
eloRating: 'Elo 评分',
topAmongQwen: 'Qwen 模型中最高',
vs: '对比'
}
};
/**
 * Click handler of the language switcher: flips the active language between
 * 'en' and 'zh' (anything other than 'en' goes back to 'en') and re-applies
 * the page strings.
 */
function toggleLanguage() {
  if (currentLang === 'en') {
    currentLang = 'zh';
  } else {
    currentLang = 'en';
  }
  updateLanguage();
}
/**
 * Apply the strings of the active language (i18n[currentLang]) to the page.
 *
 * Besides the visible texts this also keeps the document language attribute
 * and the tab title in sync with the selected language, then re-renders the
 * stats cards because they embed translated labels.
 */
function updateLanguage() {
  const t = i18n[currentLang];

  // Tell browsers and assistive technology which language is now displayed.
  document.documentElement.lang = currentLang === 'zh' ? 'zh-Hans' : 'en';
  document.title = t.mainTitle;

  // Each of these element ids is also the key of its string in the i18n table.
  const staticTextIds = [
    'langText',
    'mainTitle',
    'mainSubtitle',
    'statsTitle',
    'thCategory',
    'thBenchmark',
    'notesTitle',
    'note1',
    'note2',
    'note3',
    'note4',
    'note5'
  ];
  staticTextIds.forEach(id => {
    const el = document.getElementById(id);
    // A missing node must not abort the remaining updates.
    if (el) {
      el.textContent = t[id];
    }
  });

  // Re-render stats grid
  renderStatsGrid();
}
// Benchmark results, one entry per table row.
// - category: label shown in the first column (only on the first row of each
//   consecutive run with the same category).
// - benchmark: name shown in the second column.
// - scores: one value per model, in the same order as modelNames and the
//   table header columns; null means no result and is rendered as "—".
// Values are not all on the same scale (e.g. OCRBench is in the hundreds);
// they are only ever compared within a single row.
// NOTE(review): some category labels look questionable (e.g. ScreenSpot listed
// under "Video", DocVQA_TEST under "Subjective Experience"). Verify against
// the table this data was transcribed from.
const benchmarkData = [
{ category: "STEM & Puzzle", benchmark: "MMMU_VAL", scores: [70.8, 74.1, 80.6, 73.4, 75.8] },
{ category: "STEM & Puzzle", benchmark: "MMMU_Pro", scores: [57.0, 60.4, 69.3, 59.7, 57.2] },
{ category: "STEM & Puzzle", benchmark: "MathVista_mini", scores: [79.5, 81.4, 85.8, 72.8, 71.5] },
{ category: "STEM & Puzzle", benchmark: "MathVision", scores: [60.0, 62.7, 74.6, 52.1, 62.2] },
{ category: "STEM & Puzzle", benchmark: "MathVerse_mini", scores: [75.2, 77.7, 85.0, 69.6, 74.2] },
{ category: "STEM & Puzzle", benchmark: "ZEROBench", scores: [null, null, 4.0, null, null] },
{ category: "STEM & Puzzle", benchmark: "ZEROBench_Sub", scores: [null, null, 27.7, null, null] },
{ category: "STEM & Puzzle", benchmark: "VisuLogic", scores: [null, null, 34.4, null, null] },
{ category: "STEM & Puzzle", benchmark: "MMBenchDEV_EN_V1.1", scores: [86.7, 87.5, 90.6, 82.7, 80.3] },
{ category: "General VQA", benchmark: "RealWorldQA", scores: [73.2, 73.5, 81.3, 72.2, 71.8] },
{ category: "General VQA", benchmark: "MMStar", scores: [73.2, 75.3, 78.7, 69.1, 68.6] },
{ category: "General VQA", benchmark: "SimpleVQA", scores: [48.8, 49.6, 61.3, 54.1, 46.0] },
{ category: "Subjective Experience", benchmark: "HallusionBench", scores: [64.1, 65.4, 66.7, 64.5, 58.4] },
{ category: "Subjective Experience", benchmark: "MM-MT-Bench", scores: [7.7, 8.0, 8.5, 7.7, 6.6] },
{ category: "Subjective Experience", benchmark: "MIABench", scores: [91.0, 91.5, 92.7, 91.6, 89.9] },
{ category: "Subjective Experience", benchmark: "MMLongBench-Doc", scores: [44.4, 48.0, 56.2, 46.5, 31.8] },
{ category: "Subjective Experience", benchmark: "DocVQA_TEST", scores: [94.2, 95.3, 96.5, 92.5, 88.2] },
{ category: "Subjective Experience", benchmark: "InfoVQA_TEST", scores: [83.0, 86.0, 89.5, 81.5, 68.6] },
{ category: "Text Recognition", benchmark: "AI2D_TEST", scores: [84.9, 84.9, 89.2, 85.7, 81.9] },
{ category: "Text Recognition", benchmark: "OCRBench", scores: [808.0, 819.0, 875.0, 825.0, 753.0] },
{ category: "Text Recognition", benchmark: "OCRBenchV2", scores: [58.8, 61.55, 65.15, 47.75, 40.85] },
{ category: "Text Recognition", benchmark: "CC_OCR", scores: [null, null, 81.5, null, null] },
{ category: "Text Recognition", benchmark: "CharXiv_(RQ)", scores: [50.3, 53.0, 66.1, 56.1, 50.1] },
{ category: "Text Recognition", benchmark: "RefCOCO-avg", scores: [null, null, 92.4, null, null] },
{ category: "Text Recognition", benchmark: "CountBench", scores: [89.4, 91.5, 93.7, 79.2, 80.0] },
{ category: "2D/3D Grounding", benchmark: "ODinW13", scores: [39.4, 39.8, 43.2, null, null] },
{ category: "2D/3D Grounding", benchmark: "ARKitScenes", scores: [46.3, 46.6, 53.7, null, null] },
{ category: "2D/3D Grounding", benchmark: "Hypersim", scores: [11.9, 12.0, 11.0, null, null] },
{ category: "2D/3D Grounding", benchmark: "SUNRGBD", scores: [28.0, 30.4, 34.9, null, null] },
{ category: "2D/3D Grounding", benchmark: "Objectron", scores: [null, null, 71.2, null, null] },
{ category: "Multi-Image", benchmark: "BLINK", scores: [63.4, 64.7, 67.1, 64.4, 58.3] },
{ category: "Multi-Image", benchmark: "MUIRBENCH", scores: [75.0, 76.8, 80.1, 64.0, 65.7] },
{ category: "Multi-Image", benchmark: "ERQA", scores: [47.3, 46.8, 52.5, 44.3, 45.8] },
{ category: "Embodied & Spatial", benchmark: "EmbSpatialBench", scores: [80.7, 81.1, 84.3, 66.1, 74.2] },
{ category: "Embodied & Spatial", benchmark: "RefSpatialBench", scores: [45.3, 44.6, 69.9, 11.2, 12.6] },
{ category: "Embodied & Spatial", benchmark: "RoboSpatialHome", scores: [63.2, 62.0, 73.9, 50.3, 46.1] },
{ category: "Embodied & Spatial", benchmark: "VSI-Bench", scores: [55.2, 56.6, null, 30.3, 15.4] },
{ category: "Embodied & Spatial", benchmark: "MVBench", scores: [69.3, 69.0, null, null, null] },
{ category: "Video", benchmark: "VideoMME", scores: [68.9, 71.8, 79.0, 72.7, 66.2] },
{ category: "Video", benchmark: "MLVU", scores: [75.7, 75.1, 83.8, 78.5, 69.2] },
{ category: "Video", benchmark: "LVBench", scores: [53.5, 55.8, 63.6, 60.9, null] },
{ category: "Video", benchmark: "CharadesSTA", scores: [59.0, 59.9, 63.5, null, null] },
{ category: "Video", benchmark: "VideoMMU", scores: [69.4, 72.8, 80.0, 69.2, 63.0] },
{ category: "Video", benchmark: "ScreenSpot", scores: [92.9, 93.6, 95.4, null, null] },
{ category: "Video", benchmark: "ScreenSpot Pro", scores: [49.2, 46.6, 61.8, null, null] },
{ category: "Agent", benchmark: "OSWorldG", scores: [53.9, 56.7, 68.3, null, null] },
{ category: "Agent", benchmark: "OSWorld", scores: [31.4, 33.9, 38.1, null, null] },
{ category: "Agent", benchmark: "AndroidWorld", scores: [52.0, 50.0, null, null, null] },
{ category: "Coding", benchmark: "Design2Code", scores: [null, null, 93.4, null, null] },
{ category: "Fine-grained", benchmark: "V*", scores: [74.9, 77.5, null, 70.2, null] },
{ category: "Fine-grained", benchmark: "HRBench4K", scores: [73.5, 72.4, null, 77.8, null] },
{ category: "Fine-grained", benchmark: "HRBench8K", scores: [67.1, 68.1, null, 75.5, null] }
];
// Display names used on the stats cards. The position of a name is the model
// index: it matches the position of that model's value in every `scores`
// array. The table header in the markup uses shorter labels for the same
// five columns.
const modelNames = [
"Qwen3-VL 4B Thinking",
"Qwen3-VL 8B Thinking",
"Qwen3-VL 235B-A22B",
"Gemini2.5-Flash-Lite",
"GPT5-Nano High"
];
// Count, per model, how many benchmarks it tops.
// - The tally has one slot per entry in modelNames (no hard-coded model count).
// - Missing (null) scores never win; models that tie for the top score each
//   get a win; a benchmark with a single reported score counts as a win for
//   that model.
const wins = new Array(modelNames.length).fill(0);
benchmarkData.forEach(row => {
  const reported = row.scores.filter(score => score !== null);
  if (reported.length === 0) {
    return; // nothing to rank in this row
  }
  const best = Math.max(...reported);
  row.scores.forEach((score, idx) => {
    // null never strictly equals a number, so missing scores are skipped here.
    if (score === best) {
      wins[idx]++;
    }
  });
});
/**
 * Compute an Elo-style rating per model from the benchmark rows.
 *
 * Every model starts at 1500. Rows are processed in array order; within a row
 * each pair of models that both have a non-null score plays one "game"
 * (higher score wins, equal scores draw) and both ratings are adjusted
 * immediately with K = 32, so later pairs see the already-updated ratings.
 * Rows with fewer than two non-null scores are ignored.
 *
 * @param {Array<{scores: Array<number|null>}>} benchmarkData rows to process
 * @param {number} numModels number of rating slots to create
 * @returns {number[]} ratings rounded to the nearest integer, indexed like `scores`
 */
function calculateElo(benchmarkData, numModels) {
  const K = 32; // K-factor for Elo calculation
  const ratings = new Array(numModels).fill(1500);

  for (const row of benchmarkData) {
    // Keep only models that actually reported a score, remembering their index.
    const entrants = [];
    row.scores.forEach((score, idx) => {
      if (score !== null) {
        entrants.push({ score, idx });
      }
    });
    if (entrants.length < 2) {
      continue;
    }

    for (let a = 0; a < entrants.length - 1; a++) {
      for (let b = a + 1; b < entrants.length; b++) {
        const first = entrants[a];
        const second = entrants[b];

        // Expected results are both taken from the ratings before this game.
        const expectedFirst =
          1 / (1 + Math.pow(10, (ratings[second.idx] - ratings[first.idx]) / 400));
        const expectedSecond =
          1 / (1 + Math.pow(10, (ratings[first.idx] - ratings[second.idx]) / 400));

        // 1 for a win, 0.5 for a tie, 0 for a loss.
        let actualFirst = 0.5;
        if (first.score > second.score) {
          actualFirst = 1;
        } else if (first.score < second.score) {
          actualFirst = 0;
        }
        const actualSecond = 1 - actualFirst;

        ratings[first.idx] += K * (actualFirst - expectedFirst);
        ratings[second.idx] += K * (actualSecond - expectedSecond);
      }
    }
  }

  return ratings.map(value => Math.round(value));
}
// Elo rating per model, indexed like modelNames. The number of rating slots is
// taken from modelNames instead of a hard-coded count.
const eloRatings = calculateElo(benchmarkData, modelNames.length);
// One record per model for the stats cards, ordered by first-place wins
// (descending). The comparator looks at wins only, so models with equal wins
// keep their modelNames order. modelIdx keeps the original index so per-model
// lookups still work after sorting.
const rankings = modelNames
  .map((name, idx) => ({
    name,
    wins: wins[idx],
    elo: eloRatings[idx],
    modelIdx: idx
  }))
  .sort((a, b) => b.wins - a.wins);
// The three Qwen entries (model indices 0-2) with their short names and Elo
// ratings, ordered from highest to lowest Elo; element 0 is the reference
// model used for the Elo comparison text.
const qwenModels = ["Qwen3-VL 4B", "Qwen3-VL 8B", "Qwen3-VL 235B"]
  .map((name, idx) => ({ name, idx, elo: eloRatings[idx] }))
  .sort((a, b) => b.elo - a.elo);
/**
 * Build the Elo comparison line shown on a stats card.
 *
 * @param {number} modelIdx index of the model in modelNames / eloRatings
 * @returns {string} "" for indices that are not below 3 (non-Qwen models); the
 *   translated "top among Qwen" text when the model's Elo equals the best Qwen
 *   Elo; otherwise the signed gap to the best Qwen model, e.g. "-12 vs <name>".
 */
function getEloDiff(modelIdx) {
  const t = i18n[currentLang];

  // Only the Qwen models get a comparison line.
  if (!(modelIdx < 3)) {
    return "";
  }

  const leader = qwenModels[0];
  const ownElo = eloRatings[modelIdx];
  if (ownElo === leader.elo) {
    return t.topAmongQwen;
  }

  const gap = leader.elo - ownElo;
  const sign = gap > 0 ? '-' : '+';
  return `${sign}${Math.abs(gap)} ${t.vs} ${leader.name}`;
}
/**
 * Render one stats card per entry of `rankings` into #statsGrid, using the
 * labels of the active language. The first three entries get a rank badge.
 *
 * The markup for all cards is assembled first and assigned to innerHTML once,
 * so the grid is parsed a single time instead of being re-serialised and
 * re-parsed for every card.
 */
function renderStatsGrid() {
  const t = i18n[currentLang];
  const statsGrid = document.getElementById('statsGrid');

  const cardsHtml = rankings.map((model, idx) => {
    const rankBadge = idx < 3 ? `<span class="rank-badge rank-${idx + 1}">#${idx + 1}</span>` : '';
    // Empty for non-Qwen models, in which case the line is omitted.
    const eloDiff = getEloDiff(model.modelIdx);
    const eloDiffHtml = eloDiff ? `<div class="elo-diff">${eloDiff}</div>` : '';
    return `
<div class="stat-card">
<div class="model-name">${model.name}${rankBadge}</div>
<div class="win-count">${model.wins}</div>
<div class="win-label">${t.firstPlaceWins}</div>
<div class="elo-score">${model.elo}</div>
<div class="elo-label">${t.eloRating}</div>
${eloDiffHtml}
</div>
`;
  });

  statsGrid.innerHTML = cardsHtml.join('');
}
// Initial render
renderStatsGrid();
// Fill the table body: one <tr> per benchmarkData entry, in array order.
const tableBody = document.getElementById('tableBody');
// Category of the previously rendered row; used to print each label only once
// per consecutive run of rows sharing it.
let currentCategory = '';
for (const entry of benchmarkData) {
  const rowEl = document.createElement('tr');

  // Category column: the cell is always created, but left empty on repeats.
  const categoryTd = document.createElement('td');
  categoryTd.className = 'category-cell';
  if (entry.category !== currentCategory) {
    currentCategory = entry.category;
    categoryTd.textContent = currentCategory;
  }
  rowEl.appendChild(categoryTd);

  // Benchmark name column.
  const benchmarkTd = document.createElement('td');
  benchmarkTd.className = 'benchmark-cell';
  benchmarkTd.textContent = entry.benchmark;
  rowEl.appendChild(benchmarkTd);

  // Highest reported score of the row (null when no model reported one).
  const reported = entry.scores.filter(value => value !== null);
  const topScore = reported.length > 0 ? Math.max(...reported) : null;

  // One cell per model: dash for missing, highlighted pill for the top score
  // (all tied leaders are highlighted), plain number otherwise.
  for (const value of entry.scores) {
    const scoreTd = document.createElement('td');
    if (value === null) {
      scoreTd.className = 'score-cell na-cell';
      scoreTd.textContent = '—';
    } else if (value === topScore) {
      scoreTd.className = 'score-cell';
      scoreTd.innerHTML = `<span class="best-score">${value}</span>`;
    } else {
      scoreTd.className = 'score-cell';
      scoreTd.textContent = value;
    }
    rowEl.appendChild(scoreTd);
  }

  tableBody.appendChild(rowEl);
}
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment