karminski · October 14, 2025 23:07
diff --git a/Qwen3-VL Performance Benchmark.html b/Qwen3-VL Performance Benchmark.html
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Qwen3-VL Performance Benchmark</title>
    <style>
        /* Global reset: drop default margins/padding and size every element by its border box. */
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        /* Page canvas: system font stack, white background, dark text, outer gutter. */
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
            background: #ffffff;
            min-height: 100vh;
            padding: 40px 20px;
            color: #1d1d1f;
        }

        /* Horizontally centered content column, capped at 1400px. */
        .container {
            max-width: 1400px;
            margin: 0 auto;
        }

        /* Rounded panel that wraps the header, stats, table and notes sections. */
        /* NOTE(review): body background is opaque white, so the backdrop blur likely has no visible effect — confirm. */
        .card {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            overflow: hidden;
        }

        /* Title banner at the top of the card. */
        .header {
            background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
            color: white;
            padding: 40px;
            text-align: center;
        }

        /* Main page title. */
        .header h1 {
            font-size: 42px;
            font-weight: 700;
            margin-bottom: 10px;
            letter-spacing: -0.5px;
        }

        /* Subtitle under the main title. */
        .header p {
            font-size: 18px;
            opacity: 0.9;
            font-weight: 400;
        }

        /* Rankings area that hosts the generated stat cards. */
        .stats-section {
            padding: 40px;
            background: #f5f5f7;
            border-bottom: 1px solid #d2d2d7;
        }

        /* Heading text of the rankings area. */
        .stats-title {
            font-size: 24px;
            font-weight: 600;
            margin-bottom: 24px;
            color: #1d1d1f;
        }

        /* Responsive grid: as many columns of at least 250px as fit the row. */
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
        }

        /* One card per model; the markup is produced by renderStatsGrid(). */
        .stat-card {
            background: white;
            padding: 24px;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }

        /* Lift the card slightly on hover. */
        .stat-card:hover {
            transform: translateY(-4px);
            box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12);
        }

        /* Model name line (also contains the rank badge). */
        .stat-card .model-name {
            font-size: 16px;
            font-weight: 600;
            color: #1d1d1f;
            margin-bottom: 8px;
        }

        /* Large win counter: the gradient is clipped to the glyphs and the fill is made transparent. */
        .stat-card .win-count {
            font-size: 36px;
            font-weight: 700;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }

        /* Caption under the win counter. */
        .stat-card .win-label {
            font-size: 14px;
            color: #86868b;
            margin-top: 4px;
        }

        /* Elo rating, separated from the win counter by a rule. */
        .stat-card .elo-score {
            font-size: 20px;
            font-weight: 600;
            color: #667eea;
            margin-top: 12px;
            padding-top: 12px;
            border-top: 1px solid #e5e5e7;
        }

        /* Caption under the Elo rating. */
        .stat-card .elo-label {
            font-size: 12px;
            color: #86868b;
            margin-top: 4px;
        }

        /* Optional Elo comparison line; emitted only when getEloDiff() returns text. */
        .stat-card .elo-diff {
            font-size: 11px;
            color: #86868b;
            margin-top: 6px;
            font-style: italic;
        }

        /* Wrapper around the benchmark table; scrolls horizontally when the table is too wide. */
        .table-section {
            padding: 40px;
            overflow-x: auto;
        }

        /* Full-width table with collapsed borders. */
        table {
            width: 100%;
            border-collapse: collapse;
            font-size: 14px;
        }

        /* Header row styling. */
        /* NOTE(review): .table-section sets overflow-x: auto, which presumably makes it the scroll
           container for this sticky header, so it may not stick during vertical page scroll — confirm in a browser. */
        thead {
            background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
            color: white;
            position: sticky;
            top: 0;
            z-index: 10;
        }

        /* Header cells: left-aligned, never wrapped. */
        thead th {
            padding: 16px 12px;
            text-align: left;
            font-weight: 600;
            font-size: 13px;
            letter-spacing: 0.5px;
            white-space: nowrap;
        }

        /* Row separator plus a smooth hover highlight. */
        tbody tr {
            border-bottom: 1px solid #e5e5e7;
            transition: background-color 0.2s ease;
        }

        tbody tr:hover {
            background-color: #f5f5f7;
        }

        /* Default body cell. */
        tbody td {
            padding: 14px 12px;
            color: #1d1d1f;
        }

        /* First column: category label (left blank on repeated rows by the table script). */
        .category-cell {
            font-weight: 600;
            color: #6d7a86;
            background: #f9f9fb;
        }

        /* Second column: benchmark name. */
        .benchmark-cell {
            font-weight: 500;
        }

        /* Numeric score cells; tabular figures keep digits the same width. */
        .score-cell {
            text-align: center;
            font-weight: 500;
            font-variant-numeric: tabular-nums;
        }

        /* Pill wrapped around the highest score of a row. */
        .best-score {
            background: linear-gradient(135deg, #ffd89b 0%, #19547b 100%);
            color: white;
            font-weight: 700;
            border-radius: 6px;
            padding: 6px 8px;
            display: inline-block;
            min-width: 60px;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
        }

        /* Cell for a missing score (rendered as a dash). */
        .na-cell {
            color: #d2d2d7;
            text-align: center;
        }

        /* Explanatory notes area at the bottom of the card. */
        .notes-section {
            padding: 40px;
            background: #f9f9fb;
            font-size: 14px;
            color: #6e6e73;
            line-height: 1.8;
        }

        /* Notes heading. */
        .notes-section h3 {
            font-size: 18px;
            color: #1d1d1f;
            margin-bottom: 16px;
            font-weight: 600;
        }

        /* Native list markers are removed; a custom bullet is drawn per item instead. */
        .notes-section ul {
            list-style: none;
            padding-left: 0;
        }

        /* Left padding reserves room for the absolutely positioned custom bullet. */
        .notes-section li {
            padding-left: 24px;
            position: relative;
            margin-bottom: 8px;
        }

        /* Custom colored bullet, positioned inside the list item's left padding.
           Uses the CSS3 double-colon pseudo-element syntax. */
        .notes-section li::before {
            content: "•";
            position: absolute;
            left: 8px;
            color: #667eea;
            font-weight: bold;
        }

        /* Narrow viewports (<= 768px): smaller title text, tighter section padding, denser table. */
        @media (max-width: 768px) {
            .header h1 {
                font-size: 28px;
            }

            .header p {
                font-size: 16px;
            }

            .stats-section,
            .table-section,
            .notes-section {
                padding: 24px;
            }

            table {
                font-size: 12px;
            }

            thead th,
            tbody td {
                padding: 10px 8px;
            }
        }

        /* Pill shown next to the model name for the first three cards (see renderStatsGrid). */
        .rank-badge {
            display: inline-block;
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 12px;
            font-weight: 600;
            margin-left: 8px;
        }

        /* Badge color for position #1. */
        .rank-1 {
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
            color: white;
        }

        /* Badge color for position #2. */
        .rank-2 {
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
            color: white;
        }

        /* Badge color for position #3. */
        .rank-3 {
            background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
            color: white;
        }

        /* Floating language toggle pinned to the top-left corner of the viewport. */
        .lang-switcher {
            position: fixed;
            top: 20px;
            left: 20px;
            z-index: 1000;
            background: white;
            /* Neutralise native control chrome so the pill looks the same whether
               this class is applied to a generic element or to a form control. */
            border: 0;
            font: inherit;
            border-radius: 30px;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
            padding: 8px 12px;
            display: flex;
            align-items: center;
            gap: 8px;
            cursor: pointer;
            /* Only the two properties changed by :hover are animated (same pattern as .stat-card). */
            transition: box-shadow 0.3s ease, transform 0.3s ease;
            /* Prefixed form first for WebKit, matching the prefixed/unprefixed pairs used elsewhere in this sheet. */
            -webkit-user-select: none;
            user-select: none;
        }

        /* Hover feedback: deeper shadow and a slight lift. */
        .lang-switcher:hover {
            box-shadow: 0 6px 30px rgba(0, 0, 0, 0.25);
            transform: translateY(-2px);
        }

        /* Globe icon inside the toggle. */
        .lang-switcher .lang-icon {
            font-size: 20px;
        }

        /* Text label inside the toggle (#langText, updated by updateLanguage). */
        .lang-switcher .lang-text {
            font-size: 14px;
            font-weight: 600;
            color: #1d1d1f;
        }

        /* Logo strip rendered below the card. */
        .footer-logo {
            padding: 40px;
            margin-top: 40px;
            text-align: center;
            background: transparent;
        }

        /* Logo image: scales with its container up to 300px and is dimmed until hovered. */
        .footer-logo img {
            max-width: 300px;
            width: 100%;
            height: auto;
            opacity: 0.6;
            transition: opacity 0.3s ease;
        }

        .footer-logo img:hover {
            opacity: 0.9;
        }

        /* Narrow viewports (<= 768px): smaller logo and tighter spacing. */
        @media (max-width: 768px) {
            .footer-logo {
                padding: 30px 20px;
                margin-top: 30px;
            }
            
            .footer-logo img {
                max-width: 200px;
            }
        }
    </style>
 </head>
 <body>
    <!-- Language toggle. A native button is keyboard-focusable, activates on Enter/Space,
         and is announced as a control by assistive technology. -->
    <button type="button" class="lang-switcher" onclick="toggleLanguage()">
        <span class="lang-icon" aria-hidden="true">🌐</span>
        <span class="lang-text" id="langText">中文</span>
    </button>

    <div class="container">
        <div class="card">
            <!-- Page banner; class kept so the existing .header rules still apply. -->
            <header class="header">
                <h1 id="mainTitle">Qwen3-VL Performance Benchmark</h1>
                <p id="mainSubtitle">Comprehensive evaluation across multiple vision-language tasks</p>
            </header>

            <div class="stats-section">
                <!-- Real heading so the section shows up in the document outline;
                     .stats-title already sets size, weight and margin explicitly. -->
                <h2 class="stats-title" id="statsTitle">🏆 Model Performance Rankings</h2>
                <!-- Filled by renderStatsGrid(). -->
                <div class="stats-grid" id="statsGrid"></div>
            </div>

            <div class="table-section">
                <table id="benchmarkTable">
                    <thead>
                        <tr>
                            <!-- scope="col" ties each header to its column for screen readers. -->
                            <th scope="col" id="thCategory">Category</th>
                            <th scope="col" id="thBenchmark">Benchmark</th>
                            <th scope="col">Qwen3-VL 4B</th>
                            <th scope="col">Qwen3-VL 8B</th>
                            <th scope="col">Qwen3-VL 235B</th>
                            <th scope="col">Gemini2.5</th>
                            <th scope="col">GPT5-Nano</th>
                        </tr>
                    </thead>
                    <!-- Rows are appended by the table-rendering script. -->
                    <tbody id="tableBody"></tbody>
                </table>
            </div>

            <div class="notes-section">
                <h3 id="notesTitle">📝 Notes</h3>
                <ul id="notesList">
                    <li id="note1">Highlighted scores indicate the best performance among all models for that benchmark</li>
                    <li id="note2">The ranking section shows how many times each model achieved the highest score</li>
                    <li id="note3">Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version</li>
                    <li id="note4">Results on video understanding are measured using a 256k-token context, handling up to 2048 frames</li>
                    <li id="note5">"—" indicates data not available for that benchmark</li>
                </ul>
            </div>
        </div>

        <!-- Below-the-fold logo: loaded lazily and decoded off the main thread. -->
        <footer class="footer-logo">
            <img src="./assets/images/kcores-llm-arena-logo-black.png" alt="KCORES LLM Arena Logo" loading="lazy" decoding="async">
        </footer>
    </div>

    <script>
        // Language data
        // currentLang is the key of the active dictionary in i18n ('en' or 'zh').
        let currentLang = 'en';
        
        // Translation dictionaries, one per language.
        //  - langText .. note5: each key equals the id of the element whose text
        //    updateLanguage() replaces with that value.
        //  - langText is the label on the switcher, i.e. the name of the OTHER language.
        //  - firstPlaceWins / eloRating: captions used by renderStatsGrid().
        //  - topAmongQwen / vs: fragments used by getEloDiff().
        const i18n = {
            en: {
                langText: '中文',
                mainTitle: 'Qwen3-VL Performance Benchmark',
                mainSubtitle: 'Comprehensive evaluation across multiple vision-language tasks',
                statsTitle: '🏆 Model Performance Rankings',
                thCategory: 'Category',
                thBenchmark: 'Benchmark',
                notesTitle: '📝 Notes',
                note1: 'Highlighted scores indicate the best performance among all models for that benchmark',
                note2: 'The ranking section shows how many times each model achieved the highest score',
                note3: 'Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version',
                note4: 'Results on video understanding are measured using a 256k-token context, handling up to 2048 frames',
                note5: '"—" indicates data not available for that benchmark',
                firstPlaceWins: 'First Place Wins',
                eloRating: 'Elo Rating',
                topAmongQwen: 'Top among Qwen models',
                vs: 'vs'
            },
            zh: {
                langText: 'English',
                mainTitle: 'Qwen3-VL 性能基准测试',
                mainSubtitle: '跨多个视觉语言任务的综合评估',
                statsTitle: '🏆 模型性能排名',
                thCategory: '分类',
                thBenchmark: '基准测试',
                notesTitle: '📝 说明',
                note1: '高亮显示的分数表示该基准测试中所有模型的最佳性能',
                note2: '排名部分显示每个模型获得最高分的次数',
                note3: 'Qwen3-VL 235B-A22B Thinking 是大规模思维模型版本',
                note4: '视频理解结果使用 256k token 上下文进行测量，可处理多达 2048 帧',
                note5: '"—" 表示该基准测试的数据不可用',
                firstPlaceWins: '第一名次数',
                eloRating: 'Elo 评分',
                topAmongQwen: 'Qwen 模型中最高',
                vs: '对比'
            }
        };

        // Switch the active language to the other one ('en' <-> 'zh') and
        // refresh every translated piece of the page.
        function toggleLanguage() {
            if (currentLang === 'en') {
                currentLang = 'zh';
            } else {
                currentLang = 'en';
            }
            updateLanguage();
        }

        /**
         * Apply the dictionary of the active language (`currentLang`) to the page.
         *
         * Updates, in order:
         *  - the `lang` attribute of <html> and the document title, so assistive
         *    technology and the browser tab match the visible language;
         *  - every static element whose id is also a key of the dictionary;
         *  - the stats grid, whose captions are language dependent.
         */
        function updateLanguage() {
            const t = i18n[currentLang];

            // The dictionary keys ('en', 'zh') are valid language tags.
            document.documentElement.lang = currentLang;
            document.title = t.mainTitle;

            // Update static texts: element id === dictionary key.
            const staticTextIds = [
                'langText',
                'mainTitle',
                'mainSubtitle',
                'statsTitle',
                'thCategory',
                'thBenchmark',
                'notesTitle',
                'note1',
                'note2',
                'note3',
                'note4',
                'note5'
            ];
            staticTextIds.forEach(id => {
                const element = document.getElementById(id);
                // A missing element must not prevent the remaining texts and the
                // stats grid from being refreshed.
                if (element) {
                    element.textContent = t[id];
                }
            });

            // Re-render stats grid
            renderStatsGrid();
        }

        // Benchmark results, one entry per table row.
        //   category  - group label; the table script prints it only on the first row of a run
        //               of consecutive entries sharing the same category.
        //   benchmark - benchmark name shown in the second column.
        //   scores    - one value per model, index-aligned with modelNames and with the
        //               score columns of the table header; null means no data (rendered as "—").
        // Scores are only ever compared within a single row (wins, Elo, best-score highlight),
        // so rows may use different scales (e.g. the OCRBench values are in the hundreds).
        const benchmarkData = [
            { category: "STEM & Puzzle", benchmark: "MMMU_VAL", scores: [70.8, 74.1, 80.6, 73.4, 75.8] },
            { category: "STEM & Puzzle", benchmark: "MMMU_Pro", scores: [57.0, 60.4, 69.3, 59.7, 57.2] },
            { category: "STEM & Puzzle", benchmark: "MathVista_mini", scores: [79.5, 81.4, 85.8, 72.8, 71.5] },
            { category: "STEM & Puzzle", benchmark: "MathVision", scores: [60.0, 62.7, 74.6, 52.1, 62.2] },
            { category: "STEM & Puzzle", benchmark: "MathVerse_mini", scores: [75.2, 77.7, 85.0, 69.6, 74.2] },
            { category: "STEM & Puzzle", benchmark: "ZEROBench", scores: [null, null, 4.0, null, null] },
            { category: "STEM & Puzzle", benchmark: "ZEROBench_Sub", scores: [null, null, 27.7, null, null] },
            { category: "STEM & Puzzle", benchmark: "VisuLogic", scores: [null, null, 34.4, null, null] },
            { category: "STEM & Puzzle", benchmark: "MMBenchDEV_EN_V1.1", scores: [86.7, 87.5, 90.6, 82.7, 80.3] },
            
            { category: "General VQA", benchmark: "RealWorldQA", scores: [73.2, 73.5, 81.3, 72.2, 71.8] },
            { category: "General VQA", benchmark: "MMStar", scores: [73.2, 75.3, 78.7, 69.1, 68.6] },
            { category: "General VQA", benchmark: "SimpleVQA", scores: [48.8, 49.6, 61.3, 54.1, 46.0] },
            
            { category: "Subjective Experience", benchmark: "HallusionBench", scores: [64.1, 65.4, 66.7, 64.5, 58.4] },
            { category: "Subjective Experience", benchmark: "MM-MT-Bench", scores: [7.7, 8.0, 8.5, 7.7, 6.6] },
            { category: "Subjective Experience", benchmark: "MIABench", scores: [91.0, 91.5, 92.7, 91.6, 89.9] },
            { category: "Subjective Experience", benchmark: "MMLongBench-Doc", scores: [44.4, 48.0, 56.2, 46.5, 31.8] },
            { category: "Subjective Experience", benchmark: "DocVQA_TEST", scores: [94.2, 95.3, 96.5, 92.5, 88.2] },
            { category: "Subjective Experience", benchmark: "InfoVQA_TEST", scores: [83.0, 86.0, 89.5, 81.5, 68.6] },
            
            { category: "Text Recognition", benchmark: "AI2D_TEST", scores: [84.9, 84.9, 89.2, 85.7, 81.9] },
            { category: "Text Recognition", benchmark: "OCRBench", scores: [808.0, 819.0, 875.0, 825.0, 753.0] },
            { category: "Text Recognition", benchmark: "OCRBenchV2", scores: [58.8, 61.55, 65.15, 47.75, 40.85] },
            { category: "Text Recognition", benchmark: "CC_OCR", scores: [null, null, 81.5, null, null] },
            { category: "Text Recognition", benchmark: "CharXiv_(RQ)", scores: [50.3, 53.0, 66.1, 56.1, 50.1] },
            { category: "Text Recognition", benchmark: "RefCOCO-avg", scores: [null, null, 92.4, null, null] },
            { category: "Text Recognition", benchmark: "CountBench", scores: [89.4, 91.5, 93.7, 79.2, 80.0] },
            
            { category: "2D/3D Grounding", benchmark: "ODinW13", scores: [39.4, 39.8, 43.2, null, null] },
            { category: "2D/3D Grounding", benchmark: "ARKitScenes", scores: [46.3, 46.6, 53.7, null, null] },
            { category: "2D/3D Grounding", benchmark: "Hypersim", scores: [11.9, 12.0, 11.0, null, null] },
            { category: "2D/3D Grounding", benchmark: "SUNRGBD", scores: [28.0, 30.4, 34.9, null, null] },
            { category: "2D/3D Grounding", benchmark: "Objectron", scores: [null, null, 71.2, null, null] },
            
            { category: "Multi-Image", benchmark: "BLINK", scores: [63.4, 64.7, 67.1, 64.4, 58.3] },
            { category: "Multi-Image", benchmark: "MUIRBENCH", scores: [75.0, 76.8, 80.1, 64.0, 65.7] },
            { category: "Multi-Image", benchmark: "ERQA", scores: [47.3, 46.8, 52.5, 44.3, 45.8] },
            
            { category: "Embodied & Spatial", benchmark: "EmbSpatialBench", scores: [80.7, 81.1, 84.3, 66.1, 74.2] },
            { category: "Embodied & Spatial", benchmark: "RefSpatialBench", scores: [45.3, 44.6, 69.9, 11.2, 12.6] },
            { category: "Embodied & Spatial", benchmark: "RoboSpatialHome", scores: [63.2, 62.0, 73.9, 50.3, 46.1] },
            { category: "Embodied & Spatial", benchmark: "VSI-Bench", scores: [55.2, 56.6, null, 30.3, 15.4] },
            { category: "Embodied & Spatial", benchmark: "MVBench", scores: [69.3, 69.0, null, null, null] },
            
            { category: "Video", benchmark: "VideoMME", scores: [68.9, 71.8, 79.0, 72.7, 66.2] },
            { category: "Video", benchmark: "MLVU", scores: [75.7, 75.1, 83.8, 78.5, 69.2] },
            { category: "Video", benchmark: "LVBench", scores: [53.5, 55.8, 63.6, 60.9, null] },
            { category: "Video", benchmark: "CharadesSTA", scores: [59.0, 59.9, 63.5, null, null] },
            { category: "Video", benchmark: "VideoMMU", scores: [69.4, 72.8, 80.0, 69.2, 63.0] },
            { category: "Video", benchmark: "ScreenSpot", scores: [92.9, 93.6, 95.4, null, null] },
            { category: "Video", benchmark: "ScreenSpot Pro", scores: [49.2, 46.6, 61.8, null, null] },
            
            { category: "Agent", benchmark: "OSWorldG", scores: [53.9, 56.7, 68.3, null, null] },
            { category: "Agent", benchmark: "OSWorld", scores: [31.4, 33.9, 38.1, null, null] },
            { category: "Agent", benchmark: "AndroidWorld", scores: [52.0, 50.0, null, null, null] },
            
            { category: "Coding", benchmark: "Design2Code", scores: [null, null, 93.4, null, null] },
            
            { category: "Fine-grained", benchmark: "V*", scores: [74.9, 77.5, null, 70.2, null] },
            { category: "Fine-grained", benchmark: "HRBench4K", scores: [73.5, 72.4, null, 77.8, null] },
            { category: "Fine-grained", benchmark: "HRBench8K", scores: [67.1, 68.1, null, 75.5, null] }
        ];

        // Display names shown on the stat cards. The position of a name is the
        // model index used everywhere else (scores[], wins[], eloRatings[]).
        const modelNames = [
            "Qwen3-VL 4B Thinking",
            "Qwen3-VL 8B Thinking",
            "Qwen3-VL 235B-A22B",
            "Gemini2.5-Flash-Lite",
            "GPT5-Nano High"
        ];

        // Calculate wins for each model.
        // wins[i] = number of benchmarks on which model i holds the highest
        // available score. Sized from modelNames so that adding a model cannot
        // leave a counter undefined (undefined++ would yield NaN).
        const wins = new Array(modelNames.length).fill(0);

        benchmarkData.forEach(row => {
            // Keep the model index next to each score; missing scores do not compete.
            const validScores = row.scores.map((score, idx) => ({ score, idx }))
                .filter(item => item.score !== null);

            if (validScores.length > 0) {
                const maxScore = Math.max(...validScores.map(item => item.score));
                // Every model tied at the maximum is credited with a win.
                validScores.forEach(item => {
                    if (item.score === maxScore) {
                        wins[item.idx]++;
                    }
                });
            }
        });

        /**
         * Compute one Elo rating per model by replaying every benchmark as a
         * series of pairwise matches.
         *
         * Each rating starts at 1500 and moves with K = 32. Within a benchmark,
         * every pair of models that both have a score plays once: the higher
         * score wins (1 / 0), equal scores draw (0.5 / 0.5). Ratings are updated
         * immediately after each match, so the result depends on the order of
         * rows and of models within a row.
         *
         * @param {Array<{scores: Array<number|null>}>} benchmarkData rows to replay;
         *        null scores are skipped, rows with fewer than two scores are ignored.
         * @param {number} numModels number of ratings to produce.
         * @returns {number[]} ratings rounded to integers, indexed like `scores`.
         */
        function calculateElo(benchmarkData, numModels) {
            const K = 32;
            const ratings = new Array(numModels).fill(1500);

            for (const row of benchmarkData) {
                // Collect the models that actually have a score on this benchmark.
                const entrants = [];
                row.scores.forEach((score, idx) => {
                    if (score !== null) {
                        entrants.push({ score, idx });
                    }
                });

                if (entrants.length < 2) {
                    continue;
                }

                // Round-robin over all unordered pairs.
                for (let a = 0; a < entrants.length - 1; a++) {
                    for (let b = a + 1; b < entrants.length; b++) {
                        const first = entrants[a];
                        const second = entrants[b];

                        // Win probabilities implied by the current ratings.
                        const expectedFirst = 1 / (1 + Math.pow(10, (ratings[second.idx] - ratings[first.idx]) / 400));
                        const expectedSecond = 1 / (1 + Math.pow(10, (ratings[first.idx] - ratings[second.idx]) / 400));

                        // Match result: draw unless one score is strictly higher.
                        let resultFirst = 0.5;
                        if (first.score > second.score) {
                            resultFirst = 1;
                        } else if (first.score < second.score) {
                            resultFirst = 0;
                        }
                        const resultSecond = 1 - resultFirst;

                        ratings[first.idx] += K * (resultFirst - expectedFirst);
                        ratings[second.idx] += K * (resultSecond - expectedSecond);
                    }
                }
            }

            return ratings.map(value => Math.round(value));
        }
        
        // Elo rating per model, index-aligned with modelNames. The count comes
        // from modelNames so the two can never disagree.
        const eloRatings = calculateElo(benchmarkData, modelNames.length);

        // Create ranking data: one record per model, ordered by first-place wins
        // (descending). Models with equal wins keep their modelNames order.
        const rankings = modelNames.map((name, idx) => ({ 
            name, 
            wins: wins[idx], 
            elo: eloRatings[idx],
            modelIdx: idx
        }))
            .sort((a, b) => b.wins - a.wins);
        
        // Calculate Elo differences between Qwen models: the three Qwen entries
        // sorted by Elo (descending), so qwenModels[0] is the strongest one.
        const qwenModels = [
            { name: "Qwen3-VL 4B", idx: 0, elo: eloRatings[0] },
            { name: "Qwen3-VL 8B", idx: 1, elo: eloRatings[1] },
            { name: "Qwen3-VL 235B", idx: 2, elo: eloRatings[2] }
        ].sort((a, b) => b.elo - a.elo);
        
        /**
         * Describe how a Qwen model's Elo compares with the best Qwen model,
         * in the active language.
         *
         * @param {number} modelIdx index of the model (as in modelNames).
         * @returns {string} the "top among Qwen" label when the model has the
         *          highest Qwen rating, "-<gap> <vs> <top model name>" when it is
         *          below, or "" for any model that is not listed in qwenModels.
         */
        function getEloDiff(modelIdx) {
            const t = i18n[currentLang];
            const topQwen = qwenModels[0];

            // Membership comes from qwenModels itself rather than a magic index bound.
            const current = qwenModels.find(model => model.idx === modelIdx);
            if (!current) {
                return "";
            }

            if (current.elo === topQwen.elo) {
                return t.topAmongQwen;
            }

            // qwenModels is sorted by Elo descending, so the gap is always positive
            // here and the sign is always '-'.
            return `-${topQwen.elo - current.elo} ${t.vs} ${topQwen.name}`;
        }

        /**
         * Render stats grid: rebuild the ranking cards inside #statsGrid using
         * the captions of the active language.
         *
         * One card is produced per entry of `rankings`, in order; the first three
         * cards get a rank badge (#1..#3). The markup is assembled in memory and
         * written to the DOM once, instead of appending to innerHTML per card
         * (which re-serialises and re-parses the whole grid each time).
         */
        function renderStatsGrid() {
            const t = i18n[currentLang];
            const statsGrid = document.getElementById('statsGrid');

            const cards = rankings.map((model, idx) => {
                const rankBadge = idx < 3 ? `<span class="rank-badge rank-${idx + 1}">#${idx + 1}</span>` : '';
                const eloDiff = getEloDiff(model.modelIdx);
                // The comparison line is omitted when there is nothing to say.
                const eloDiffHtml = eloDiff ? `<div class="elo-diff">${eloDiff}</div>` : '';

                return `
                    <div class="stat-card">
                        <div class="model-name">${model.name}${rankBadge}</div>
                        <div class="win-count">${model.wins}</div>
                        <div class="win-label">${t.firstPlaceWins}</div>
                        <div class="elo-score">${model.elo}</div>
                        <div class="elo-label">${t.eloRating}</div>
                        ${eloDiffHtml}
                    </div>
                `;
            });

            statsGrid.innerHTML = cards.join('');
        }
        
        // Initial render
        renderStatsGrid();

        // Render table: append one <tr> to #tableBody per benchmarkData entry.
        // Row layout: category cell, benchmark cell, then one score cell per model.
        const tableBody = document.getElementById('tableBody');
        // Category of the previously rendered row; used to print each category label once.
        let currentCategory = '';

        for (const entry of benchmarkData) {
            const rowEl = document.createElement('tr');

            // Category column: text only on the first row of a run with the same category.
            const categoryTd = document.createElement('td');
            categoryTd.className = 'category-cell';
            const startsNewCategory = entry.category !== currentCategory;
            if (startsNewCategory) {
                categoryTd.textContent = entry.category;
                currentCategory = entry.category;
            }
            rowEl.appendChild(categoryTd);

            // Benchmark name column.
            const nameTd = document.createElement('td');
            nameTd.className = 'benchmark-cell';
            nameTd.textContent = entry.benchmark;
            rowEl.appendChild(nameTd);

            // Highest available score of the row (null when the row has no scores at all).
            const available = entry.scores.filter(value => value !== null);
            const best = available.length > 0 ? Math.max(...available) : null;

            // Score columns, in model order.
            entry.scores.forEach(value => {
                const scoreTd = document.createElement('td');
                scoreTd.className = 'score-cell';

                if (value === null) {
                    // Missing data: greyed-out dash.
                    scoreTd.className += ' na-cell';
                    scoreTd.textContent = '—';
                } else if (best !== null && value === best) {
                    // Row maximum (ties included): wrap in the highlight pill.
                    scoreTd.innerHTML = `<span class="best-score">${value}</span>`;
                } else {
                    scoreTd.textContent = value;
                }

                rowEl.appendChild(scoreTd);
            });

            tableBody.appendChild(rowEl);
        }
    </script>
 </body>
 </html>
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Qwen3-VL Performance Benchmark</title>
	<style>
	* {
	margin: 0;
	padding: 0;
	box-sizing: border-box;
	}

	body {
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
	background: #ffffff;
	min-height: 100vh;
	padding: 40px 20px;
	color: #1d1d1f;
	}

	.container {
	max-width: 1400px;
	margin: 0 auto;
	}

	.card {
	background: rgba(255, 255, 255, 0.95);
	backdrop-filter: blur(10px);
	border-radius: 20px;
	box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
	overflow: hidden;
	}

	.header {
	background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
	color: white;
	padding: 40px;
	text-align: center;
	}

	.header h1 {
	font-size: 42px;
	font-weight: 700;
	margin-bottom: 10px;
	letter-spacing: -0.5px;
	}

	.header p {
	font-size: 18px;
	opacity: 0.9;
	font-weight: 400;
	}

	.stats-section {
	padding: 40px;
	background: #f5f5f7;
	border-bottom: 1px solid #d2d2d7;
	}

	.stats-title {
	font-size: 24px;
	font-weight: 600;
	margin-bottom: 24px;
	color: #1d1d1f;
	}

	.stats-grid {
	display: grid;
	grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
	gap: 20px;
	}

	.stat-card {
	background: white;
	padding: 24px;
	border-radius: 12px;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
	transition: transform 0.3s ease, box-shadow 0.3s ease;
	}

	.stat-card:hover {
	transform: translateY(-4px);
	box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12);
	}

	.stat-card .model-name {
	font-size: 16px;
	font-weight: 600;
	color: #1d1d1f;
	margin-bottom: 8px;
	}

	.stat-card .win-count {
	font-size: 36px;
	font-weight: 700;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	}

	.stat-card .win-label {
	font-size: 14px;
	color: #86868b;
	margin-top: 4px;
	}

	.stat-card .elo-score {
	font-size: 20px;
	font-weight: 600;
	color: #667eea;
	margin-top: 12px;
	padding-top: 12px;
	border-top: 1px solid #e5e5e7;
	}

	.stat-card .elo-label {
	font-size: 12px;
	color: #86868b;
	margin-top: 4px;
	}

	.stat-card .elo-diff {
	font-size: 11px;
	color: #86868b;
	margin-top: 6px;
	font-style: italic;
	}

	.table-section {
	padding: 40px;
	overflow-x: auto;
	}

	table {
	width: 100%;
	border-collapse: collapse;
	font-size: 14px;
	}

	thead {
	background: linear-gradient(135deg, #8e9eab 0%, #6d7a86 100%);
	color: white;
	position: sticky;
	top: 0;
	z-index: 10;
	}

	thead th {
	padding: 16px 12px;
	text-align: left;
	font-weight: 600;
	font-size: 13px;
	letter-spacing: 0.5px;
	white-space: nowrap;
	}

	tbody tr {
	border-bottom: 1px solid #e5e5e7;
	transition: background-color 0.2s ease;
	}

	tbody tr:hover {
	background-color: #f5f5f7;
	}

	tbody td {
	padding: 14px 12px;
	color: #1d1d1f;
	}

	.category-cell {
	font-weight: 600;
	color: #6d7a86;
	background: #f9f9fb;
	}

	.benchmark-cell {
	font-weight: 500;
	}

	.score-cell {
	text-align: center;
	font-weight: 500;
	font-variant-numeric: tabular-nums;
	}

	.best-score {
	background: linear-gradient(135deg, #ffd89b 0%, #19547b 100%);
	color: white;
	font-weight: 700;
	border-radius: 6px;
	padding: 6px 8px;
	display: inline-block;
	min-width: 60px;
	box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
	}

	.na-cell {
	color: #d2d2d7;
	text-align: center;
	}

	.notes-section {
	padding: 40px;
	background: #f9f9fb;
	font-size: 14px;
	color: #6e6e73;
	line-height: 1.8;
	}

	.notes-section h3 {
	font-size: 18px;
	color: #1d1d1f;
	margin-bottom: 16px;
	font-weight: 600;
	}

	.notes-section ul {
	list-style: none;
	padding-left: 0;
	}

	.notes-section li {
	padding-left: 24px;
	position: relative;
	margin-bottom: 8px;
	}

	/* Custom colored bullet, positioned inside the list item's left padding.
	   Uses the CSS3 double-colon pseudo-element syntax. */
	.notes-section li::before {
		content: "•";
		position: absolute;
		left: 8px;
		color: #667eea;
		font-weight: bold;
	}

	@media (max-width: 768px) {
	.header h1 {
	font-size: 28px;
	}

	.header p {
	font-size: 16px;
	}

	.stats-section,
	.table-section,
	.notes-section {
	padding: 24px;
	}

	table {
	font-size: 12px;
	}

	thead th,
	tbody td {
	padding: 10px 8px;
	}
	}

	.rank-badge {
	display: inline-block;
	padding: 4px 12px;
	border-radius: 20px;
	font-size: 12px;
	font-weight: 600;
	margin-left: 8px;
	}

	.rank-1 {
	background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
	color: white;
	}

	.rank-2 {
	background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
	color: white;
	}

	.rank-3 {
	background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
	color: white;
	}

	/* Floating language toggle pinned to the top-left corner of the viewport. */
	.lang-switcher {
		position: fixed;
		top: 20px;
		left: 20px;
		z-index: 1000;
		background: white;
		/* Neutralise native control chrome so the pill looks the same whether
		   this class is applied to a generic element or to a form control. */
		border: 0;
		font: inherit;
		border-radius: 30px;
		box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
		padding: 8px 12px;
		display: flex;
		align-items: center;
		gap: 8px;
		cursor: pointer;
		/* Only the two properties changed by :hover are animated (same pattern as .stat-card). */
		transition: box-shadow 0.3s ease, transform 0.3s ease;
		/* Prefixed form first for WebKit, matching the prefixed/unprefixed pairs used elsewhere in this sheet. */
		-webkit-user-select: none;
		user-select: none;
	}

	.lang-switcher:hover {
	box-shadow: 0 6px 30px rgba(0, 0, 0, 0.25);
	transform: translateY(-2px);
	}

	.lang-switcher .lang-icon {
	font-size: 20px;
	}

	.lang-switcher .lang-text {
	font-size: 14px;
	font-weight: 600;
	color: #1d1d1f;
	}

	/* Centred logo strip rendered after the main card. */
	.footer-logo {
		margin-top: 40px;
		padding: 40px;
		background: transparent;
		text-align: center;
	}

	/* The logo scales with its container up to 300px and is dimmed until hovered. */
	.footer-logo img {
		width: 100%;
		max-width: 300px;
		height: auto;
		opacity: 0.6;
		transition: opacity 0.3s ease;
	}

	.footer-logo img:hover { opacity: 0.9; }

	/* Viewports up to 768px wide: shrink the footer logo and its surrounding space. */
	@media (max-width: 768px) {
		.footer-logo {
			margin-top: 30px;
			padding: 30px 20px;
		}

		.footer-logo img { max-width: 200px; }
	}
	</style>
	</head>
	<body>
	<!--
		Language toggle. It stays a div so the existing .lang-switcher styling applies
		unchanged, but it is exposed as a button to assistive technology, is reachable
		with Tab, and reacts to Enter/Space as well as to mouse clicks.
	-->
	<div class="lang-switcher" role="button" tabindex="0" onclick="toggleLanguage()" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); toggleLanguage(); }">
		<span class="lang-icon" aria-hidden="true">🌐</span>
		<span class="lang-text" id="langText">中文</span>
	</div>

	<div class="container">
		<div class="card">
			<div class="header">
				<h1 id="mainTitle">Qwen3-VL Performance Benchmark</h1>
				<p id="mainSubtitle">Comprehensive evaluation across multiple vision-language tasks</p>
			</div>

			<!-- The ranking cards inside #statsGrid are generated by renderStatsGrid() in the script below. -->
			<div class="stats-section">
				<div class="stats-title" id="statsTitle">🏆 Model Performance Rankings</div>
				<div class="stats-grid" id="statsGrid"></div>
			</div>

			<!--
				Benchmark table. The rows of #tableBody are generated by the script below.
				The table takes its accessible name from the page title, and every header
				cell is marked as a column header.
			-->
			<div class="table-section">
				<table id="benchmarkTable" aria-labelledby="mainTitle">
					<thead>
						<tr>
							<th id="thCategory" scope="col">Category</th>
							<th id="thBenchmark" scope="col">Benchmark</th>
							<th scope="col">Qwen3-VL 4B</th>
							<th scope="col">Qwen3-VL 8B</th>
							<th scope="col">Qwen3-VL 235B</th>
							<th scope="col">Gemini2.5</th>
							<th scope="col">GPT5-Nano</th>
						</tr>
					</thead>
					<tbody id="tableBody"></tbody>
				</table>
			</div>

			<!-- The text of each note is replaced by updateLanguage() when the language is toggled. -->
			<div class="notes-section">
				<h3 id="notesTitle">📝 Notes</h3>
				<ul id="notesList">
					<li id="note1">Highlighted scores indicate the best performance among all models for that benchmark</li>
					<li id="note2">The ranking section shows how many times each model achieved the highest score</li>
					<li id="note3">Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version</li>
					<li id="note4">Results on video understanding are measured using a 256k-token context, handling up to 2048 frames</li>
					<li id="note5">"—" indicates data not available for that benchmark</li>
				</ul>
			</div>
		</div>

		<div class="footer-logo">
			<img src="./assets/images/kcores-llm-arena-logo-black.png" alt="KCORES LLM Arena Logo">
		</div>
	</div>

	<script>
	// Language data
	// Code of the language currently displayed; toggleLanguage() flips it between 'en' and 'zh'.
	let currentLang = 'en';

	// UI strings per language code.
	// - Keys langText through note5 are also the ids of the elements that updateLanguage() writes them into.
	// - firstPlaceWins and eloRating are read by renderStatsGrid(); topAmongQwen and vs by getEloDiff().
	// - langText is the switcher label, i.e. the name of the language the page will switch TO.
	const i18n = {
	en: {
	langText: '中文',
	mainTitle: 'Qwen3-VL Performance Benchmark',
	mainSubtitle: 'Comprehensive evaluation across multiple vision-language tasks',
	statsTitle: '🏆 Model Performance Rankings',
	thCategory: 'Category',
	thBenchmark: 'Benchmark',
	notesTitle: '📝 Notes',
	note1: 'Highlighted scores indicate the best performance among all models for that benchmark',
	note2: 'The ranking section shows how many times each model achieved the highest score',
	note3: 'Qwen3-VL 235B-A22B Thinking is the large-scale thinking model version',
	note4: 'Results on video understanding are measured using a 256k-token context, handling up to 2048 frames',
	note5: '"—" indicates data not available for that benchmark',
	firstPlaceWins: 'First Place Wins',
	eloRating: 'Elo Rating',
	topAmongQwen: 'Top among Qwen models',
	vs: 'vs'
	},
	zh: {
	langText: 'English',
	mainTitle: 'Qwen3-VL 性能基准测试',
	mainSubtitle: '跨多个视觉语言任务的综合评估',
	statsTitle: '🏆 模型性能排名',
	thCategory: '分类',
	thBenchmark: '基准测试',
	notesTitle: '📝 说明',
	note1: '高亮显示的分数表示该基准测试中所有模型的最佳性能',
	note2: '排名部分显示每个模型获得最高分的次数',
	note3: 'Qwen3-VL 235B-A22B Thinking 是大规模思维模型版本',
	note4: '视频理解结果使用 256k token 上下文进行测量，可处理多达 2048 帧',
	note5: '"—" 表示该基准测试的数据不可用',
	firstPlaceWins: '第一名次数',
	eloRating: 'Elo 评分',
	topAmongQwen: 'Qwen 模型中最高',
	vs: '对比'
	}
	};

	// Click handler of the language switcher: swap between English and Chinese, then repaint the page text.
	function toggleLanguage() {
		if (currentLang === 'en') {
			currentLang = 'zh';
		} else {
			currentLang = 'en';
		}
		updateLanguage();
	}

	/**
	 * Apply the strings of the language selected by `currentLang` to the page.
	 *
	 * Updates the document's `lang` attribute, rewrites every static label and
	 * then re-renders the stats grid, whose cards contain translated labels too.
	 */
	function updateLanguage() {
		const t = i18n[currentLang];

		// Keep <html lang> in sync with the visible text so that assistive technology
		// and the browser treat the content as the language actually shown.
		document.documentElement.lang = currentLang === 'zh' ? 'zh-Hans' : 'en';

		// Update static texts: each of these i18n keys is also the id of its target element.
		const staticKeys = [
			'langText',
			'mainTitle',
			'mainSubtitle',
			'statsTitle',
			'thCategory',
			'thBenchmark',
			'notesTitle',
			'note1',
			'note2',
			'note3',
			'note4',
			'note5'
		];
		staticKeys.forEach(key => {
			document.getElementById(key).textContent = t[key];
		});

		// Re-render stats grid
		renderStatsGrid();
	}

	// One entry per benchmark row of the table.
	// - category: group label; the table prints it only on the first row of each consecutive run.
	// - benchmark: name shown in the second column.
	// - scores: one value per model, in the same order as modelNames; null means no score
	//   (rendered as "—" and ignored by the wins tally and the Elo calculation).
	// NOTE(review): some category assignments look questionable (e.g. "MMBenchDEV_EN_V1.1" under
	// "STEM & Puzzle", "ScreenSpot" under "Video", "DocVQA_TEST" under "Subjective Experience");
	// confirm the grouping against the upstream results table.
	const benchmarkData = [
	{ category: "STEM & Puzzle", benchmark: "MMMU_VAL", scores: [70.8, 74.1, 80.6, 73.4, 75.8] },
	{ category: "STEM & Puzzle", benchmark: "MMMU_Pro", scores: [57.0, 60.4, 69.3, 59.7, 57.2] },
	{ category: "STEM & Puzzle", benchmark: "MathVista_mini", scores: [79.5, 81.4, 85.8, 72.8, 71.5] },
	{ category: "STEM & Puzzle", benchmark: "MathVision", scores: [60.0, 62.7, 74.6, 52.1, 62.2] },
	{ category: "STEM & Puzzle", benchmark: "MathVerse_mini", scores: [75.2, 77.7, 85.0, 69.6, 74.2] },
	{ category: "STEM & Puzzle", benchmark: "ZEROBench", scores: [null, null, 4.0, null, null] },
	{ category: "STEM & Puzzle", benchmark: "ZEROBench_Sub", scores: [null, null, 27.7, null, null] },
	{ category: "STEM & Puzzle", benchmark: "VisuLogic", scores: [null, null, 34.4, null, null] },
	{ category: "STEM & Puzzle", benchmark: "MMBenchDEV_EN_V1.1", scores: [86.7, 87.5, 90.6, 82.7, 80.3] },

	{ category: "General VQA", benchmark: "RealWorldQA", scores: [73.2, 73.5, 81.3, 72.2, 71.8] },
	{ category: "General VQA", benchmark: "MMStar", scores: [73.2, 75.3, 78.7, 69.1, 68.6] },
	{ category: "General VQA", benchmark: "SimpleVQA", scores: [48.8, 49.6, 61.3, 54.1, 46.0] },

	{ category: "Subjective Experience", benchmark: "HallusionBench", scores: [64.1, 65.4, 66.7, 64.5, 58.4] },
	{ category: "Subjective Experience", benchmark: "MM-MT-Bench", scores: [7.7, 8.0, 8.5, 7.7, 6.6] },
	{ category: "Subjective Experience", benchmark: "MIABench", scores: [91.0, 91.5, 92.7, 91.6, 89.9] },
	{ category: "Subjective Experience", benchmark: "MMLongBench-Doc", scores: [44.4, 48.0, 56.2, 46.5, 31.8] },
	{ category: "Subjective Experience", benchmark: "DocVQA_TEST", scores: [94.2, 95.3, 96.5, 92.5, 88.2] },
	{ category: "Subjective Experience", benchmark: "InfoVQA_TEST", scores: [83.0, 86.0, 89.5, 81.5, 68.6] },

	{ category: "Text Recognition", benchmark: "AI2D_TEST", scores: [84.9, 84.9, 89.2, 85.7, 81.9] },
	{ category: "Text Recognition", benchmark: "OCRBench", scores: [808.0, 819.0, 875.0, 825.0, 753.0] },
	{ category: "Text Recognition", benchmark: "OCRBenchV2", scores: [58.8, 61.55, 65.15, 47.75, 40.85] },
	{ category: "Text Recognition", benchmark: "CC_OCR", scores: [null, null, 81.5, null, null] },
	{ category: "Text Recognition", benchmark: "CharXiv_(RQ)", scores: [50.3, 53.0, 66.1, 56.1, 50.1] },
	{ category: "Text Recognition", benchmark: "RefCOCO-avg", scores: [null, null, 92.4, null, null] },
	{ category: "Text Recognition", benchmark: "CountBench", scores: [89.4, 91.5, 93.7, 79.2, 80.0] },

	{ category: "2D/3D Grounding", benchmark: "ODinW13", scores: [39.4, 39.8, 43.2, null, null] },
	{ category: "2D/3D Grounding", benchmark: "ARKitScenes", scores: [46.3, 46.6, 53.7, null, null] },
	{ category: "2D/3D Grounding", benchmark: "Hypersim", scores: [11.9, 12.0, 11.0, null, null] },
	{ category: "2D/3D Grounding", benchmark: "SUNRGBD", scores: [28.0, 30.4, 34.9, null, null] },
	{ category: "2D/3D Grounding", benchmark: "Objectron", scores: [null, null, 71.2, null, null] },

	{ category: "Multi-Image", benchmark: "BLINK", scores: [63.4, 64.7, 67.1, 64.4, 58.3] },
	{ category: "Multi-Image", benchmark: "MUIRBENCH", scores: [75.0, 76.8, 80.1, 64.0, 65.7] },
	{ category: "Multi-Image", benchmark: "ERQA", scores: [47.3, 46.8, 52.5, 44.3, 45.8] },

	{ category: "Embodied & Spatial", benchmark: "EmbSpatialBench", scores: [80.7, 81.1, 84.3, 66.1, 74.2] },
	{ category: "Embodied & Spatial", benchmark: "RefSpatialBench", scores: [45.3, 44.6, 69.9, 11.2, 12.6] },
	{ category: "Embodied & Spatial", benchmark: "RoboSpatialHome", scores: [63.2, 62.0, 73.9, 50.3, 46.1] },
	{ category: "Embodied & Spatial", benchmark: "VSI-Bench", scores: [55.2, 56.6, null, 30.3, 15.4] },
	{ category: "Embodied & Spatial", benchmark: "MVBench", scores: [69.3, 69.0, null, null, null] },

	{ category: "Video", benchmark: "VideoMME", scores: [68.9, 71.8, 79.0, 72.7, 66.2] },
	{ category: "Video", benchmark: "MLVU", scores: [75.7, 75.1, 83.8, 78.5, 69.2] },
	{ category: "Video", benchmark: "LVBench", scores: [53.5, 55.8, 63.6, 60.9, null] },
	{ category: "Video", benchmark: "CharadesSTA", scores: [59.0, 59.9, 63.5, null, null] },
	{ category: "Video", benchmark: "VideoMMU", scores: [69.4, 72.8, 80.0, 69.2, 63.0] },
	{ category: "Video", benchmark: "ScreenSpot", scores: [92.9, 93.6, 95.4, null, null] },
	{ category: "Video", benchmark: "ScreenSpot Pro", scores: [49.2, 46.6, 61.8, null, null] },

	{ category: "Agent", benchmark: "OSWorldG", scores: [53.9, 56.7, 68.3, null, null] },
	{ category: "Agent", benchmark: "OSWorld", scores: [31.4, 33.9, 38.1, null, null] },
	{ category: "Agent", benchmark: "AndroidWorld", scores: [52.0, 50.0, null, null, null] },

	{ category: "Coding", benchmark: "Design2Code", scores: [null, null, 93.4, null, null] },

	{ category: "Fine-grained", benchmark: "V*", scores: [74.9, 77.5, null, 70.2, null] },
	{ category: "Fine-grained", benchmark: "HRBench4K", scores: [73.5, 72.4, null, 77.8, null] },
	{ category: "Fine-grained", benchmark: "HRBench8K", scores: [67.1, 68.1, null, 75.5, null] }
	];

	// Display names used on the ranking cards. Position i corresponds to scores[i] of every
	// benchmarkData row; the table header in the markup uses shortened labels in the same order.
	// NOTE(review): the third entry omits "Thinking" while note3 refers to
	// "Qwen3-VL 235B-A22B Thinking"; confirm which name is intended.
	const modelNames = [
	"Qwen3-VL 4B Thinking",
	"Qwen3-VL 8B Thinking",
	"Qwen3-VL 235B-A22B",
	"Gemini2.5-Flash-Lite",
	"GPT5-Nano High"
	];

	// Calculate wins for each model: wins[i] is the number of benchmarks on which model i
	// holds the highest reported score. Models tied for the top score each get a win, and a
	// benchmark with a single reported score counts as a win for that model.
	// The array is sized from modelNames so the tally stays aligned if a model is added.
	const wins = new Array(modelNames.length).fill(0);

	for (const row of benchmarkData) {
		const reported = row.scores.filter(score => score !== null);

		// Rows without any reported score award no win.
		if (reported.length === 0) continue;

		const best = Math.max(...reported);
		row.scores.forEach((score, idx) => {
			// null never equals a number, so missing scores are skipped here as well.
			if (score === best) {
				wins[idx]++;
			}
		});
	}

	/**
	 * Calculate Elo ratings by treating every benchmark as a round-robin between
	 * the models that have a score for it.
	 *
	 * @param {Array<{scores: Array<number|null>}>} benchmarkData rows whose `scores`
	 *        array holds one entry per model; null entries are left out of the round.
	 * @param {number} numModels number of ratings to track.
	 * @returns {number[]} ratings rounded to integers, indexed like `scores`.
	 *
	 * Ratings start at 1500 and are updated after every pairing (K-factor 32), so the
	 * result depends on the order of rows and of the pairs within a row.
	 */
	function calculateElo(benchmarkData, numModels) {
		const K = 32; // K-factor for Elo calculation
		const ratings = new Array(numModels).fill(1500);

		// Expected score of a player rated `own` against a player rated `other`.
		const expectedScore = (own, other) => 1 / (1 + Math.pow(10, (other - own) / 400));

		for (const row of benchmarkData) {
			const entrants = [];
			row.scores.forEach((score, idx) => {
				if (score !== null) {
					entrants.push({ score, idx });
				}
			});

			// A pairing needs at least two models with a score.
			if (entrants.length < 2) continue;

			for (let i = 0; i < entrants.length - 1; i++) {
				for (let j = i + 1; j < entrants.length; j++) {
					const first = entrants[i];
					const second = entrants[j];

					// Both expectations are taken before either rating is changed.
					const expectedFirst = expectedScore(ratings[first.idx], ratings[second.idx]);
					const expectedSecond = expectedScore(ratings[second.idx], ratings[first.idx]);

					// Actual result: 1 for a win, 0 for a loss, 0.5 otherwise (tie).
					let actualFirst = 0.5;
					if (first.score > second.score) {
						actualFirst = 1;
					} else if (first.score < second.score) {
						actualFirst = 0;
					}
					const actualSecond = 1 - actualFirst;

					ratings[first.idx] += K * (actualFirst - expectedFirst);
					ratings[second.idx] += K * (actualSecond - expectedSecond);
				}
			}
		}

		return ratings.map(value => Math.round(value));
	}

	// Elo rating per model, indexed like modelNames. The model count is taken from
	// modelNames instead of being hard-coded, so it cannot drift from the data.
	const eloRatings = calculateElo(benchmarkData, modelNames.length);

	// Create ranking data: one record per model, ordered by first-place wins (descending).
	// modelIdx keeps the original index so eloRatings can still be looked up after sorting.
	const rankings = modelNames
		.map((name, idx) => ({
			name,
			wins: wins[idx],
			elo: eloRatings[idx],
			modelIdx: idx
		}))
		.sort((a, b) => b.wins - a.wins);

	// Calculate Elo differences between Qwen models: the three Qwen entries ordered by
	// Elo (descending), so qwenModels[0] is the reference used by getEloDiff().
	const qwenModels = [
		{ name: "Qwen3-VL 4B", idx: 0, elo: eloRatings[0] },
		{ name: "Qwen3-VL 8B", idx: 1, elo: eloRatings[1] },
		{ name: "Qwen3-VL 235B", idx: 2, elo: eloRatings[2] }
	].sort((a, b) => b.elo - a.elo);

	/**
	 * Build the comparison line shown under a Qwen model's Elo rating.
	 *
	 * @param {number} modelIdx index of the model in modelNames / eloRatings.
	 * @returns {string} "" for models not listed in qwenModels, the translated
	 *          "top among Qwen" label for a model whose rating equals the highest
	 *          Qwen rating, otherwise "-<gap> <vs> <name of the top Qwen model>".
	 */
	function getEloDiff(modelIdx) {
		// Only for Qwen models: membership comes from qwenModels rather than an index threshold.
		const isQwen = qwenModels.some(model => model.idx === modelIdx);
		if (!isQwen) {
			return "";
		}

		const t = i18n[currentLang];
		const leader = qwenModels[0];
		const gap = leader.elo - eloRatings[modelIdx];

		if (gap === 0) {
			return t.topAmongQwen;
		}

		// leader holds the highest Qwen rating, so gap is always positive here and the
		// model is always behind; no "+" variant is needed.
		return `-${gap} ${t.vs} ${leader.name}`;
	}

	/**
	 * Render stats grid: rebuild the ranking cards inside #statsGrid in the current language.
	 *
	 * One card per entry of `rankings`, in ranking order; the first three receive a
	 * "#N" badge. The markup is assembled first and assigned once, so the grid is
	 * parsed a single time instead of being re-serialised and re-parsed per card.
	 */
	function renderStatsGrid() {
		const t = i18n[currentLang];
		const statsGrid = document.getElementById('statsGrid');

		const cards = rankings.map((model, idx) => {
			const rankBadge = idx < 3 ? `<span class="rank-badge rank-${idx + 1}">#${idx + 1}</span>` : '';
			// The comparison line exists only for Qwen models; others get no extra row.
			const eloDiff = getEloDiff(model.modelIdx);
			const eloDiffHtml = eloDiff ? `<div class="elo-diff">${eloDiff}</div>` : '';

			return `
				<div class="stat-card">
					<div class="model-name">${model.name}${rankBadge}</div>
					<div class="win-count">${model.wins}</div>
					<div class="win-label">${t.firstPlaceWins}</div>
					<div class="elo-score">${model.elo}</div>
					<div class="elo-label">${t.eloRating}</div>
					${eloDiffHtml}
				</div>
			`;
		});

		statsGrid.innerHTML = cards.join('');
	}

	// Initial render of the ranking cards.
	renderStatsGrid();

	// Render table: one <tr> per benchmarkData entry, appended to #tableBody.
	const tableBody = document.getElementById('tableBody');
	// Category of the previous row; a category label is printed only when it changes.
	let currentCategory = '';

	for (const row of benchmarkData) {
		const tr = document.createElement('tr');

		// Category cell: always present, but left empty on repeated categories.
		const categoryCell = document.createElement('td');
		categoryCell.className = 'category-cell';
		if (currentCategory !== row.category) {
			currentCategory = row.category;
			categoryCell.textContent = row.category;
		}
		tr.appendChild(categoryCell);

		// Benchmark cell
		const benchmarkCell = document.createElement('td');
		benchmarkCell.className = 'benchmark-cell';
		benchmarkCell.textContent = row.benchmark;
		tr.appendChild(benchmarkCell);

		// Highest reported score of the row, or null when no model has a score.
		const reported = row.scores.filter(value => value !== null);
		const maxScore = reported.length > 0 ? Math.max(...reported) : null;

		// Score cells, one per model.
		for (const score of row.scores) {
			const td = document.createElement('td');

			if (score === null) {
				td.className = 'score-cell na-cell';
				td.textContent = '—';
			} else if (score === maxScore) {
				// Every score equal to the row maximum is wrapped in the highlight pill.
				td.className = 'score-cell';
				const pill = document.createElement('span');
				pill.className = 'best-score';
				pill.textContent = score;
				td.appendChild(pill);
			} else {
				td.className = 'score-cell';
				td.textContent = score;
			}

			tr.appendChild(td);
		}

		tableBody.appendChild(tr);
	}
	</script>
	</body>
	</html>
<!-- No results found -->