Skip to content

Instantly share code, notes, and snippets.

@bgauryy
Created November 15, 2025 08:46
Show Gist options
  • Select an option

  • Save bgauryy/b0dd78bab39b94d39a3a281d7cd5c647 to your computer and use it in GitHub Desktop.

Select an option

Save bgauryy/b0dd78bab39b94d39a3a281d7cd5c647 to your computer and use it in GitHub Desktop.
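Measures token counts for CSV, TOON, and JSON using the cl100k_base tokenizer (the GPT-4 / GPT-3.5-turbo tokenizer; Claude uses a different tokenizer, so these counts are only an approximation for Claude)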
import { encode as toonEncoder } from '@byjohann/toon'
import { createByEncoderName } from '@microsoft/tiktokenizer'
import { stringify } from 'csv-stringify/sync'
import { writeFileSync, mkdirSync } from 'fs'
import { join } from 'path'
/**
 * Helper function to convert data to CSV format.
 *
 * Input shapes are handled in this order:
 *  1. Empty array -> ''.
 *  2. Array whose first element is a plain object -> handed to csv-stringify as-is.
 *  3. Any other array -> a single `value` column holding each item as JSON text.
 *  4. Flat object (every value is a primitive, null/undefined, or an object
 *     with no own keys) -> a single-row CSV.
 *  5. Object whose values are all arrays -> columnar table with one row per
 *     index, padded with '' up to the longest array.
 *  6. Any other object -> `key,value` rows, nested paths written as
 *     `a.b` for object members and `a[0]` for array items.
 *  7. Anything that is not an object -> `String(data)`.
 *
 * @param {*} data - Value to serialise.
 * @returns {string} CSV text.
 */
function dataToCSV(data) {
  const withHeader = { header: true }

  // Handle arrays (only the first element is inspected to pick the layout)
  if (Array.isArray(data)) {
    if (data.length === 0) return ''
    const first = data[0]
    const startsWithRecord = typeof first === 'object' && first !== null && !Array.isArray(first)
    if (startsWithRecord) {
      return stringify(data, withHeader)
    }
    // Array of primitives or mixed - convert to single column
    return stringify(data.map(item => ({ value: JSON.stringify(item) })), withHeader)
  }

  // Fallback: convert to string
  if (typeof data !== 'object' || data === null) {
    return String(data)
  }

  const keys = Object.keys(data)
  const isContainer = value => typeof value === 'object' && value !== null
  // Objects without own keys are treated like scalars; arrays never are.
  const isScalarLike = value =>
    !isContainer(value) || (!Array.isArray(value) && Object.keys(value).length === 0)

  if (keys.every(key => isScalarLike(data[key]))) {
    // Single row CSV
    return stringify([data], withHeader)
  }

  // Every value is an array: treat each key as a column. `keys` cannot be
  // empty here because an empty object already matched the flat case above.
  if (keys.every(key => Array.isArray(data[key]))) {
    const rowCount = Math.max(...keys.map(key => data[key].length))
    const toCell = value => {
      if (value === null || value === undefined) return ''
      return typeof value === 'object' ? JSON.stringify(value) : String(value)
    }
    const table = Array.from({ length: rowCount }, (_, rowIndex) =>
      Object.fromEntries(keys.map(key => [key, toCell(data[key][rowIndex])])))
    return stringify(table, withHeader)
  }

  // For complex nested structures, convert to key-value pairs
  const rows = []
  const addRow = (key, value) => {
    rows.push({ key, value })
  }
  const visit = (container, path) => {
    if (Array.isArray(container)) {
      if (container.length === 0) {
        // Keep empty arrays visible; dropping them would understate the CSV size.
        addRow(path, '[]')
        return
      }
      container.forEach((item, idx) => {
        const itemPath = `${path}[${idx}]`
        if (isContainer(item)) {
          visit(item, itemPath)
        } else {
          // Scalars inside arrays are written with String(), so null -> 'null'.
          addRow(itemPath, String(item))
        }
      })
      return
    }
    // Own enumerable properties only (same view as the Object.keys() checks above).
    const members = Object.entries(container)
    if (members.length === 0) {
      // Keep key-less objects visible as their JSON text (e.g. '{}').
      addRow(path, JSON.stringify(container) ?? '')
      return
    }
    for (const [name, value] of members) {
      const memberPath = path ? `${path}.${name}` : name
      if (value === null || value === undefined) {
        addRow(memberPath, '')
      } else if (isContainer(value)) {
        visit(value, memberPath)
      } else {
        addRow(memberPath, String(value))
      }
    }
  }
  visit(data, '')
  return stringify(rows, withHeader)
}
/**
 * Helper function to compare formats for a given data structure.
 *
 * Serialises `data` as minified JSON, TOON (tab-delimited) and CSV, counts the
 * tokens of each with `encoder`, and writes the three serialisations plus a
 * pretty-printed JSON copy into `<outputDir>/<testName>/`.
 *
 * @param {*} data - Dataset to serialise.
 * @param {string} testName - Used as the output sub-directory and echoed in the result.
 * @param {{ encode: Function }} encoder - Tokenizer; only `encode(text).length` is used.
 * @param {string} outputDir - Directory under which the per-test folder is created.
 * @returns {Promise<object>} `{ testName, minifiedJson, toon, csv }`, each format
 *   carrying `tokens`, `tokenReduction` (vs. minified JSON) and `tokenPercentage`
 *   (that reduction as a two-decimal percentage string).
 */
async function compareFormats(data, testName, encoder, outputDir) {
  const countTokens = text => encoder.encode(text).length

  // Serialise and count each format in turn: JSON is the baseline.
  const minifiedText = JSON.stringify(data)
  const baselineTokens = countTokens(minifiedText)

  const toonText = toonEncoder(data, { delimiter: '\t' })
  const toonTokens = countTokens(toonText)

  const csvText = dataToCSV(data)
  const csvTokens = countTokens(csvText)

  // Savings of a format relative to the minified-JSON baseline.
  const relativeToBaseline = tokens => {
    const tokenReduction = baselineTokens - tokens
    const tokenPercentage = ((tokenReduction / baselineTokens) * 100).toFixed(2)
    return { tokens, tokenReduction, tokenPercentage }
  }
  const toonStats = relativeToBaseline(toonTokens)
  const csvStats = relativeToBaseline(csvTokens)

  // Save files
  const testDir = join(outputDir, testName)
  mkdirSync(testDir, { recursive: true })
  const save = (fileName, contents) => {
    writeFileSync(join(testDir, fileName), contents, 'utf8')
  }
  save('minified.json', minifiedText)
  save('toon.txt', toonText)
  save('formatted.json', JSON.stringify(data, null, 2))
  save('csv.txt', csvText)

  return {
    testName,
    minifiedJson: {
      tokens: baselineTokens,
      tokenReduction: 0,
      tokenPercentage: '0.00'
    },
    toon: toonStats,
    csv: csvStats
  }
}
// Vocabularies shared by the synthetic datasets of Tests 2, 3 and 4.
const FIXTURES = Object.freeze({
  emails: ['john.doe', 'jane.smith', 'bob.wilson', 'alice.johnson', 'charlie.brown', 'david.miller', 'emma.davis', 'frank.garcia', 'grace.martinez', 'henry.rodriguez'],
  products: ['Premium Subscription Plan', 'Enterprise Business Solution', 'Professional Developer Tools', 'Advanced Analytics Dashboard', 'Cloud Storage Premium', 'Security Monitoring Service', 'Customer Support Platform', 'Marketing Automation Suite', 'Project Management Software', 'Collaboration Workspace'],
  descriptions: [
    'Monthly subscription with unlimited access to all features',
    'Complete enterprise solution with dedicated support and custom integrations',
    'Professional-grade development tools for modern software teams',
    'Advanced analytics with real-time reporting and custom dashboards',
    'Secure cloud storage with automatic backup and version control',
    'Comprehensive security monitoring with threat detection and alerts',
    'Multi-channel customer support platform with AI-powered automation',
    'Full-featured marketing automation with email campaigns and analytics',
    'Collaborative project management with task tracking and team coordination',
    'All-in-one workspace for remote teams with video conferencing and file sharing'
  ],
  categories: ['Software as a Service', 'Enterprise Solutions', 'Developer Tools', 'Business Intelligence', 'Cloud Services', 'Security', 'Customer Service', 'Marketing', 'Productivity', 'Communication'],
  regions: ['North America', 'Europe', 'Asia Pacific', 'South America', 'Middle East', 'Africa', 'Australia'],
  paymentMethods: ['credit_card', 'bank_transfer', 'paypal', 'stripe', 'invoice', 'wire_transfer', 'crypto'],
  billingCycles: ['monthly', 'annual', 'quarterly', 'biannual', 'weekly']
})

// Round to two decimal places.
const round2 = value => Math.round(value * 100) / 100

// Cycle through `list`, wrapping around when `index` exceeds its length.
const pick = (list, index) => list[index % list.length]

// ISO timestamp in 2024, built in UTC so the generated files do not depend on
// the time zone of the machine running the benchmark. `monthIndex` is 0-based.
const isoDate2024 = (monthIndex, day, hour = 0, minute = 0) =>
  new Date(Date.UTC(2024, monthIndex, day, hour, minute)).toISOString()

// TEST 1: Key-Value Object (flat object with 1000 key-value pairs)
function buildKeyValueData() {
  const keys = ['userId', 'productName', 'price', 'status', 'description', 'category', 'region', 'paymentMethod', 'billingCycle', 'customerCount', 'revenue', 'churnRate', 'retentionRate', 'lifetimeValue', 'acquisitionCost', 'supportTickets', 'avgResponseTime', 'satisfactionScore', 'featureUsage', 'createdDate']
  const values = [
    // Must contain a digit run: it is replaced with the entry number below.
    'john.doe1@example.com',
    'Premium Subscription Plan v1',
    33.69,
    true,
    'Monthly subscription with unlimited access to all features',
    'Software as a Service',
    'North America',
    'credit_card',
    'monthly',
    1123,
    5456.78,
    0.53,
    85.0,
    1589.12,
    57.34,
    12,
    2.15,
    4.1,
    45.0,
    '2024-01-01T00:00:00.000Z'
  ]
  const data = {}
  for (let i = 0; i < 1000; i++) {
    const key = `${keys[i % keys.length]}_${i + 1}`
    const template = values[i % values.length]
    if (typeof template === 'string' && template.includes('@')) {
      // E-mail address: swap the first digit run for the 1-based entry number.
      data[key] = template.replace(/\d+/, String(i + 1))
    } else if (typeof template === 'number') {
      // Numbers drift by 0.1 per entry; the sum is intentionally left unrounded.
      data[key] = template + (i * 0.1)
    } else {
      data[key] = template
    }
  }
  return data
}

// TEST 2: Uniform Arrays of Objects (1000 records sharing one schema)
function buildUniformArrayData() {
  const { emails, products, descriptions, categories, regions, paymentMethods, billingCycles } = FIXTURES
  return Array.from({ length: 1000 }, (_, i) => {
    const n = i + 1
    return {
      userId: `${pick(emails, i)}${n}@example.com`,
      productName: `${pick(products, i)} v${n}`,
      price: round2(19.99 + (n * 13.7)),
      statusActive: n % 3 !== 0,
      description: pick(descriptions, i),
      createdDate: isoDate2024(n % 12, (n % 28) + 1, n % 24, n % 60),
      category: pick(categories, i),
      region: pick(regions, i),
      paymentMethod: pick(paymentMethods, i),
      billingCycle: pick(billingCycles, i),
      customerCount: 1000 + (n * 123),
      revenue: round2(5000 + (n * 456.78)),
      churnRate: round2(0.5 + (n * 0.03)),
      retentionRate: round2(85 + (n % 15)),
      lifetimeValue: round2(1500 + (n * 89.12)),
      acquisitionCost: round2(50 + (n * 7.34)),
      supportTickets: Math.floor(10 + (n * 2.3)),
      avgResponseTime: round2(2 + (n * 0.15)),
      satisfactionScore: round2(4.1 + ((n % 9) * 0.1)),
      featureUsage: round2(45 + (n % 55))
    }
  })
}

// TEST 3: Complex Object (nested objects, arrays, mixed structures)
function buildComplexData() {
  const { emails, products, descriptions, categories, regions, billingCycles } = FIXTURES
  const complexData = {
    metadata: {
      version: '1.0.0',
      generatedAt: new Date().toISOString(),
      source: 'test-suite',
      environment: 'production'
    },
    users: [],
    products: {},
    analytics: {
      summary: {
        totalUsers: 0,
        totalRevenue: 0,
        averageSatisfaction: 0
      },
      breakdown: {
        byRegion: {},
        byCategory: {},
        byPaymentMethod: {}
      },
      trends: {
        monthly: [],
        quarterly: [],
        yearly: []
      }
    },
    settings: {
      features: {
        enabled: ['feature1', 'feature2', 'feature3'],
        disabled: ['feature4', 'feature5'],
        experimental: []
      },
      limits: {
        maxUsers: 10000,
        maxStorage: '1TB',
        rateLimit: 1000
      },
      integrations: {
        active: ['stripe', 'sendgrid', 'analytics'],
        pending: ['slack', 'github'],
        failed: []
      }
    }
  }
  const { byRegion } = complexData.analytics.breakdown

  // Populate complex structure
  for (let i = 0; i < 100; i++) {
    const handle = pick(emails, i)
    const [firstName, lastName] = handle.split('.')
    const region = pick(regions, i)
    const productId = `prod_${i + 1}`
    const dayOfMonth = (i % 28) + 1

    complexData.users.push({
      id: `user_${i + 1}`,
      email: `${handle}${i + 1}@example.com`,
      profile: {
        name: `${firstName} ${lastName}`,
        age: 25 + (i % 40),
        location: region,
        preferences: {
          theme: i % 2 === 0 ? 'dark' : 'light',
          notifications: i % 3 !== 0,
          language: ['en', 'es', 'fr', 'de'][i % 4]
        }
      },
      subscriptions: [
        {
          productId,
          plan: pick(billingCycles, i),
          status: i % 3 !== 0 ? 'active' : 'cancelled',
          startDate: isoDate2024(i % 12, dayOfMonth)
        }
      ],
      metrics: {
        lifetimeValue: round2(1500 + (i * 89.12)),
        totalOrders: Math.floor(10 + (i * 2.3)),
        lastActive: isoDate2024(i % 12, dayOfMonth)
      }
    })

    complexData.products[productId] = {
      name: pick(products, i),
      category: pick(categories, i),
      pricing: {
        base: round2(19.99 + (i * 13.7)),
        currency: 'USD',
        discounts: i % 5 === 0 ? [{ type: 'early_bird', value: 0.1 }] : []
      },
      features: pick(descriptions, i).split(' ').slice(0, 5),
      availability: {
        regions: [region],
        inStock: i % 3 !== 0
      }
    }

    if (!byRegion[region]) {
      byRegion[region] = { count: 0, revenue: 0 }
    }
    byRegion[region].count++
    byRegion[region].revenue += round2(5000 + (i * 456.78))

    // One trend entry per 12 users. The label advances with each entry
    // (2024-01, 2024-02, ...) instead of repeating 2024-01 for all of them.
    if (i % 12 === 0) {
      const monthNumber = ((i / 12) % 12) + 1
      complexData.analytics.trends.monthly.push({
        month: `2024-${String(monthNumber).padStart(2, '0')}`,
        users: 100 + (i * 10),
        revenue: round2(50000 + (i * 4567.8))
      })
    }
  }

  complexData.analytics.summary.totalUsers = complexData.users.length
  complexData.analytics.summary.totalRevenue = Object.values(byRegion)
    .reduce((sum, r) => sum + r.revenue, 0)
  complexData.analytics.summary.averageSatisfaction = 4.2
  return complexData
}

// TEST 4: Original Flat Key-Value Object (where TOON lost):
// 100 entries for each of 20 field types, all on one flat object.
function buildOriginalFlatData() {
  const { emails, products, descriptions, categories, regions, paymentMethods, billingCycles } = FIXTURES
  const data = {}
  for (let i = 1; i <= 100; i++) {
    data[`user_id_${i}`] = `${pick(emails, i)}${i}@example.com`
    data[`product_name_${i}`] = `${pick(products, i)} v${i}`
    data[`price_${i}`] = round2(19.99 + (i * 13.7))
    data[`status_active_${i}`] = i % 3 !== 0
    data[`description_${i}`] = pick(descriptions, i)
    data[`created_date_${i}`] = isoDate2024(i % 12, (i % 28) + 1, i % 24, i % 60)
    data[`category_${i}`] = pick(categories, i)
    data[`region_${i}`] = pick(regions, i)
    data[`payment_method_${i}`] = pick(paymentMethods, i)
    data[`billing_cycle_${i}`] = pick(billingCycles, i)
    data[`customer_count_${i}`] = 1000 + (i * 123)
    data[`revenue_${i}`] = round2(5000 + (i * 456.78))
    data[`churn_rate_${i}`] = round2(0.5 + (i * 0.03))
    data[`retention_rate_${i}`] = round2(85 + (i % 15))
    data[`lifetime_value_${i}`] = round2(1500 + (i * 89.12))
    data[`acquisition_cost_${i}`] = round2(50 + (i * 7.34))
    data[`support_tickets_${i}`] = Math.floor(10 + (i * 2.3))
    data[`avg_response_time_${i}`] = round2(2 + (i * 0.15))
    data[`satisfaction_score_${i}`] = round2(4.1 + ((i % 9) * 0.1))
    data[`feature_usage_${i}`] = round2(45 + (i % 55))
  }
  return data
}

// TEST 5: Deep Tree Structure (50 independent branches, each 7 levels deep)
function buildDeepTreeData() {
  const depth = 7
  const branches = 50
  const tree = { root: {} }
  for (let i = 0; i < branches; i++) {
    let current = tree.root
    for (let level = 1; level <= depth; level++) {
      // Keys embed the branch index, so every branch gets its own chain of
      // nodes; only the deepest level carries a payload.
      const node = level === depth
        ? {
            value: `node_${i}`,
            metrics: {
              weight: round2(i * level * 1.37),
              active: (i + level) % 3 !== 0,
              updatedAt: isoDate2024((i + level) % 12, ((i + level) % 28) + 1)
            },
            descendants: Array.from({ length: 3 }, (_, j) => ({
              id: `leaf_${i}_${j}`,
              score: round2((i + j) * 2.71)
            }))
          }
        : {}
      current[`level${level}_${i}`] = node
      current = node
    }
  }
  return tree
}

// TEST 6: Sparse Arrays (500 slots: mostly null, some numbers, a few objects)
function buildSparseArrayData() {
  return {
    items: Array.from({ length: 500 }, (_, idx) => {
      if (idx % 25 === 0) {
        return {
          id: `item_${idx}`,
          status: idx % 50 === 0 ? 'active' : 'inactive',
          notes: idx % 100 === 0 ? 'milestone reached' : null,
          metrics: {
            impressions: idx * 123,
            clicks: round2(idx * 4.56)
          }
        }
      }
      return idx % 5 === 0 ? idx : null
    })
  }
}

// TEST 7: Heterogeneous Arrays (200 items cycling through six value shapes)
function buildHeterogeneousData() {
  const sequence = []
  for (let i = 0; i < 200; i++) {
    switch (i % 6) {
      case 0:
        sequence.push(i * 3.14)
        break
      case 1:
        sequence.push(`value_${i}`)
        break
      case 2:
        sequence.push({
          id: `obj_${i}`,
          flags: { a: i % 2 === 0, b: i % 3 === 0 },
          tags: [`tag${i % 5}`, `group${i % 7}`]
        })
        break
      case 3:
        sequence.push([
          `nested_${i}`,
          round2(i * 1.11),
          { deep: { ref: `deep_${i}` } }
        ])
        break
      case 4:
        sequence.push(i % 4 === 0)
        break
      default:
        sequence.push({
          type: 'event',
          payload: {
            timestamp: isoDate2024(i % 12, (i % 28) + 1, i % 24, i % 60),
            message: `Event number ${i}`,
            severity: ['info', 'warn', 'error'][i % 3]
          }
        })
        break
    }
  }
  return { sequence }
}

// Benchmark cases in execution order: output folder name, display label, and
// the builder that produces the dataset.
const BENCHMARK_TESTS = [
  { name: 'test1-key-value-object', label: 'Key-Value Object', build: buildKeyValueData },
  { name: 'test2-uniform-arrays', label: 'Uniform Arrays of Objects', build: buildUniformArrayData },
  { name: 'test3-complex-object', label: 'Complex Object', build: buildComplexData },
  { name: 'test4-original-flat-object', label: 'Original Flat Key-Value Object', build: buildOriginalFlatData },
  { name: 'test5-deep-tree', label: 'Deep Tree Structure', build: buildDeepTreeData },
  { name: 'test6-sparse-arrays', label: 'Sparse Arrays', build: buildSparseArrayData },
  { name: 'test7-heterogeneous-array', label: 'Heterogeneous Arrays', build: buildHeterogeneousData }
]

// Log a concise summary for each result: the format with the fewest tokens
// (ties go to JSON, then TOON, then CSV) followed by the three token counts.
function printSummary(results) {
  console.log('\nTests:\n')
  results.forEach((result, index) => {
    const position = BENCHMARK_TESTS.findIndex(test => test.name === result.testName)
    const challengeName = position === -1
      ? result.testName
      : `Test ${position + 1}: ${BENCHMARK_TESTS[position].label}`
    const { minifiedJson: json, toon, csv } = result
    const ranking = [
      { name: 'JSON', tokens: json.tokens },
      { name: 'TOON', tokens: toon.tokens },
      { name: 'CSV', tokens: csv.tokens }
    ].sort((a, b) => a.tokens - b.tokens)
    console.log(`${index + 1}. ${challengeName}`)
    console.log(` 🏆 TOKENS Winner: ${ranking[0].name}`)
    console.log(` JSON: ${json.tokens.toLocaleString()} tokens`)
    console.log(` TOON: ${toon.tokens.toLocaleString()} tokens`)
    console.log(` CSV: ${csv.tokens.toLocaleString()} tokens`)
    console.log()
  })
}

/**
 * Run every benchmark case and print the token-count summary.
 *
 * Creates `<cwd>/output`, builds each dataset in turn, passes it to
 * `compareFormats` together with the cl100k_base encoder, and finally logs
 * the winner and token counts per case.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const encoder = await createByEncoderName('cl100k_base')
  const outputDir = join(process.cwd(), 'output')
  mkdirSync(outputDir, { recursive: true })

  const results = []
  for (const [index, test] of BENCHMARK_TESTS.entries()) {
    console.log(`Running Test ${index + 1}: ${test.label}...`)
    results.push(await compareFormats(test.build(), test.name, encoder, outputDir))
    console.log(`✓ Test ${index + 1} complete`)
  }

  printSummary(results)
}
// Run the benchmark. On failure, log the error and mark the process as failed
// so shells and CI see a non-zero exit status instead of a silent success.
main().catch(error => {
  console.error(error)
  process.exitCode = 1
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment