Skip to content

Instantly share code, notes, and snippets.

@bgauryy
Created October 27, 2025 14:56
Show Gist options
  • Select an option

  • Save bgauryy/6e32b34755d66f8f65e6482b2be987e3 to your computer and use it in GitHub Desktop.

Select an option

Save bgauryy/6e32b34755d66f8f65e6482b2be987e3 to your computer and use it in GitHub Desktop.
TOON vs MINIFIED JSON TOKENIZATION - regular object
import { encode as toonEncoder } from '@byjohann/toon'
import { createByEncoderName } from '@microsoft/tiktokenizer'
import { writeFileSync, mkdirSync } from 'fs'
import { join } from 'path'
/**
 * Builds the flat benchmark object: for every index 1..entryCount it adds one
 * key per field type (20 keys per index), e.g. `user_id_1`, `price_1`, ...
 *
 * Values are derived only from the index, so the same entryCount always
 * yields the same object. Dates are built with Date.UTC so the ISO strings
 * (and therefore the measured token/byte counts) do not depend on the time
 * zone of the machine running the benchmark.
 *
 * @param {number} [entryCount=100] Number of numbered entries per field type.
 * @returns {Record<string, string | number | boolean>} The generated object.
 */
function buildDataset(entryCount = 100) {
  const emails = ['john.doe', 'jane.smith', 'bob.wilson', 'alice.johnson', 'charlie.brown', 'david.miller', 'emma.davis', 'frank.garcia', 'grace.martinez', 'henry.rodriguez'];
  const products = ['Premium Subscription Plan', 'Enterprise Business Solution', 'Professional Developer Tools', 'Advanced Analytics Dashboard', 'Cloud Storage Premium', 'Security Monitoring Service', 'Customer Support Platform', 'Marketing Automation Suite', 'Project Management Software', 'Collaboration Workspace'];
  const descriptions = [
    'Monthly subscription with unlimited access to all features',
    'Complete enterprise solution with dedicated support and custom integrations',
    'Professional-grade development tools for modern software teams',
    'Advanced analytics with real-time reporting and custom dashboards',
    'Secure cloud storage with automatic backup and version control',
    'Comprehensive security monitoring with threat detection and alerts',
    'Multi-channel customer support platform with AI-powered automation',
    'Full-featured marketing automation with email campaigns and analytics',
    'Collaborative project management with task tracking and team coordination',
    'All-in-one workspace for remote teams with video conferencing and file sharing',
  ];
  const categories = ['Software as a Service', 'Enterprise Solutions', 'Developer Tools', 'Business Intelligence', 'Cloud Services', 'Security', 'Customer Service', 'Marketing', 'Productivity', 'Communication'];
  const regions = ['North America', 'Europe', 'Asia Pacific', 'South America', 'Middle East', 'Africa', 'Australia'];
  const paymentMethods = ['credit_card', 'bank_transfer', 'paypal', 'stripe', 'invoice', 'wire_transfer', 'crypto'];
  const billingCycles = ['monthly', 'annual', 'quarterly', 'biannual', 'weekly'];

  const data = {};
  for (let i = 1; i <= entryCount; i++) {
    data[`user_id_${i}`] = `${emails[i % emails.length]}${i}@example.com`;
    data[`product_name_${i}`] = `${products[i % products.length]} v${i}`;
    data[`price_${i}`] = Math.round((19.99 + (i * 13.7)) * 100) / 100;
    data[`status_active_${i}`] = i % 3 !== 0;
    data[`description_${i}`] = descriptions[i % descriptions.length];
    // UTC, not local time: keeps the serialized value identical on every machine.
    data[`created_date_${i}`] = new Date(Date.UTC(2024, i % 12, (i % 28) + 1, i % 24, i % 60)).toISOString();
    data[`category_${i}`] = categories[i % categories.length];
    data[`region_${i}`] = regions[i % regions.length];
    data[`payment_method_${i}`] = paymentMethods[i % paymentMethods.length];
    data[`billing_cycle_${i}`] = billingCycles[i % billingCycles.length];
    data[`customer_count_${i}`] = 1000 + (i * 123);
    data[`revenue_${i}`] = Math.round((5000 + (i * 456.78)) * 100) / 100;
    data[`churn_rate_${i}`] = Math.round((0.5 + (i * 0.03)) * 100) / 100;
    data[`retention_rate_${i}`] = Math.round((85 + (i % 15)) * 100) / 100;
    data[`lifetime_value_${i}`] = Math.round((1500 + (i * 89.12)) * 100) / 100;
    data[`acquisition_cost_${i}`] = Math.round((50 + (i * 7.34)) * 100) / 100;
    data[`support_tickets_${i}`] = Math.floor(10 + (i * 2.3));
    data[`avg_response_time_${i}`] = Math.round((2 + (i * 0.15)) * 100) / 100;
    data[`satisfaction_score_${i}`] = Math.round((4.1 + ((i % 9) * 0.1)) * 100) / 100;
    data[`feature_usage_${i}`] = Math.round((45 + (i % 55)) * 100) / 100;
  }
  return data;
}

/**
 * Formats `difference` as a percentage of `baseline` with two decimals.
 * Returns '0.00' for a zero baseline instead of 'NaN' / 'Infinity'.
 *
 * @param {number} difference Baseline size minus compared size.
 * @param {number} baseline Size the difference is relative to.
 * @returns {string} Percentage with two fraction digits, without the '%' sign.
 */
function formatPercentage(difference, baseline) {
  if (baseline === 0) {
    return '0.00';
  }
  return ((difference / baseline) * 100).toFixed(2);
}

/**
 * Builds one SUMMARY line of the report for a single metric.
 *
 * @param {number} difference Minified-JSON size minus Toon size (positive = Toon is smaller).
 * @param {string} percentage Output of formatPercentage for the same difference.
 * @param {string} metric Metric name used in the sentence, e.g. 'token count'.
 * @param {string} unit Unit name used in the sentence, e.g. 'tokens'.
 * @returns {string} The summary sentence.
 */
function summarizeChange(difference, percentage, metric, unit) {
  // An exact tie is neither a reduction nor an increase.
  if (difference === 0) {
    return `= The Toon encoder left ${metric} unchanged (0 ${unit}, 0%)`;
  }
  const amount = Math.abs(difference);
  const percent = Math.abs(Number.parseFloat(percentage));
  return difference > 0
    ? `✓ The Toon encoder REDUCED ${metric} by ${amount} ${unit} (${percent}%)`
    : `✗ The Toon encoder INCREASED ${metric} by ${amount} ${unit} (${percent}%)`;
}

/**
 * Describes how the Toon output compares to minified JSON for one unit,
 * so the report's explanation always agrees with the measured numbers.
 *
 * @param {number} difference Minified-JSON size minus Toon size.
 * @param {string} unit Unit name, e.g. 'tokens'.
 * @returns {string} Phrase such as 'fewer tokens than minified JSON'.
 */
function describeRelativeSize(difference, unit) {
  if (difference > 0) {
    return `fewer ${unit} than minified JSON`;
  }
  if (difference < 0) {
    return `more ${unit} than minified JSON`;
  }
  return `the same number of ${unit} as minified JSON`;
}

/**
 * Benchmarks minified JSON against the Toon encoding of a generated flat object.
 *
 * Steps: build the dataset, count tokens of both serializations with the
 * 'cl100k_base' encoder, write the pretty JSON, minified JSON and Toon output
 * into `<cwd>/output`, then write and print a comparison report.
 *
 * @returns {Promise<void>} Resolves once all files are written and the report is printed.
 */
async function main() {
  const data = buildDataset();

  // Count tokens using tiktokenizer
  const encoder = await createByEncoderName('cl100k_base');

  // Minified JSON is the baseline for both token and byte comparisons.
  const dataJson = JSON.stringify(data);
  const dataTokenCount = encoder.encode(dataJson).length;

  // Encode the same object with Toon, using a tab as the delimiter.
  const toonEncodedData = toonEncoder(data, { delimiter: '\t' });
  const toonEncodedTokenCount = encoder.encode(toonEncodedData).length;

  // Positive values mean the Toon output is smaller than minified JSON.
  const sizeReduction = dataTokenCount - toonEncodedTokenCount;
  const percentageReduction = formatPercentage(sizeReduction, dataTokenCount);

  // Create output directory if it doesn't exist
  const outputDir = join(process.cwd(), 'output');
  mkdirSync(outputDir, { recursive: true });

  // Save original JSON to file (formatted for readability)
  const originalJsonPath = join(outputDir, 'original-data.json');
  writeFileSync(originalJsonPath, JSON.stringify(data, null, 2), 'utf8');
  console.log(`✓ Original JSON saved to: ${originalJsonPath}`);

  // Save original JSON (minified) to file
  const originalJsonMinPath = join(outputDir, 'original-data-minified.json');
  writeFileSync(originalJsonMinPath, dataJson, 'utf8');
  console.log(`✓ Original JSON (minified) saved to: ${originalJsonMinPath}`);

  // Save toonEncoder output to file
  const encodedDataPath = join(outputDir, 'toon-encoded-data.txt');
  writeFileSync(encodedDataPath, toonEncodedData, 'utf8');
  console.log(`✓ Toon encoded data saved to: ${encodedDataPath}`);

  // Byte sizes of the UTF-8 serializations (measured without copying them).
  const minifiedJsonSize = Buffer.byteLength(dataJson, 'utf8');
  const toonEncodedSize = Buffer.byteLength(toonEncodedData, 'utf8');
  const byteDifference = minifiedJsonSize - toonEncodedSize;
  const bytePercentage = formatPercentage(byteDifference, minifiedJsonSize);

  const separator = '━'.repeat(50);
  // The trailing empty entry makes the report end with a newline.
  const report = [
    'Token Encoding Comparison Report',
    '================================',
    'FILES GENERATED:',
    '1. original-data.json (formatted, 2-space indentation)',
    '2. original-data-minified.json (no whitespace)',
    '3. toon-encoded-data.txt (Toon encoder output)',
    separator,
    'ORIGINAL JSON DATA (Minified):',
    `- Tokens: ${dataTokenCount.toLocaleString()}`,
    `- Bytes: ${minifiedJsonSize.toLocaleString()}`,
    '- File: original-data-minified.json',
    'TOON ENCODED DATA:',
    `- Tokens: ${toonEncodedTokenCount.toLocaleString()}`,
    `- Bytes: ${toonEncodedSize.toLocaleString()}`,
    '- File: toon-encoded-data.txt',
    separator,
    'COMPARISON (Minified JSON vs Toon Encoded):',
    'Token Comparison:',
    `- Minified JSON: ${dataTokenCount.toLocaleString()} tokens`,
    `- Toon Encoded: ${toonEncodedTokenCount.toLocaleString()} tokens`,
    `- Difference: ${sizeReduction.toLocaleString()} tokens`,
    `- Change: ${percentageReduction}%`,
    'Byte Comparison:',
    `- Minified JSON: ${minifiedJsonSize.toLocaleString()} bytes`,
    `- Toon Encoded: ${toonEncodedSize.toLocaleString()} bytes`,
    `- Difference: ${byteDifference.toLocaleString()} bytes`,
    `- Change: ${bytePercentage}%`,
    separator,
    'SUMMARY:',
    summarizeChange(sizeReduction, percentageReduction, 'token count', 'tokens'),
    summarizeChange(byteDifference, bytePercentage, 'byte size', 'bytes'),
    'EXPLANATION:',
    'The Toon encoder converts JSON to a YAML-like format with:',
    '- Structured indentation for readability',
    '- Array length annotations (e.g., "users[9]:")',
    '- Compressed syntax for arrays and objects',
    // Derived from the measurements so it can never contradict the SUMMARY.
    `Measured on this dataset, the Toon output uses ${describeRelativeSize(sizeReduction, 'tokens')}`,
    `and ${describeRelativeSize(byteDifference, 'bytes')}.`,
    '',
  ].join('\n');

  const reportPath = join(outputDir, 'comparison-report.txt');
  writeFileSync(reportPath, report, 'utf8');
  console.log(`✓ Comparison report saved to: ${reportPath}`);
  console.log(`\n${report}`);
}
// Run the benchmark. On failure, log the error and set a non-zero exit code
// so shells and CI can tell the run did not complete (logging alone would
// still exit with status 0).
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment