Created
October 27, 2025 14:56
-
-
Save bgauryy/6e32b34755d66f8f65e6482b2be987e3 to your computer and use it in GitHub Desktop.
TOON vs MINIFIED JSON TOKENIZATION - regular object
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { encode as toonEncoder } from '@byjohann/toon' | |
| import { createByEncoderName } from '@microsoft/tiktokenizer' | |
| import { writeFileSync, mkdirSync } from 'fs' | |
| import { join } from 'path' | |
/** Locale pinned for number formatting so the report is identical on every machine. */
const REPORT_LOCALE = 'en-US';

/** Horizontal rule used between report sections. */
const REPORT_RULE = '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━';

/**
 * Builds the flat benchmark object: 20 keys per entry, suffixed `_1` … `_entryCount`.
 *
 * @param {number} [entryCount=100] - Number of entries to generate.
 * @returns {Record<string, string | number | boolean>} The generated flat object.
 */
function buildDataset(entryCount = 100) {
  const emails = ['john.doe', 'jane.smith', 'bob.wilson', 'alice.johnson', 'charlie.brown', 'david.miller', 'emma.davis', 'frank.garcia', 'grace.martinez', 'henry.rodriguez'];
  const products = ['Premium Subscription Plan', 'Enterprise Business Solution', 'Professional Developer Tools', 'Advanced Analytics Dashboard', 'Cloud Storage Premium', 'Security Monitoring Service', 'Customer Support Platform', 'Marketing Automation Suite', 'Project Management Software', 'Collaboration Workspace'];
  const descriptions = [
    'Monthly subscription with unlimited access to all features',
    'Complete enterprise solution with dedicated support and custom integrations',
    'Professional-grade development tools for modern software teams',
    'Advanced analytics with real-time reporting and custom dashboards',
    'Secure cloud storage with automatic backup and version control',
    'Comprehensive security monitoring with threat detection and alerts',
    'Multi-channel customer support platform with AI-powered automation',
    'Full-featured marketing automation with email campaigns and analytics',
    'Collaborative project management with task tracking and team coordination',
    'All-in-one workspace for remote teams with video conferencing and file sharing',
  ];
  const categories = ['Software as a Service', 'Enterprise Solutions', 'Developer Tools', 'Business Intelligence', 'Cloud Services', 'Security', 'Customer Service', 'Marketing', 'Productivity', 'Communication'];
  const regions = ['North America', 'Europe', 'Asia Pacific', 'South America', 'Middle East', 'Africa', 'Australia'];
  const paymentMethods = ['credit_card', 'bank_transfer', 'paypal', 'stripe', 'invoice', 'wire_transfer', 'crypto'];
  const billingCycles = ['monthly', 'annual', 'quarterly', 'biannual', 'weekly'];

  // Cycle through a fixed list so every entry gets a deterministic value.
  const pick = (list, i) => list[i % list.length];
  // Round to two decimal places.
  const round2 = (value) => Math.round(value * 100) / 100;

  const data = {};
  for (let i = 1; i <= entryCount; i++) {
    data[`user_id_${i}`] = `${pick(emails, i)}${i}@example.com`;
    data[`product_name_${i}`] = `${pick(products, i)} v${i}`;
    data[`price_${i}`] = round2(19.99 + (i * 13.7));
    data[`status_active_${i}`] = i % 3 !== 0;
    data[`description_${i}`] = pick(descriptions, i);
    // Date.UTC (month is 0-indexed) keeps the timestamps identical regardless of
    // the machine's time zone; a local-time Date would shift the ISO string.
    data[`created_date_${i}`] = new Date(Date.UTC(2024, i % 12, (i % 28) + 1, i % 24, i % 60)).toISOString();
    data[`category_${i}`] = pick(categories, i);
    data[`region_${i}`] = pick(regions, i);
    data[`payment_method_${i}`] = pick(paymentMethods, i);
    data[`billing_cycle_${i}`] = pick(billingCycles, i);
    data[`customer_count_${i}`] = 1000 + (i * 123);
    data[`revenue_${i}`] = round2(5000 + (i * 456.78));
    data[`churn_rate_${i}`] = round2(0.5 + (i * 0.03));
    data[`retention_rate_${i}`] = round2(85 + (i % 15));
    data[`lifetime_value_${i}`] = round2(1500 + (i * 89.12));
    data[`acquisition_cost_${i}`] = round2(50 + (i * 7.34));
    data[`support_tickets_${i}`] = Math.floor(10 + (i * 2.3));
    data[`avg_response_time_${i}`] = round2(2 + (i * 0.15));
    data[`satisfaction_score_${i}`] = round2(4.1 + ((i % 9) * 0.1));
    data[`feature_usage_${i}`] = round2(45 + (i % 55));
  }
  return data;
}

/**
 * Expresses `delta` as a percentage of `base`, fixed to two decimals.
 *
 * @param {number} delta - Baseline value minus compared value.
 * @param {number} base - Baseline value.
 * @returns {string} Percentage text such as "12.50"; "0.00" when `base` is 0.
 */
function percentChange(delta, base) {
  return base === 0 ? '0.00' : ((delta / base) * 100).toFixed(2);
}

/**
 * Renders one SUMMARY line, distinguishing reduced / increased / unchanged.
 *
 * @param {string} subject - What was measured, e.g. "token count".
 * @param {string} unit - Unit label, e.g. "tokens".
 * @param {number} delta - Minified JSON value minus Toon value (positive = Toon is smaller).
 * @param {string} percent - Output of `percentChange` for the same delta.
 * @returns {string} The summary sentence.
 */
function summarizeChange(subject, unit, delta, percent) {
  const amount = Math.abs(delta);
  const share = Math.abs(Number.parseFloat(percent));
  if (delta > 0) {
    return `✓ The Toon encoder REDUCED ${subject} by ${amount} ${unit} (${share}%)`;
  }
  if (delta < 0) {
    return `✗ The Toon encoder INCREASED ${subject} by ${amount} ${unit} (${share}%)`;
  }
  return `= The Toon encoder left ${subject} UNCHANGED (0 ${unit}, 0%)`;
}

/**
 * Word describing how the Toon output compares with minified JSON.
 *
 * @param {number} delta - Minified JSON value minus Toon value.
 * @returns {string} "fewer", "more" or "the same number of".
 */
function comparisonWord(delta) {
  if (delta > 0) return 'fewer';
  if (delta < 0) return 'more';
  return 'the same number of';
}

/**
 * Builds the plain-text comparison report from the measured counts.
 * The closing explanation is derived from the measurements instead of
 * asserting a fixed outcome.
 *
 * @param {object} stats
 * @param {number} stats.jsonTokens - Token count of the minified JSON.
 * @param {number} stats.toonTokens - Token count of the Toon output.
 * @param {number} stats.jsonBytes - UTF-8 byte length of the minified JSON.
 * @param {number} stats.toonBytes - UTF-8 byte length of the Toon output.
 * @returns {string} The report text, terminated by a newline.
 */
function buildReport({ jsonTokens, toonTokens, jsonBytes, toonBytes }) {
  const fmt = (value) => value.toLocaleString(REPORT_LOCALE);
  const tokenDelta = jsonTokens - toonTokens;
  const byteDelta = jsonBytes - toonBytes;
  const tokenPercent = percentChange(tokenDelta, jsonTokens);
  const bytePercent = percentChange(byteDelta, jsonBytes);

  return [
    'Token Encoding Comparison Report',
    '================================',
    'FILES GENERATED:',
    '1. original-data.json (formatted, 2-space indentation)',
    '2. original-data-minified.json (no whitespace)',
    '3. toon-encoded-data.txt (Toon encoder output)',
    REPORT_RULE,
    'ORIGINAL JSON DATA (Minified):',
    `- Tokens: ${fmt(jsonTokens)}`,
    `- Bytes: ${fmt(jsonBytes)}`,
    '- File: original-data-minified.json',
    'TOON ENCODED DATA:',
    `- Tokens: ${fmt(toonTokens)}`,
    `- Bytes: ${fmt(toonBytes)}`,
    '- File: toon-encoded-data.txt',
    REPORT_RULE,
    'COMPARISON (Minified JSON vs Toon Encoded):',
    'Token Comparison:',
    `- Minified JSON: ${fmt(jsonTokens)} tokens`,
    `- Toon Encoded: ${fmt(toonTokens)} tokens`,
    `- Difference: ${fmt(tokenDelta)} tokens`,
    `- Change: ${tokenPercent}%`,
    'Byte Comparison:',
    `- Minified JSON: ${fmt(jsonBytes)} bytes`,
    `- Toon Encoded: ${fmt(toonBytes)} bytes`,
    `- Difference: ${fmt(byteDelta)} bytes`,
    `- Change: ${bytePercent}%`,
    REPORT_RULE,
    'SUMMARY:',
    summarizeChange('token count', 'tokens', tokenDelta, tokenPercent),
    summarizeChange('byte size', 'bytes', byteDelta, bytePercent),
    'EXPLANATION:',
    'The Toon encoder converts JSON to a YAML-like format with:',
    '- Structured indentation for readability',
    '- Array length annotations (e.g., "users[9]:")',
    '- Compressed syntax for arrays and objects',
    `This format is more human-readable; in this run it used ${comparisonWord(tokenDelta)} tokens`,
    `and ${comparisonWord(byteDelta)} bytes than minified JSON.`,
    '',
  ].join('\n');
}

/**
 * Generates the flat benchmark object, encodes it as minified JSON and as
 * Toon (tab delimiter), counts tokens for both with the `cl100k_base`
 * encoder, and writes the data files plus a comparison report into
 * `./output` under the current working directory. The report is also
 * printed to stdout.
 *
 * @returns {Promise<void>} Resolves once all files are written.
 */
async function main() {
  const data = buildDataset();

  // Count tokens using tiktokenizer
  const encoder = await createByEncoderName('cl100k_base');

  const dataJson = JSON.stringify(data);
  const toonEncodedData = toonEncoder(data, { delimiter: '\t' });
  const dataTokenCount = encoder.encode(dataJson).length;
  const toonEncodedTokenCount = encoder.encode(toonEncodedData).length;

  // Create output directory if it doesn't exist
  const outputDir = join(process.cwd(), 'output');
  mkdirSync(outputDir, { recursive: true });

  // Save original JSON to file (formatted for readability)
  const originalJsonPath = join(outputDir, 'original-data.json');
  writeFileSync(originalJsonPath, JSON.stringify(data, null, 2), 'utf8');
  console.log(`✓ Original JSON saved to: ${originalJsonPath}`);

  // Save original JSON (minified) to file
  const originalJsonMinPath = join(outputDir, 'original-data-minified.json');
  writeFileSync(originalJsonMinPath, dataJson, 'utf8');
  console.log(`✓ Original JSON (minified) saved to: ${originalJsonMinPath}`);

  // Save toonEncoder output to file
  const encodedDataPath = join(outputDir, 'toon-encoded-data.txt');
  writeFileSync(encodedDataPath, toonEncodedData, 'utf8');
  console.log(`✓ Toon encoded data saved to: ${encodedDataPath}`);

  // Create and save comparison report; byteLength measures without copying.
  const report = buildReport({
    jsonTokens: dataTokenCount,
    toonTokens: toonEncodedTokenCount,
    jsonBytes: Buffer.byteLength(dataJson, 'utf8'),
    toonBytes: Buffer.byteLength(toonEncodedData, 'utf8'),
  });
  const reportPath = join(outputDir, 'comparison-report.txt');
  writeFileSync(reportPath, report, 'utf8');
  console.log(`✓ Comparison report saved to: ${reportPath}`);
  console.log(`\n${report}`);
}
// Run the benchmark; on failure log the error and signal it through a
// non-zero exit code so shells and CI do not treat a failed run as success.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment