| """ | |
| Total vocabulary size: 200,019 | |
| WHITESPACE TOKEN ANALYSIS | |
| Found 434 whitespace-only tokens | |
| Percentage of vocab: 0.217% | |
| SUMMARY BY WHITESPACE TYPE: | |
| Spaces only: 84 tokens | |
| Tabs only: 20 tokens | |
| Newlines only: 11 tokens | |
| Mixed: 319 tokens | |
| SPACE-ONLY TOKENS: | |
| Token ID 220: 1 space(s) - ' ' | |
| Token ID 256: 2 space(s) - ' ' | |
| Token ID 271: 3 space(s) - ' ' | |
| Token ID 257: 4 space(s) - ' ' | |
| Token ID 530: 5 space(s) - ' ' | |
| Token ID 1699: 6 space(s) - ' ' | |
| Token ID 309: 7 space(s) - ' ' | |
| Token ID 269: 8 space(s) - ' ' | |
| Token ID 983: 9 space(s) - ' ' | |
| Token ID 3550: 10 space(s) - ' ' | |
| Token ID 352: 11 space(s) - ' ' | |
| Token ID 3346: 12 space(s) - ' ' | |
| Token ID 1698: 13 space(s) - ' ' | |
| Token ID 4442: 14 space(s) - ' ' | |
| Token ID 506: 15 space(s) - ' ' | |
| Token ID 408: 16 space(s) - ' ' | |
| Token ID 2902: 17 space(s) - ' ' | |
| Token ID 8854: 18 space(s) - ' ' | |
| Token ID 699: 19 space(s) - ' ' | |
| Token ID 7692: 20 space(s) - ' ' | |
| Token ID 4451: 21 space(s) - ' ' | |
| Token ID 10682: 22 space(s) - ' ' | |
| Token ID 968: 23 space(s) - ' ' | |
| Token ID 10406: 24 space(s) - ' ' | |
| Token ID 6453: 25 space(s) - ' ' | |
| Token ID 12397: 26 space(s) - ' ' | |
| Token ID 1686: 27 space(s) - ' ' | |
| Token ID 13541: 28 space(s) - ' ' | |
| Token ID 8616: 29 space(s) - ' ' | |
| Token ID 15880: 30 space(s) - ' ' | |
| Token ID 2419: 31 space(s) - ' ' | |
| Token ID 1213: 32 space(s) - ' ' | |
| Token ID 12599: 33 space(s) - ' ' | |
| Token ID 21063: 34 space(s) - ' ' | |
| Token ID 3523: 35 space(s) - ' ' | |
| Token ID 22731: 36 space(s) - ' ' | |
| Token ID 16198: 37 space(s) - ' ' | |
| Token ID 24131: 38 space(s) - ' ' | |
| Token ID 4754: 39 space(s) - ' ' | |
| Token ID 27240: 40 space(s) - ' ' | |
| Token ID 20949: 41 space(s) - ' ' | |
| Token ID 31024: 42 space(s) - ' ' | |
| Token ID 6322: 43 space(s) - ' ' | |
| Token ID 32392: 44 space(s) - ' ' | |
| Token ID 26829: 45 space(s) - ' ' | |
| Token ID 38019: 46 space(s) - ' ' | |
| Token ID 8793: 47 space(s) - ' ' | |
| Token ID 41146: 48 space(s) - ' ' | |
| Token ID 35584: 49 space(s) - ' ' | |
| Token ID 48774: 50 space(s) - ' ' | |
| Token ID 12266: 51 space(s) - ' ' | |
| Token ID 54372: 52 space(s) - ' ' | |
| Token ID 43806: 53 space(s) - ' ' | |
| Token ID 57342: 54 space(s) - ' ' | |
| Token ID 16006: 55 space(s) - ' ' | |
| Token ID 63991: 56 space(s) - ' ' | |
| Token ID 56404: 57 space(s) - ' ' | |
| Token ID 72749: 58 space(s) - ' ' | |
| Token ID 23643: 59 space(s) - ' ' | |
| Token ID 83300: 60 space(s) - ' ' | |
| Token ID 66188: 61 space(s) - ' ' | |
| Token ID 84839: 62 space(s) - ' ' | |
| Token ID 30319: 63 space(s) - ' ' | |
| Token ID 9344: 64 space(s) - ' ' | |
| Token ID 82955: 65 space(s) - ' ' | |
| Token ID 110321: 66 space(s) - ' ' | |
| Token ID 38959: 67 space(s) - ' ' | |
| Token ID 122722: 68 space(s) - ' ' | |
| Token ID 94875: 69 space(s) - ' ' | |
| Token ID 113903: 70 space(s) - ' ' | |
| Token ID 46371: 71 space(s) - ' ' | |
| Token ID 88017: 72 space(s) - ' ' | |
| Token ID 102820: 73 space(s) - ' ' | |
| Token ID 97361: 74 space(s) - ' ' | |
| Token ID 30723: 75 space(s) - ' ' | |
| Token ID 136356: 76 space(s) - ' ' | |
| Token ID 133272: 77 space(s) - ' ' | |
| Token ID 187205: 78 space(s) - ' ' | |
| Token ID 82724: 79 space(s) - ' ' | |
| Token ID 116288: 83 space(s) - ' ' | |
| Token ID 134788: 87 space(s) - ' ' | |
| Token ID 174893: 91 space(s) - ' ' | |
| Token ID 195732: 95 space(s) - ' ' | |
| Token ID 72056: 128 space(s) - ' ' | |
| TAB-ONLY TOKENS: | |
| Token ID 197: 1 tab(s) - '\t' | |
| Token ID 335: 2 tab(s) - '\t\t' | |
| Token ID 833: 3 tab(s) - '\t\t\t' | |
| Token ID 626: 4 tab(s) - '\t\t\t\t' | |
| Token ID 1999: 5 tab(s) - '\t\t\t\t\t' | |
| Token ID 3083: 6 tab(s) - '\t\t\t\t\t\t' | |
| Token ID 5216: 7 tab(s) - '\t\t\t\t\t\t\t' | |
| Token ID 4011: 8 tab(s) - '\t\t\t\t\t\t\t\t' | |
| Token ID 10666: 9 tab(s) - '\t\t\t\t\t\t\t\t\t' | |
| Token ID 14973: 10 tab(s) - '\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 20581: 11 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 27926: 12 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 36630: 13 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 51646: 14 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 62364: 15 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 43876: 16 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 100800: 17 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 124716: 18 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 153867: 19 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 193506: 20 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| NEWLINE-ONLY TOKENS: | |
| Token ID 198: 1 newline(s) - '\n' | |
| Token ID 279: 2 newline(s) - '\n\n' | |
| Token ID 2499: 3 newline(s) - '\n\n\n' | |
| Token ID 4707: 4 newline(s) - '\n\n\n\n' | |
| Token ID 27559: 5 newline(s) - '\n\n\n\n\n' | |
| Token ID 37680: 6 newline(s) - '\n\n\n\n\n\n' | |
| Token ID 70224: 7 newline(s) - '\n\n\n\n\n\n\n' | |
| Token ID 21301: 8 newline(s) - '\n\n\n\n\n\n\n\n' | |
| Token ID 128841: 9 newline(s) - '\n\n\n\n\n\n\n\n\n' | |
| Token ID 160468: 10 newline(s) - '\n\n\n\n\n\n\n\n\n\n' | |
| Token ID 64469: 16 newline(s) - '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n' | |
| MIXED WHITESPACE TOKENS: | |
| Token ID 199: '\x0b' [U+000B] | |
| Token ID 200: '\x0c' [U+000C] | |
| Token ID 201: '\r' [CR] | |
| Token ID 216: '\x1c' [U+001C] | |
| Token ID 217: '\x1d' [U+001D] | |
| Token ID 218: '\x1e' [U+001E] | |
| Token ID 219: '\x1f' [U+001F] | |
| Token ID 1397: '\u3000' [U+3000] | |
| Token ID 5310: '\xa0' [U+00A0] | |
| Token ID 17985: '\u2002' [U+2002] | |
| Token ID 29106: '\u2009' [U+2009] | |
| Token ID 33203: '\u2003' [U+2003] | |
| Token ID 35971: '\u202f' [U+202F] | |
| Token ID 51008: '\u2028' [U+2028] | |
| Token ID 86741: '\u200a' [U+200A] | |
| Token ID 169653: '\u2005' [U+2005] | |
| Token ID 370: '\r\n' [CR+NL] | |
| Token ID 793: ' \n' [SP+NL] | |
| Token ID 2775: '\t\n' [TAB+NL] | |
| Token ID 4577: '\u3000\u3000' [U+3000+U+3000] | |
| ... and 299 more | |
| """ | |
import tiktoken


def is_whitespace_only(text):
    """Check if a string contains only whitespace characters."""
    return len(text) > 0 and text.isspace()


def analyze_whitespace_tokens():
    # Load the o200k_base encoding (the tokenizer used by GPT-4o-era models)
    enc = tiktoken.get_encoding("o200k_base")

    # Token IDs run from 0 to max_token_value, so the nominal vocab size is max + 1
    vocab_size = enc.max_token_value + 1
    print(f"Total vocabulary size: {vocab_size:,}")
    print()
    print("WHITESPACE TOKEN ANALYSIS")
    print()

    whitespace_tokens = []

    # Iterate through all possible token IDs
    for token_id in range(vocab_size):
        try:
            # Decode the token to get its byte representation
            token_bytes = enc.decode_single_token_bytes(token_id)
            # Count the token as whitespace only if its bytes decode as
            # strict UTF-8 (no replacement characters) and the decoded
            # text consists entirely of whitespace characters.
            try:
                token_text = token_bytes.decode('utf-8', errors='strict')
                if is_whitespace_only(token_text):
                    whitespace_tokens.append({
                        'id': token_id,
                        'text': token_text,
                        'bytes': token_bytes,
                        'length': len(token_text),
                        'repr': repr(token_text)
                    })
            except UnicodeDecodeError:
                # Invalid UTF-8 (e.g. a partial multi-byte sequence), skip it
                continue
        except Exception:
            # Special tokens and unassigned IDs may not decode; skip them
            continue

    # Sort by length and then by ID
    whitespace_tokens.sort(key=lambda x: (x['length'], x['id']))
    print(f"Found {len(whitespace_tokens)} whitespace-only tokens")
    print(f"Percentage of vocab: {100 * len(whitespace_tokens) / vocab_size:.3f}%\n")

    # Group by type of whitespace
    spaces_only = []
    tabs_only = []
    newlines_only = []
    mixed = []
    for token in whitespace_tokens:
        text = token['text']
        if all(c == ' ' for c in text):
            spaces_only.append(token)
        elif all(c == '\t' for c in text):
            tabs_only.append(token)
        elif all(c == '\n' for c in text):
            newlines_only.append(token)
        else:
            mixed.append(token)

    # Display summary by category
    print("SUMMARY BY WHITESPACE TYPE:")
    print()
    print(f"  Spaces only:   {len(spaces_only):4d} tokens")
    print(f"  Tabs only:     {len(tabs_only):4d} tokens")
    print(f"  Newlines only: {len(newlines_only):4d} tokens")
    print(f"  Mixed:         {len(mixed):4d} tokens")
    print()

    # Show details for space tokens
    if spaces_only:
        print("\nSPACE-ONLY TOKENS:")
        print()
        for token in spaces_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} space(s) - {token['repr']}")

    # Show details for tab tokens
    if tabs_only:
        print("\nTAB-ONLY TOKENS:")
        print()
        for token in tabs_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} tab(s) - {token['repr']}")

    # Show details for newline tokens
    if newlines_only:
        print("\nNEWLINE-ONLY TOKENS:")
        print()
        for token in newlines_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} newline(s) - {token['repr']}")

    # Show details for mixed whitespace tokens (first 20, to limit output)
    if mixed:
        print("\nMIXED WHITESPACE TOKENS:")
        print()
        for token in mixed[:20]:
            # Label each character: SP, TAB, NL, CR, or its Unicode code point
            char_breakdown = []
            for char in token['text']:
                if char == ' ':
                    char_breakdown.append('SP')
                elif char == '\t':
                    char_breakdown.append('TAB')
                elif char == '\n':
                    char_breakdown.append('NL')
                elif char == '\r':
                    char_breakdown.append('CR')
                else:
                    char_breakdown.append(f'U+{ord(char):04X}')
            breakdown_str = '+'.join(char_breakdown)
            print(f"  Token ID {token['id']:6d}: {token['repr']:30s} [{breakdown_str}]")
        if len(mixed) > 20:
            print(f"  ... and {len(mixed) - 20} more")
    print()


if __name__ == "__main__":
    analyze_whitespace_tokens()
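

# Illustrative sanity check (a minimal sketch, not part of the analysis
# above; the helper name is mine): decoding the token IDs reported in the
# docstring should reproduce the corresponding whitespace runs exactly.
# The IDs (257, 626, 279) are taken from the o200k_base tables above.
def sanity_check_ids():
    enc = tiktoken.get_encoding("o200k_base")
    assert enc.decode([257]) == " " * 4    # 4 spaces   (SPACE-ONLY table)
    assert enc.decode([626]) == "\t" * 4   # 4 tabs     (TAB-ONLY table)
    assert enc.decode([279]) == "\n" * 2   # 2 newlines (NEWLINE-ONLY table)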