@ouachitalabs · Created October 21, 2025 12:18
"""
Total vocabulary size: 200,019

WHITESPACE TOKEN ANALYSIS

Found 434 whitespace-only tokens
Percentage of vocab: 0.217%

SUMMARY BY WHITESPACE TYPE:

Spaces only:     84 tokens
Tabs only:       20 tokens
Newlines only:   11 tokens
Mixed:          319 tokens

SPACE-ONLY TOKENS:

Token ID 220: 1 space(s) - ' '
Token ID 256: 2 space(s) - ' '
Token ID 271: 3 space(s) - ' '
Token ID 257: 4 space(s) - ' '
Token ID 530: 5 space(s) - ' '
Token ID 1699: 6 space(s) - ' '
Token ID 309: 7 space(s) - ' '
Token ID 269: 8 space(s) - ' '
Token ID 983: 9 space(s) - ' '
Token ID 3550: 10 space(s) - ' '
Token ID 352: 11 space(s) - ' '
Token ID 3346: 12 space(s) - ' '
Token ID 1698: 13 space(s) - ' '
Token ID 4442: 14 space(s) - ' '
Token ID 506: 15 space(s) - ' '
Token ID 408: 16 space(s) - ' '
Token ID 2902: 17 space(s) - ' '
Token ID 8854: 18 space(s) - ' '
Token ID 699: 19 space(s) - ' '
Token ID 7692: 20 space(s) - ' '
Token ID 4451: 21 space(s) - ' '
Token ID 10682: 22 space(s) - ' '
Token ID 968: 23 space(s) - ' '
Token ID 10406: 24 space(s) - ' '
Token ID 6453: 25 space(s) - ' '
Token ID 12397: 26 space(s) - ' '
Token ID 1686: 27 space(s) - ' '
Token ID 13541: 28 space(s) - ' '
Token ID 8616: 29 space(s) - ' '
Token ID 15880: 30 space(s) - ' '
Token ID 2419: 31 space(s) - ' '
Token ID 1213: 32 space(s) - ' '
Token ID 12599: 33 space(s) - ' '
Token ID 21063: 34 space(s) - ' '
Token ID 3523: 35 space(s) - ' '
Token ID 22731: 36 space(s) - ' '
Token ID 16198: 37 space(s) - ' '
Token ID 24131: 38 space(s) - ' '
Token ID 4754: 39 space(s) - ' '
Token ID 27240: 40 space(s) - ' '
Token ID 20949: 41 space(s) - ' '
Token ID 31024: 42 space(s) - ' '
Token ID 6322: 43 space(s) - ' '
Token ID 32392: 44 space(s) - ' '
Token ID 26829: 45 space(s) - ' '
Token ID 38019: 46 space(s) - ' '
Token ID 8793: 47 space(s) - ' '
Token ID 41146: 48 space(s) - ' '
Token ID 35584: 49 space(s) - ' '
Token ID 48774: 50 space(s) - ' '
Token ID 12266: 51 space(s) - ' '
Token ID 54372: 52 space(s) - ' '
Token ID 43806: 53 space(s) - ' '
Token ID 57342: 54 space(s) - ' '
Token ID 16006: 55 space(s) - ' '
Token ID 63991: 56 space(s) - ' '
Token ID 56404: 57 space(s) - ' '
Token ID 72749: 58 space(s) - ' '
Token ID 23643: 59 space(s) - ' '
Token ID 83300: 60 space(s) - ' '
Token ID 66188: 61 space(s) - ' '
Token ID 84839: 62 space(s) - ' '
Token ID 30319: 63 space(s) - ' '
Token ID 9344: 64 space(s) - ' '
Token ID 82955: 65 space(s) - ' '
Token ID 110321: 66 space(s) - ' '
Token ID 38959: 67 space(s) - ' '
Token ID 122722: 68 space(s) - ' '
Token ID 94875: 69 space(s) - ' '
Token ID 113903: 70 space(s) - ' '
Token ID 46371: 71 space(s) - ' '
Token ID 88017: 72 space(s) - ' '
Token ID 102820: 73 space(s) - ' '
Token ID 97361: 74 space(s) - ' '
Token ID 30723: 75 space(s) - ' '
Token ID 136356: 76 space(s) - ' '
Token ID 133272: 77 space(s) - ' '
Token ID 187205: 78 space(s) - ' '
Token ID 82724: 79 space(s) - ' '
Token ID 116288: 83 space(s) - ' '
Token ID 134788: 87 space(s) - ' '
Token ID 174893: 91 space(s) - ' '
Token ID 195732: 95 space(s) - ' '
Token ID 72056: 128 space(s) - ' '

TAB-ONLY TOKENS:

Token ID 197: 1 tab(s) - '\t'
Token ID 335: 2 tab(s) - '\t\t'
Token ID 833: 3 tab(s) - '\t\t\t'
Token ID 626: 4 tab(s) - '\t\t\t\t'
Token ID 1999: 5 tab(s) - '\t\t\t\t\t'
Token ID 3083: 6 tab(s) - '\t\t\t\t\t\t'
Token ID 5216: 7 tab(s) - '\t\t\t\t\t\t\t'
Token ID 4011: 8 tab(s) - '\t\t\t\t\t\t\t\t'
Token ID 10666: 9 tab(s) - '\t\t\t\t\t\t\t\t\t'
Token ID 14973: 10 tab(s) - '\t\t\t\t\t\t\t\t\t\t'
Token ID 20581: 11 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t'
Token ID 27926: 12 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 36630: 13 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 51646: 14 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 62364: 15 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 43876: 16 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 100800: 17 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 124716: 18 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 153867: 19 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'
Token ID 193506: 20 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'

NEWLINE-ONLY TOKENS:

Token ID 198: 1 newline(s) - '\n'
Token ID 279: 2 newline(s) - '\n\n'
Token ID 2499: 3 newline(s) - '\n\n\n'
Token ID 4707: 4 newline(s) - '\n\n\n\n'
Token ID 27559: 5 newline(s) - '\n\n\n\n\n'
Token ID 37680: 6 newline(s) - '\n\n\n\n\n\n'
Token ID 70224: 7 newline(s) - '\n\n\n\n\n\n\n'
Token ID 21301: 8 newline(s) - '\n\n\n\n\n\n\n\n'
Token ID 128841: 9 newline(s) - '\n\n\n\n\n\n\n\n\n'
Token ID 160468: 10 newline(s) - '\n\n\n\n\n\n\n\n\n\n'
Token ID 64469: 16 newline(s) - '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

MIXED WHITESPACE TOKENS:

Token ID 199: '\x0b' [U+000B]
Token ID 200: '\x0c' [U+000C]
Token ID 201: '\r' [CR]
Token ID 216: '\x1c' [U+001C]
Token ID 217: '\x1d' [U+001D]
Token ID 218: '\x1e' [U+001E]
Token ID 219: '\x1f' [U+001F]
Token ID 1397: '\u3000' [U+3000]
Token ID 5310: '\xa0' [U+00A0]
Token ID 17985: '\u2002' [U+2002]
Token ID 29106: '\u2009' [U+2009]
Token ID 33203: '\u2003' [U+2003]
Token ID 35971: '\u202f' [U+202F]
Token ID 51008: '\u2028' [U+2028]
Token ID 86741: '\u200a' [U+200A]
Token ID 169653: '\u2005' [U+2005]
Token ID 370: '\r\n' [CR+NL]
Token ID 793: ' \n' [SP+NL]
Token ID 2775: '\t\n' [TAB+NL]
Token ID 4577: '\u3000\u3000' [U+3000+U+3000]
... and 299 more
"""
import tiktoken


def is_whitespace_only(text):
    """Check if a string contains only whitespace characters."""
    return len(text) > 0 and text.isspace()
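
# Note: str.isspace() is Unicode-aware, which is why single characters such
# as U+3000 (ideographic space) or U+00A0 (no-break space) appear in the
# results even though they are not ASCII whitespace:
#
#   >>> '\u3000'.isspace(), '\xa0'.isspace()
#   (True, True)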

def analyze_whitespace_tokens():
    # Load the o200k_base encoding
    enc = tiktoken.get_encoding("o200k_base")

    # Vocabulary size: token IDs run from 0 to max_token_value inclusive
    vocab_size = enc.max_token_value + 1
    print(f"Total vocabulary size: {vocab_size:,}")
    print()
    print("WHITESPACE TOKEN ANALYSIS")
    print()

    whitespace_tokens = []

    # Iterate through all possible token IDs
    for token_id in range(vocab_size):
        try:
            # Decode the token to get its byte representation
            token_bytes = enc.decode_single_token_bytes(token_id)
            # Count the token as whitespace only if it decodes as strict
            # UTF-8 (no replacement characters) and the decoded text is
            # entirely whitespace.
            try:
                token_text = token_bytes.decode('utf-8', errors='strict')
                if is_whitespace_only(token_text):
                    whitespace_tokens.append({
                        'id': token_id,
                        'text': token_text,
                        'bytes': token_bytes,
                        'length': len(token_text),
                        'repr': repr(token_text),
                    })
            except UnicodeDecodeError:
                # Not valid UTF-8 on its own (a partial byte-level BPE
                # fragment), skip it
                continue
        except Exception:
            # Some token IDs (e.g. special tokens) may not decode at all
            continue

    # Sort by length, then by ID
    whitespace_tokens.sort(key=lambda x: (x['length'], x['id']))

    print(f"Found {len(whitespace_tokens)} whitespace-only tokens")
    print(f"Percentage of vocab: {100 * len(whitespace_tokens) / vocab_size:.3f}%\n")

    # Group by type of whitespace
    spaces_only = []
    tabs_only = []
    newlines_only = []
    mixed = []
    for token in whitespace_tokens:
        text = token['text']
        if all(c == ' ' for c in text):
            spaces_only.append(token)
        elif all(c == '\t' for c in text):
            tabs_only.append(token)
        elif all(c == '\n' for c in text):
            newlines_only.append(token)
        else:
            mixed.append(token)

    # Display summary by category
    print("SUMMARY BY WHITESPACE TYPE:")
    print()
    print(f"  Spaces only:   {len(spaces_only):4d} tokens")
    print(f"  Tabs only:     {len(tabs_only):4d} tokens")
    print(f"  Newlines only: {len(newlines_only):4d} tokens")
    print(f"  Mixed:         {len(mixed):4d} tokens")
    print()

    # Show details for space tokens
    if spaces_only:
        print("\nSPACE-ONLY TOKENS:")
        print()
        for token in spaces_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} space(s) - {token['repr']}")

    # Show details for tab tokens
    if tabs_only:
        print("\nTAB-ONLY TOKENS:")
        print()
        for token in tabs_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} tab(s) - {token['repr']}")

    # Show details for newline tokens
    if newlines_only:
        print("\nNEWLINE-ONLY TOKENS:")
        print()
        for token in newlines_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} newline(s) - {token['repr']}")

    # Show details for mixed whitespace tokens
    if mixed:
        print("\nMIXED WHITESPACE TOKENS:")
        print()
        for token in mixed[:20]:  # Show the first 20 to keep output manageable
            # Label each character: SP/TAB/NL/CR for the ASCII cases,
            # U+XXXX for everything else
            char_breakdown = []
            for char in token['text']:
                if char == ' ':
                    char_breakdown.append('SP')
                elif char == '\t':
                    char_breakdown.append('TAB')
                elif char == '\n':
                    char_breakdown.append('NL')
                elif char == '\r':
                    char_breakdown.append('CR')
                else:
                    char_breakdown.append(f'U+{ord(char):04X}')
            breakdown_str = '+'.join(char_breakdown)
            print(f"  Token ID {token['id']:6d}: {token['repr']:30s} [{breakdown_str}]")
        if len(mixed) > 20:
            print(f"  ... and {len(mixed) - 20} more")
    print()
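
# Companion sketch (not part of the original analysis): measure what these
# tokens buy in practice. Per the table above, any run of up to 79 spaces
# should encode to a single token. The helper name is illustrative; it only
# relies on tiktoken's standard encode() API.
def space_run_cost(n, enc=None):
    """Return how many o200k_base tokens a run of n spaces costs."""
    enc = enc or tiktoken.get_encoding("o200k_base")
    return len(enc.encode(" " * n))
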
if __name__ == "__main__":
    analyze_whitespace_tokens()
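
# Usage note: run the file directly (python this_file.py; the name is
# illustrative) to print the report quoted in the docstring; tiktoken is the
# only dependency (pip install tiktoken). The sketch above can then be used
# interactively: space_run_cost(4) should return 1, while space_run_cost(80)
# should return more than 1, since no 80-space token exists in the table.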