| """ | |
| Total vocabulary size: 200,019 | |
| WHITESPACE TOKEN ANALYSIS | |
| Found 434 whitespace-only tokens | |
| Percentage of vocab: 0.217% | |
| SUMMARY BY WHITESPACE TYPE: | |
| Spaces only: 84 tokens | |
| Tabs only: 20 tokens | |
| Newlines only: 11 tokens | |
| Mixed: 319 tokens | |
| SPACE-ONLY TOKENS: | |
| Token ID 220: 1 space(s) - ' ' | |
| Token ID 256: 2 space(s) - ' ' | |
| Token ID 271: 3 space(s) - ' ' | |
| Token ID 257: 4 space(s) - ' ' | |
| Token ID 530: 5 space(s) - ' ' | |
| Token ID 1699: 6 space(s) - ' ' | |
| Token ID 309: 7 space(s) - ' ' | |
| Token ID 269: 8 space(s) - ' ' | |
| Token ID 983: 9 space(s) - ' ' | |
| Token ID 3550: 10 space(s) - ' ' | |
| Token ID 352: 11 space(s) - ' ' | |
| Token ID 3346: 12 space(s) - ' ' | |
| Token ID 1698: 13 space(s) - ' ' | |
| Token ID 4442: 14 space(s) - ' ' | |
| Token ID 506: 15 space(s) - ' ' | |
| Token ID 408: 16 space(s) - ' ' | |
| Token ID 2902: 17 space(s) - ' ' | |
| Token ID 8854: 18 space(s) - ' ' | |
| Token ID 699: 19 space(s) - ' ' | |
| Token ID 7692: 20 space(s) - ' ' | |
| Token ID 4451: 21 space(s) - ' ' | |
| Token ID 10682: 22 space(s) - ' ' | |
| Token ID 968: 23 space(s) - ' ' | |
| Token ID 10406: 24 space(s) - ' ' | |
| Token ID 6453: 25 space(s) - ' ' | |
| Token ID 12397: 26 space(s) - ' ' | |
| Token ID 1686: 27 space(s) - ' ' | |
| Token ID 13541: 28 space(s) - ' ' | |
| Token ID 8616: 29 space(s) - ' ' | |
| Token ID 15880: 30 space(s) - ' ' | |
| Token ID 2419: 31 space(s) - ' ' | |
| Token ID 1213: 32 space(s) - ' ' | |
| Token ID 12599: 33 space(s) - ' ' | |
| Token ID 21063: 34 space(s) - ' ' | |
| Token ID 3523: 35 space(s) - ' ' | |
| Token ID 22731: 36 space(s) - ' ' | |
| Token ID 16198: 37 space(s) - ' ' | |
| Token ID 24131: 38 space(s) - ' ' | |
| Token ID 4754: 39 space(s) - ' ' | |
| Token ID 27240: 40 space(s) - ' ' | |
| Token ID 20949: 41 space(s) - ' ' | |
| Token ID 31024: 42 space(s) - ' ' | |
| Token ID 6322: 43 space(s) - ' ' | |
| Token ID 32392: 44 space(s) - ' ' | |
| Token ID 26829: 45 space(s) - ' ' | |
| Token ID 38019: 46 space(s) - ' ' | |
| Token ID 8793: 47 space(s) - ' ' | |
| Token ID 41146: 48 space(s) - ' ' | |
| Token ID 35584: 49 space(s) - ' ' | |
| Token ID 48774: 50 space(s) - ' ' | |
| Token ID 12266: 51 space(s) - ' ' | |
| Token ID 54372: 52 space(s) - ' ' | |
| Token ID 43806: 53 space(s) - ' ' | |
| Token ID 57342: 54 space(s) - ' ' | |
| Token ID 16006: 55 space(s) - ' ' | |
| Token ID 63991: 56 space(s) - ' ' | |
| Token ID 56404: 57 space(s) - ' ' | |
| Token ID 72749: 58 space(s) - ' ' | |
| Token ID 23643: 59 space(s) - ' ' | |
| Token ID 83300: 60 space(s) - ' ' | |
| Token ID 66188: 61 space(s) - ' ' | |
| Token ID 84839: 62 space(s) - ' ' | |
| Token ID 30319: 63 space(s) - ' ' | |
| Token ID 9344: 64 space(s) - ' ' | |
| Token ID 82955: 65 space(s) - ' ' | |
| Token ID 110321: 66 space(s) - ' ' | |
| Token ID 38959: 67 space(s) - ' ' | |
| Token ID 122722: 68 space(s) - ' ' | |
| Token ID 94875: 69 space(s) - ' ' | |
| Token ID 113903: 70 space(s) - ' ' | |
| Token ID 46371: 71 space(s) - ' ' | |
| Token ID 88017: 72 space(s) - ' ' | |
| Token ID 102820: 73 space(s) - ' ' | |
| Token ID 97361: 74 space(s) - ' ' | |
| Token ID 30723: 75 space(s) - ' ' | |
| Token ID 136356: 76 space(s) - ' ' | |
| Token ID 133272: 77 space(s) - ' ' | |
| Token ID 187205: 78 space(s) - ' ' | |
| Token ID 82724: 79 space(s) - ' ' | |
| Token ID 116288: 83 space(s) - ' ' | |
| Token ID 134788: 87 space(s) - ' ' | |
| Token ID 174893: 91 space(s) - ' ' | |
| Token ID 195732: 95 space(s) - ' ' | |
| Token ID 72056: 128 space(s) - ' ' | |
| TAB-ONLY TOKENS: | |
| Token ID 197: 1 tab(s) - '\t' | |
| Token ID 335: 2 tab(s) - '\t\t' | |
| Token ID 833: 3 tab(s) - '\t\t\t' | |
| Token ID 626: 4 tab(s) - '\t\t\t\t' | |
| Token ID 1999: 5 tab(s) - '\t\t\t\t\t' | |
| Token ID 3083: 6 tab(s) - '\t\t\t\t\t\t' | |
| Token ID 5216: 7 tab(s) - '\t\t\t\t\t\t\t' | |
| Token ID 4011: 8 tab(s) - '\t\t\t\t\t\t\t\t' | |
| Token ID 10666: 9 tab(s) - '\t\t\t\t\t\t\t\t\t' | |
| Token ID 14973: 10 tab(s) - '\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 20581: 11 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 27926: 12 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 36630: 13 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 51646: 14 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 62364: 15 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 43876: 16 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 100800: 17 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 124716: 18 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 153867: 19 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| Token ID 193506: 20 tab(s) - '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' | |
| NEWLINE-ONLY TOKENS: | |
| Token ID 198: 1 newline(s) - '\n' | |
| Token ID 279: 2 newline(s) - '\n\n' | |
| Token ID 2499: 3 newline(s) - '\n\n\n' | |
| Token ID 4707: 4 newline(s) - '\n\n\n\n' | |
| Token ID 27559: 5 newline(s) - '\n\n\n\n\n' | |
| Token ID 37680: 6 newline(s) - '\n\n\n\n\n\n' | |
| Token ID 70224: 7 newline(s) - '\n\n\n\n\n\n\n' | |
| Token ID 21301: 8 newline(s) - '\n\n\n\n\n\n\n\n' | |
| Token ID 128841: 9 newline(s) - '\n\n\n\n\n\n\n\n\n' | |
| Token ID 160468: 10 newline(s) - '\n\n\n\n\n\n\n\n\n\n' | |
| Token ID 64469: 16 newline(s) - '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n' | |
| MIXED WHITESPACE TOKENS: | |
| Token ID 199: '\x0b' [U+000B] | |
| Token ID 200: '\x0c' [U+000C] | |
| Token ID 201: '\r' [CR] | |
| Token ID 216: '\x1c' [U+001C] | |
| Token ID 217: '\x1d' [U+001D] | |
| Token ID 218: '\x1e' [U+001E] | |
| Token ID 219: '\x1f' [U+001F] | |
| Token ID 1397: '\u3000' [U+3000] | |
| Token ID 5310: '\xa0' [U+00A0] | |
| Token ID 17985: '\u2002' [U+2002] | |
| Token ID 29106: '\u2009' [U+2009] | |
| Token ID 33203: '\u2003' [U+2003] | |
| Token ID 35971: '\u202f' [U+202F] | |
| Token ID 51008: '\u2028' [U+2028] | |
| Token ID 86741: '\u200a' [U+200A] | |
| Token ID 169653: '\u2005' [U+2005] | |
| Token ID 370: '\r\n' [CR+NL] | |
| Token ID 793: ' \n' [SP+NL] | |
| Token ID 2775: '\t\n' [TAB+NL] | |
| Token ID 4577: '\u3000\u3000' [U+3000+U+3000] | |
| ... and 299 more | |
| """ | |
import tiktoken


def is_whitespace_only(text):
    """Check if a string contains only whitespace characters."""
    return len(text) > 0 and text.isspace()


def analyze_whitespace_tokens():
    # Load the o200k_base encoding (the tokenizer used by GPT-4o-era models)
    enc = tiktoken.get_encoding("o200k_base")

    # Token IDs run from 0 to max_token_value, so the nominal vocab size is max + 1
    vocab_size = enc.max_token_value + 1
    print(f"Total vocabulary size: {vocab_size:,}")
    print()
    print("WHITESPACE TOKEN ANALYSIS")
    print()

    whitespace_tokens = []

    # Iterate through all possible token IDs
    for token_id in range(vocab_size):
        try:
            # Decode the token to get its byte representation
            token_bytes = enc.decode_single_token_bytes(token_id)
            # Count the token as whitespace only if its bytes decode as
            # strict UTF-8 (no replacement characters) and the decoded
            # text consists entirely of whitespace characters.
            try:
                token_text = token_bytes.decode('utf-8', errors='strict')
                if is_whitespace_only(token_text):
                    whitespace_tokens.append({
                        'id': token_id,
                        'text': token_text,
                        'bytes': token_bytes,
                        'length': len(token_text),
                        'repr': repr(token_text)
                    })
            except UnicodeDecodeError:
                # Invalid UTF-8 (e.g. a partial multi-byte sequence), skip it
                continue
        except Exception:
            # Special tokens and unassigned IDs may not decode; skip them
            continue

    # Sort by length and then by ID
    whitespace_tokens.sort(key=lambda x: (x['length'], x['id']))
    print(f"Found {len(whitespace_tokens)} whitespace-only tokens")
    print(f"Percentage of vocab: {100 * len(whitespace_tokens) / vocab_size:.3f}%\n")

    # Group by type of whitespace
    spaces_only = []
    tabs_only = []
    newlines_only = []
    mixed = []
    for token in whitespace_tokens:
        text = token['text']
        if all(c == ' ' for c in text):
            spaces_only.append(token)
        elif all(c == '\t' for c in text):
            tabs_only.append(token)
        elif all(c == '\n' for c in text):
            newlines_only.append(token)
        else:
            mixed.append(token)

    # Display summary by category
    print("SUMMARY BY WHITESPACE TYPE:")
    print()
    print(f"  Spaces only:   {len(spaces_only):4d} tokens")
    print(f"  Tabs only:     {len(tabs_only):4d} tokens")
    print(f"  Newlines only: {len(newlines_only):4d} tokens")
    print(f"  Mixed:         {len(mixed):4d} tokens")
    print()

    # Show details for space tokens
    if spaces_only:
        print("\nSPACE-ONLY TOKENS:")
        print()
        for token in spaces_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} space(s) - {token['repr']}")

    # Show details for tab tokens
    if tabs_only:
        print("\nTAB-ONLY TOKENS:")
        print()
        for token in tabs_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} tab(s) - {token['repr']}")

    # Show details for newline tokens
    if newlines_only:
        print("\nNEWLINE-ONLY TOKENS:")
        print()
        for token in newlines_only:
            print(f"  Token ID {token['id']:6d}: {token['length']:2d} newline(s) - {token['repr']}")

    # Show details for mixed whitespace tokens (first 20, to limit output)
    if mixed:
        print("\nMIXED WHITESPACE TOKENS:")
        print()
        for token in mixed[:20]:
            # Label each character: SP, TAB, NL, CR, or its Unicode code point
            char_breakdown = []
            for char in token['text']:
                if char == ' ':
                    char_breakdown.append('SP')
                elif char == '\t':
                    char_breakdown.append('TAB')
                elif char == '\n':
                    char_breakdown.append('NL')
                elif char == '\r':
                    char_breakdown.append('CR')
                else:
                    char_breakdown.append(f'U+{ord(char):04X}')
            breakdown_str = '+'.join(char_breakdown)
            print(f"  Token ID {token['id']:6d}: {token['repr']:30s} [{breakdown_str}]")
        if len(mixed) > 20:
            print(f"  ... and {len(mixed) - 20} more")
    print()


if __name__ == "__main__":
    analyze_whitespace_tokens()
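

# Illustrative sanity check (a minimal sketch, not part of the analysis
# above; the helper name is mine): decoding the token IDs reported in the
# docstring should reproduce the corresponding whitespace runs exactly.
# The IDs (257, 626, 279) are taken from the o200k_base tables above.
def sanity_check_ids():
    enc = tiktoken.get_encoding("o200k_base")
    assert enc.decode([257]) == " " * 4    # 4 spaces   (SPACE-ONLY table)
    assert enc.decode([626]) == "\t" * 4   # 4 tabs     (TAB-ONLY table)
    assert enc.decode([279]) == "\n" * 2   # 2 newlines (NEWLINE-ONLY table)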