Created
August 23, 2025 05:32
-
-
Save pamelafox/086c3c1d781dd0d6ad2cc832fbbd89b3 to your computer and use it in GitHub Desktop.
Find a stable single-token
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Find characters whose repetitions tokenize to one token per character with tiktoken. | |
| A character c is considered "stable single-token" if: | |
| len(encode(c)) == 1 AND for all k in [1, max_reps], len(encode(c * k)) == k. | |
| Usage (from repo root with virtual env active): | |
| python find_single_token_char.py | |
| python find_single_token_char.py --model text-embedding-3-large --max-reps 32 | |
| Exit code is 0 if at least one qualifying character was found, else 1. | |
| """ | |
| import argparse | |
| import sys | |
| import unicodedata | |
| from collections.abc import Iterable | |
| import tiktoken | |
def parse_range(spec: str) -> Iterable[int]:
    """Turn one command-line range spec into the code points it covers.

    Accepted forms are ``LOW-HIGH`` (for example ``0x20-0x7E`` or
    ``160-255``) and a lone value such as ``0x2022``. Each number is read
    with ``int(text, 0)``, so ``0x``/``0o``/``0b`` prefixes are honoured.
    Bounds given in descending order are swapped, and both ends are
    included in the returned range.

    Raises:
        ValueError: if either number cannot be parsed by ``int``.
    """
    low_text, dash, high_text = spec.partition("-")
    if not dash:
        # No hyphen at all: the spec names exactly one code point.
        only = int(spec, 0)
        return range(only, only + 1)
    first, second = int(low_text, 0), int(high_text, 0)
    low, high = min(first, second), max(first, second)
    return range(low, high + 1)
def default_ranges() -> list[tuple[int, int]]:
    """Return the curated inclusive ``(first, last)`` spans scanned by default."""
    basic_latin = (0x20, 0x7E)  # Basic Latin printable
    latin1_supplement = (0xA0, 0xFF)  # Latin-1 Supplement (printable)
    general_punctuation = (0x2000, 0x206F)  # General Punctuation
    arrows = (0x2190, 0x21FF)  # Arrows
    enclosed_alphanumerics = (0x2460, 0x24FF)  # Enclosed Alphanumerics
    return [
        basic_latin,
        latin1_supplement,
        general_punctuation,
        arrows,
        enclosed_alphanumerics,
    ]
def iter_codepoints(ranges: list[tuple[int, int]]):
    """Yield each code point of every inclusive ``(start, end)`` pair, in the order given.

    A pair whose start exceeds its end contributes nothing.
    """
    for first, last in ranges:
        for codepoint in range(first, last + 1):
            yield codepoint
def is_stable_single_token_char(ch: str, enc, max_reps: int) -> bool:
    """Report whether repeating ``ch`` always costs exactly one token per copy.

    Args:
        ch: The candidate character. Anything that is not exactly one code
            point long is rejected.
        enc: Tokenizer object; only ``enc.encode(text)`` is used, and the
            length of its result is taken as the token count.
        max_reps: Largest repetition count to verify.

    Returns:
        True when ``ch`` alone encodes to one token and ``ch * k`` encodes to
        ``k`` tokens for every ``k`` from 2 through ``max_reps``; otherwise
        False.
    """
    if len(ch) != 1:
        # unicodedata.category() only accepts a single code point.
        return False
    # Filter on Unicode category before touching the tokenizer: the check is
    # cheap, and it keeps control, format, surrogate, private-use and
    # unassigned code points (all "C*" categories) away from the encoder.
    # NO-BREAK SPACE (U+00A0) is category Zs, so it is not affected.
    if unicodedata.category(ch).startswith("C"):
        return False
    if len(enc.encode(ch)) != 1:
        return False
    return all(len(enc.encode(ch * k)) == k for k in range(2, max_reps + 1))
def main():
    """Command-line entry point: scan code points and print the stable ones.

    Parses the command line, loads the tiktoken encoding for ``--model``
    (falling back to treating the value as an encoding name), then walks the
    requested code point ranges (or the curated defaults) and prints every
    character accepted by ``is_stable_single_token_char``. Scanning stops at
    the first match unless ``--all`` is given.

    Always terminates the process: status 0 when at least one character
    qualified, 1 when none did, and 2 (via ``argparse``) for invalid
    arguments.
    """
    ap = argparse.ArgumentParser(description="Find characters with stable 1 char -> 1 token repetition behavior.")
    ap.add_argument("--model", default="text-embedding-ada-002",
                    help="Model whose encoding to inspect (tiktoken.encoding_for_model).")
    ap.add_argument("--max-reps", type=int, default=16,
                    help="Maximum repetition length to validate per character.")
    ap.add_argument("--ranges", nargs="*", metavar="RANGE",
                    help="Code point ranges like 0x20-0x7E 160-255 0x2022. If omitted, uses a curated default set.")
    ap.add_argument("--all", action="store_true",
                    help="List all qualifying characters instead of stopping at first match.")
    ap.add_argument("--show-names", action="store_true",
                    help="Include Unicode names in output.")
    args = ap.parse_args()

    if args.max_reps < 1:
        ap.error("--max-reps must be at least 1")

    try:
        enc = tiktoken.encoding_for_model(args.model)
    except KeyError:
        # Not a recognised model name; retry with the value as an encoding name.
        try:
            enc = tiktoken.get_encoding(args.model)
        except ValueError as exc:
            ap.error(f"cannot load an encoding for {args.model!r}: {exc}")

    if args.ranges:
        # Validate every spec up front so a typo is reported as a usage error
        # (status 2) instead of a traceback with the "nothing found" status 1.
        spans = []
        for spec in args.ranges:
            try:
                spans.append(parse_range(spec))
            except ValueError:
                ap.error(f"invalid code point range {spec!r}")
        # Iterate the parsed spans lazily; no need to materialise them.
        codepoints = (cp for span in spans for cp in span)
    else:
        codepoints = iter_codepoints(default_ranges())

    found = []
    for cp in codepoints:
        try:
            ch = chr(cp)
        except (ValueError, OverflowError):
            # Value is not a valid Unicode code point; skip it.
            continue
        if not is_stable_single_token_char(ch, enc, args.max_reps):
            continue
        if args.show_names:
            name = unicodedata.name(ch, "<no name>")
            print(f"U+{cp:04X} {repr(ch)} (name={name}) OK (1 token per char up to {args.max_reps})")
        else:
            print(f"U+{cp:04X} {repr(ch)} OK")
        found.append(ch)
        if not args.all:
            break

    if not found:
        print("No stable single-token characters found in the scanned ranges.", file=sys.stderr)
        sys.exit(1)
    print(f"\nSummary: {len(found)} character(s) found. First: {repr(found[0])}")
    sys.exit(0)
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment