Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Created August 23, 2025 05:32
Show Gist options
  • Select an option

  • Save pamelafox/086c3c1d781dd0d6ad2cc832fbbd89b3 to your computer and use it in GitHub Desktop.

Select an option

Save pamelafox/086c3c1d781dd0d6ad2cc832fbbd89b3 to your computer and use it in GitHub Desktop.
Find a stable single-token
#!/usr/bin/env python3
"""
Find characters whose repetitions tokenize to one token per character with tiktoken.
A character c is considered "stable single-token" if:
len(encode(c)) == 1 AND for all k in [1, max_reps], len(encode(c * k)) == k.
Usage (from repo root with virtual env active):
python find_single_token_char.py
python find_single_token_char.py --model text-embedding-3-large --max-reps 32
Exit code is 0 if at least one qualifying character was found, else 1.
"""
import argparse
import sys
import unicodedata
from collections.abc import Iterable
import tiktoken
def parse_range(spec: str) -> Iterable[int]:
    """Turn one command-line range spec into the code points it covers.

    A spec is either a single value ('0x2022') or two values joined by a
    hyphen ('0x20-0x7E', '160-255'). Each value is parsed with
    ``int(value, 0)``, so a prefix such as ``0x`` selects the base. Both
    bounds are inclusive and may be written in either order.

    Returns:
        A ``range`` covering the requested code points.

    Raises:
        ValueError: if a value cannot be parsed as an integer.
    """
    low_text, hyphen, high_text = spec.partition("-")
    if not hyphen:
        # No hyphen at all: the spec names exactly one code point.
        point = int(spec, 0)
        return range(point, point + 1)
    # Sorting the bounds makes a reversed spec behave like the forward one.
    low, high = sorted((int(low_text, 0), int(high_text, 0)))
    return range(low, high + 1)
def default_ranges() -> list[tuple[int, int]]:
    """Return the built-in (first, last) code point pairs used as the default scan set.

    Each pair names one Unicode block. A fresh list is built on every call,
    so callers may modify the result freely.
    """
    basic_latin = (0x20, 0x7E)  # Basic Latin printable
    latin1_supplement = (0xA0, 0xFF)  # Latin-1 Supplement (printable)
    general_punctuation = (0x2000, 0x206F)  # General Punctuation
    arrows = (0x2190, 0x21FF)  # Arrows
    enclosed_alphanumerics = (0x2460, 0x24FF)  # Enclosed Alphanumerics
    return [
        basic_latin,
        latin1_supplement,
        general_punctuation,
        arrows,
        enclosed_alphanumerics,
    ]
def iter_codepoints(ranges: list[tuple[int, int]]):
    """Yield every code point of each (first, last) pair, taking the pairs in order.

    Both ends of a pair are inclusive. A pair whose first value exceeds its
    last value contributes nothing.
    """
    for first, last in ranges:
        for code_point in range(first, last + 1):
            yield code_point
def is_stable_single_token_char(ch: str, enc, max_reps: int) -> bool:
    """Report whether ``ch`` tokenizes to one token per character when repeated.

    Args:
        ch: The character to test.
        enc: An object whose ``encode(text)`` returns a sized sequence of tokens.
        max_reps: Largest repetition count to verify.

    Returns:
        True only if all three conditions hold:
        - ``ch`` alone encodes to exactly one token;
        - its Unicode general category does not start with "C";
        - ``ch * k`` encodes to exactly ``k`` tokens for every k from 2 to
          ``max_reps``.
    """
    if len(enc.encode(ch)) != 1:
        return False
    # Skip characters in the "C" general categories (most control chars).
    # NOTE(review): U+00A0 is exempted here, but it does not appear to be in a
    # "C" category, so the exemption looks like a no-op -- confirm.
    if unicodedata.category(ch).startswith("C") and ch != "\u00A0":
        return False
    # all() stops at the first repetition count whose token count is off.
    return all(
        len(enc.encode(ch * count)) == count
        for count in range(2, max_reps + 1)
    )
def main():
    """Command-line entry point: scan code point ranges for stable single-token characters.

    Parses the command line, resolves the tiktoken encoding (first as a model
    name, then as an encoding name), walks the requested code point ranges
    (or the default set) and prints one line per qualifying character.
    Unless ``--all`` is given, scanning stops at the first match.

    Exit status:
        0 if at least one qualifying character was found, 1 if none was.
        Invalid arguments (``--max-reps`` below 1, an unknown
        model/encoding name, a malformed range) are reported through
        ``ArgumentParser.error``, which prints a usage message and exits.
    """
    ap = argparse.ArgumentParser(description="Find characters with stable 1 char -> 1 token repetition behavior.")
    ap.add_argument("--model", default="text-embedding-ada-002",
                    help="Model whose encoding to inspect (tiktoken.encoding_for_model).")
    ap.add_argument("--max-reps", type=int, default=16,
                    help="Maximum repetition length to validate per character.")
    ap.add_argument("--ranges", nargs="*", metavar="RANGE",
                    help="Code point ranges like 0x20-0x7E 160-255 0x2022. If omitted, uses a curated default set.")
    ap.add_argument("--all", action="store_true",
                    help="List all qualifying characters instead of stopping at first match.")
    ap.add_argument("--show-names", action="store_true",
                    help="Include Unicode names in output.")
    args = ap.parse_args()

    # With fewer than one repetition the per-character check is vacuous.
    if args.max_reps < 1:
        ap.error("--max-reps must be at least 1")

    try:
        enc = tiktoken.encoding_for_model(args.model)
    except KeyError:
        # Not a known model name: fall back to treating it as an encoding name.
        try:
            enc = tiktoken.get_encoding(args.model)
        except ValueError as exc:
            ap.error(f"unknown model or encoding {args.model!r}: {exc}")

    if args.ranges:
        parsed_ranges: list[tuple[int, int]] = []
        for spec in args.ranges:
            try:
                cps = parse_range(spec)
            except ValueError:
                ap.error(f"invalid range {spec!r}; expected forms like 0x20-0x7E, 160-255 or 0x2022")
            # parse_range returns a non-empty range, so its endpoints can be
            # read by index without materialising every code point.
            parsed_ranges.append((cps[0], cps[-1]))
    else:
        parsed_ranges = default_ranges()

    found = []
    for cp in iter_codepoints(parsed_ranges):
        try:
            ch = chr(cp)
        except (ValueError, OverflowError):
            # Value is not a valid Unicode code point; skip it.
            continue
        if not is_stable_single_token_char(ch, enc, args.max_reps):
            continue
        if args.show_names:
            name = unicodedata.name(ch, "<no name>")
            print(f"U+{cp:04X} {repr(ch)} (name={name}) OK (1 token per char up to {args.max_reps})")
        else:
            print(f"U+{cp:04X} {repr(ch)} OK")
        found.append(ch)
        if not args.all:
            break

    if not found:
        print("No stable single-token characters found in the scanned ranges.", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"\nSummary: {len(found)} character(s) found. First: {repr(found[0])}")
        sys.exit(0)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment