Created
February 20, 2026 14:56
-
-
Save marr75/b9963c317eb682d5c652eac75f433379 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = ["tiktoken"] | |
| # /// | |
| """Explore how GPT-2's BPE tokenizer handles periods in different contexts.""" | |
| import tiktoken | |
| enc = tiktoken.get_encoding("gpt2") | |
def show(label: str, text: str) -> None:
    """Print how the module-level encoder ``enc`` tokenizes ``text``.

    Emits the label, the repr of the input text, the decoded string for
    each individual token, and the raw token ids, followed by a blank line.

    Args:
        label: Short description of the case being demonstrated.
        text: The string to encode.
    """
    ids = enc.encode(text)

    # Decode each id separately so the token boundaries are visible.
    pieces = []
    for token_id in ids:
        pieces.append(enc.decode([token_id]))

    # The trailing empty string produces the blank separator line.
    print(
        f" {label}",
        f" text: {text!r}",
        f" tokens: {pieces}",
        f" ids: {ids}",
        "",
        sep="\n",
    )
# Each entry is (section heading, [(label, text), ...]); sections and cases
# are printed in the order listed here.
_SECTIONS = [
    (
        "PERIOD AS SENTENCE ENDER",
        [
            ("The reddit example", "The temperature was 98."),
            ("Followed by space + new sentence", "The temperature was 98. The next day"),
            ("Followed by newline", "The temperature was 98.\nThe next day"),
            ("Simple sentence end", "He left."),
            ("End with space after", "He left. She stayed."),
        ],
    ),
    (
        "PERIOD IN DECIMAL NUMBERS",
        [
            ("Decimal mid-sentence", "The temperature was 98.6 degrees."),
            ("Decimal at sentence end", "The temperature was 98.6."),
            ("Small decimal", "It was 3.14 radians."),
            ("Leading zero", "The value is 0.5 percent."),
            ("Multiple decimals", "From 98.6 to 99.1 degrees."),
            ("Large decimal", "The price was 1234.56 dollars."),
        ],
    ),
    (
        "PERIOD IN ABBREVIATIONS / SPECIAL CONTEXTS",
        [
            ("Abbreviation", "Dr. Smith went home."),
            ("Initials", "J. R. R. Tolkien wrote books."),
            ("Ellipsis", "He paused... then continued."),
            ("Domain name", "Visit openai.com for details."),
            ("File extension", "Edit the file main.py please."),
            ("Version number", "Using version 3.12.1 now."),
            ("IP address", "The server is at 192.168.1.1 today."),
        ],
    ),
    (
        "AMBIGUOUS CASES — DOES CONTEXT CHANGE TOKENIZATION?",
        [
            ("98 + period alone", "98."),
            ("98 + period + digit", "98.6"),
            ("98 + period + space", "98. "),
            ("98 + period + space + upper", "98. The"),
            ("98 + period + space + lower", "98. the"),
            ("98 + period + newline", "98.\n"),
        ],
    ),
]

_RULE = "=" * 80

for _heading, _cases in _SECTIONS:
    # Banner: rule, heading, rule.
    print(_RULE)
    print(_heading)
    print(_RULE)
    for _label, _text in _cases:
        show(_label, _text)
print("=" * 80)
print("TOKEN IDENTITY CHECK — IS '.' ALWAYS THE SAME TOKEN?")
print("=" * 80)

# Encode a lone period to see which token id(s) it maps to.
period_token = enc.encode(".")
print(f" Bare period '.' encodes to: {period_token} (decoded: {[enc.decode([t]) for t in period_token]})")
print()

# Scan the vocabulary in id order for tokens that contain a period but are
# not the bare period itself, and print the first few as a sample.
MAX_PERIOD_SAMPLES = 40
print(" Tokens containing '.' (sampling from vocab):")
count = 0
for token_id in range(enc.n_vocab):
    # Guard only the decode call so that errors in the filtering/printing
    # logic below are not silently hidden.
    try:
        decoded = enc.decode([token_id])
    except KeyError:
        # tiktoken reports an id with no vocabulary entry as KeyError;
        # skip such gaps and keep scanning.
        continue
    if "." in decoded and decoded != ".":
        print(f" id={token_id:>6} {decoded!r}")
        count += 1
        if count >= MAX_PERIOD_SAMPLES:
            print(f" ... (stopping at {count}, there are more)")
            break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment