Skip to content

Instantly share code, notes, and snippets.

@avi-arora
Created August 2, 2025 11:08
Show Gist options
  • Select an option

  • Save avi-arora/f4ee8c19050dbbc8ba7f0a40b69f1d4b to your computer and use it in GitHub Desktop.

Select an option

Save avi-arora/f4ee8c19050dbbc8ba7f0a40b69f1d4b to your computer and use it in GitHub Desktop.
BPE Implementation
from collections import Counter, defaultdict
def get_pairs(word):
"""Return set of symbol pairs in a word."""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def bpe_merge(sentence, num_merges):
# Split sentence into words and initialize vocabulary with characters
vocab = Counter()
words = sentence.split()
for word in words:
# Each word ends with a special marker (for word boundaries)
chars = tuple(word) + ('</w>',)
vocab[chars] += 1
for i in range(num_merges):
# Count frequency of symbol pairs across vocab
pairs = defaultdict(int)
for word, freq in vocab.items():
word_pairs = get_pairs(word)
for pair in word_pairs:
pairs[pair] += freq
if not pairs:
break
# Choose most frequent pair
best_pair = max(pairs, key=pairs.get)
if pairs[best_pair] < 1:
break
# Merge best pair in all words
new_vocab = Counter()
bigram = ' '.join(best_pair)
for word in vocab:
w = ' '.join(word)
w_new = w.replace(bigram, ''.join(best_pair))
new_vocab[tuple(w_new.split())] += vocab[word]
vocab = new_vocab
return vocab
# Example usage:
sentence = "low lower newest widest"
num_merges = 10
result_vocab = bpe_merge(sentence, num_merges)
print(result_vocab)
# Output would show the current vocabulary (tokens) after merges
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment