Created
August 2, 2025 11:08
-
-
Save avi-arora/f4ee8c19050dbbc8ba7f0a40b69f1d4b to your computer and use it in GitHub Desktop.
BPE Implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter, defaultdict | |
| def get_pairs(word): | |
| """Return set of symbol pairs in a word.""" | |
| pairs = set() | |
| prev_char = word[0] | |
| for char in word[1:]: | |
| pairs.add((prev_char, char)) | |
| prev_char = char | |
| return pairs | |
| def bpe_merge(sentence, num_merges): | |
| # Split sentence into words and initialize vocabulary with characters | |
| vocab = Counter() | |
| words = sentence.split() | |
| for word in words: | |
| # Each word ends with a special marker (for word boundaries) | |
| chars = tuple(word) + ('</w>',) | |
| vocab[chars] += 1 | |
| for i in range(num_merges): | |
| # Count frequency of symbol pairs across vocab | |
| pairs = defaultdict(int) | |
| for word, freq in vocab.items(): | |
| word_pairs = get_pairs(word) | |
| for pair in word_pairs: | |
| pairs[pair] += freq | |
| if not pairs: | |
| break | |
| # Choose most frequent pair | |
| best_pair = max(pairs, key=pairs.get) | |
| if pairs[best_pair] < 1: | |
| break | |
| # Merge best pair in all words | |
| new_vocab = Counter() | |
| bigram = ' '.join(best_pair) | |
| for word in vocab: | |
| w = ' '.join(word) | |
| w_new = w.replace(bigram, ''.join(best_pair)) | |
| new_vocab[tuple(w_new.split())] += vocab[word] | |
| vocab = new_vocab | |
| return vocab | |
| # Example usage: | |
| sentence = "low lower newest widest" | |
| num_merges = 10 | |
| result_vocab = bpe_merge(sentence, num_merges) | |
| print(result_vocab) | |
| # Output would show the current vocabulary (tokens) after merges |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment