avi-arora · August 2, 2025 11:08
diff --git a/bpe.py b/bpe.py
 from collections import Counter, defaultdict

 def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: A sliceable sequence of symbols (a string or a tuple of
            strings).

    Returns:
        A set of ``(left, right)`` tuples, one per distinct pair of
        neighbouring symbols. The set is empty when ``word`` has fewer
        than two symbols.
    """
    # zip() stops at the shorter argument, so an empty or one-symbol word
    # yields no pairs instead of raising IndexError on word[0].
    return set(zip(word, word[1:]))

 def _merge_pair(symbols, pair):
    """Return symbols as a tuple with each adjacent occurrence of pair fused."""
    merged = []
    i = 0
    last = len(symbols) - 1
    while i <= last:
        if i < last and (symbols[i], symbols[i + 1]) == pair:
            merged.append(symbols[i] + symbols[i + 1])
            i += 2  # Skip both halves so occurrences never overlap.
        else:
            merged.append(symbols[i])
            i += 1
    return tuple(merged)


 def bpe_merge(sentence, num_merges):
    """Learn byte-pair-encoding merges from a whitespace-separated sentence.

    Each word starts as a tuple of its characters followed by the
    end-of-word marker ``'</w>'``. On every round the most frequent
    adjacent symbol pair is fused into one symbol in every word.

    Args:
        sentence: Text whose whitespace-separated words form the corpus.
        num_merges: Maximum number of merge rounds to perform.

    Returns:
        A ``Counter`` mapping each word's symbol tuple (after merging) to
        the number of times that word occurs in ``sentence``.
    """
    # Initial vocabulary: characters plus a word-boundary marker.
    vocab = Counter(tuple(word) + ('</w>',) for word in sentence.split())

    for _ in range(num_merges):
        # Count every adjacent pair occurrence, weighted by word frequency.
        # Walking the tuples in order (rather than through a set) keeps the
        # insertion order of `pairs`, and therefore tie-breaking, the same
        # on every run.
        pairs = Counter()
        for symbols, freq in vocab.items():
            for pair in zip(symbols, symbols[1:]):
                pairs[pair] += freq

        if not pairs:
            # Every word is already a single symbol; nothing left to merge.
            break

        # max() returns the first maximal key, so ties go to the pair that
        # was encountered first.
        best_pair = max(pairs, key=pairs.get)

        # Merge on whole symbols. Replacing text in a space-joined string
        # can match the tail of a longer symbol (e.g. 'b c' inside 'ab c')
        # and fuse symbols that are not the chosen pair.
        new_vocab = Counter()
        for symbols, freq in vocab.items():
            new_vocab[_merge_pair(symbols, best_pair)] += freq
        vocab = new_vocab
    return vocab

 # Demo: run up to ten merge rounds over a tiny four-word corpus.
 sentence, num_merges = "low lower newest widest", 10
 result_vocab = bpe_merge(sentence=sentence, num_merges=num_merges)
 print(result_vocab)

 # The printed value is the vocabulary returned by bpe_merge after the merges.
	from collections import Counter, defaultdict

	def get_pairs(word):
	    """Return the set of adjacent symbol pairs in a word.

	    Args:
	        word: A sliceable sequence of symbols (a string or a tuple of
	            strings).

	    Returns:
	        A set of ``(left, right)`` tuples, one per distinct pair of
	        neighbouring symbols. The set is empty when ``word`` has fewer
	        than two symbols.
	    """
	    # zip() stops at the shorter argument, so an empty or one-symbol word
	    # yields no pairs instead of raising IndexError on word[0].
	    return set(zip(word, word[1:]))

	def _merge_pair(symbols, pair):
	    """Return symbols as a tuple with each adjacent occurrence of pair fused."""
	    merged = []
	    i = 0
	    last = len(symbols) - 1
	    while i <= last:
	        if i < last and (symbols[i], symbols[i + 1]) == pair:
	            merged.append(symbols[i] + symbols[i + 1])
	            i += 2  # Skip both halves so occurrences never overlap.
	        else:
	            merged.append(symbols[i])
	            i += 1
	    return tuple(merged)


	def bpe_merge(sentence, num_merges):
	    """Learn byte-pair-encoding merges from a whitespace-separated sentence.

	    Each word starts as a tuple of its characters followed by the
	    end-of-word marker ``'</w>'``. On every round the most frequent
	    adjacent symbol pair is fused into one symbol in every word.

	    Args:
	        sentence: Text whose whitespace-separated words form the corpus.
	        num_merges: Maximum number of merge rounds to perform.

	    Returns:
	        A ``Counter`` mapping each word's symbol tuple (after merging) to
	        the number of times that word occurs in ``sentence``.
	    """
	    # Initial vocabulary: characters plus a word-boundary marker.
	    vocab = Counter(tuple(word) + ('</w>',) for word in sentence.split())

	    for _ in range(num_merges):
	        # Count every adjacent pair occurrence, weighted by word frequency.
	        # Walking the tuples in order (rather than through a set) keeps the
	        # insertion order of `pairs`, and therefore tie-breaking, the same
	        # on every run.
	        pairs = Counter()
	        for symbols, freq in vocab.items():
	            for pair in zip(symbols, symbols[1:]):
	                pairs[pair] += freq

	        if not pairs:
	            # Every word is already a single symbol; nothing left to merge.
	            break

	        # max() returns the first maximal key, so ties go to the pair that
	        # was encountered first.
	        best_pair = max(pairs, key=pairs.get)

	        # Merge on whole symbols. Replacing text in a space-joined string
	        # can match the tail of a longer symbol (e.g. 'b c' inside 'ab c')
	        # and fuse symbols that are not the chosen pair.
	        new_vocab = Counter()
	        for symbols, freq in vocab.items():
	            new_vocab[_merge_pair(symbols, best_pair)] += freq
	        vocab = new_vocab
	    return vocab

	# Demo: run up to ten merge rounds over a tiny four-word corpus.
	sentence, num_merges = "low lower newest widest", 10
	result_vocab = bpe_merge(sentence=sentence, num_merges=num_merges)
	print(result_vocab)

	# The printed value is the vocabulary returned by bpe_merge after the merges.
No results found