@DouglasOrr
Last active May 16, 2024 15:35
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UN8qSkbRRJjw",
"outputId": "dcda5fa4-6875-4ab8-eeae-a1f24d3d334d"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting tokenizers\n",
" Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: tokenizers\n",
"Successfully installed tokenizers-0.13.3\n"
]
}
],
"source": [
"%pip install tokenizers\n",
"import tokenizers\n",
"import string"
]
},
{
"cell_type": "code",
"source": [
"# Train\n",
"training_text = f\"Training 🤗 tokenizers. (Your ABC: {' '.join(string.printable)})\"\n",
"\n",
"tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(\n",
" end_of_word_suffix=\"<\"+\"/w>\",\n",
" byte_fallback=True\n",
"))\n",
"tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.WhitespaceSplit()\n",
"tokenizer.decoder = tokenizers.decoders.Sequence(\n",
" [\n",
" tokenizers.decoders.ByteFallback(),\n",
" tokenizers.decoders.BPEDecoder(suffix=tokenizer.model.end_of_word_suffix),\n",
" ]\n",
")\n",
"tokenizer.train_from_iterator(\n",
" [training_text],\n",
" tokenizers.trainers.BpeTrainer(\n",
" vocab_size=4096,\n",
" limit_alphabet=1024,\n",
" special_tokens=[f\"<0x{i:02X}>\" for i in range(256)],\n",
" end_of_word_suffix=tokenizer.model.end_of_word_suffix,\n",
" ),\n",
")\n",
"\n",
"# Test\n",
"test_text = \"Testing 🤗 tokenizers. Working 🤔?\"\n",
"print(\" Original:\", test_text)\n",
"encoding = tokenizer.encode(test_text)\n",
"print(\"\\n Tokens:\", encoding.tokens)\n",
"print(\"\\nReconstructed:\", tokenizer.decode(encoding.ids, skip_special_tokens=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pPyOdmSpRTeR",
"outputId": "e89f01dc-9092-48c7-e2d9-85577511355c"
},
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Original: Testing 🤗 tokenizers. Working 🤔?\n",
"\n",
" Tokens: ['T', 'e', 's', 't', 'ing</w>', '🤗</w>', 'tokenizers.</w>', 'W', 'o', 'r', 'k', 'ing</w>', '<0xF0>', '<0x9F>', '<0xA4>', '<0x94>', '?</w>']\n",
"\n",
"Reconstructed: Testing 🤗 tokenizers. Working 🤔?\n"
]
}
]
}
]
}
@DouglasOrr (Author)
Note: writing end_of_word_suffix="</w>" literally was not rendering properly on GitHub's notebook preview, which is why the source spells it as "<" + "/w>".
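
For readability, here is the training/test cell from the notebook above written out as plain Python (same calls as in the JSON source, with `end_of_word_suffix` written directly as `"</w>"`):

```python
import string
import tokenizers

# Train a byte-fallback BPE tokenizer on a tiny corpus.
training_text = f"Training 🤗 tokenizers. (Your ABC: {' '.join(string.printable)})"

tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(
    end_of_word_suffix="</w>",
    byte_fallback=True,
))
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.WhitespaceSplit()
tokenizer.decoder = tokenizers.decoders.Sequence([
    tokenizers.decoders.ByteFallback(),
    tokenizers.decoders.BPEDecoder(suffix=tokenizer.model.end_of_word_suffix),
])
tokenizer.train_from_iterator(
    [training_text],
    tokenizers.trainers.BpeTrainer(
        vocab_size=4096,
        limit_alphabet=1024,
        special_tokens=[f"<0x{i:02X}>" for i in range(256)],  # byte-fallback tokens
        end_of_word_suffix=tokenizer.model.end_of_word_suffix,
    ),
)

# Test: 🤔 does not appear in the training text, so it falls back to byte tokens.
test_text = "Testing 🤗 tokenizers. Working 🤔?"
encoding = tokenizer.encode(test_text)
print("Tokens:", encoding.tokens)
print("Reconstructed:", tokenizer.decode(encoding.ids, skip_special_tokens=False))
```

As the notebook output shows, `byte_fallback=True` together with the 256 `<0x..>` special tokens and the `ByteFallback` decoder lets characters unseen in training (here 🤔) be encoded as UTF-8 byte tokens, and the test string still round-trips exactly.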
