@ddh0
Created November 25, 2025 03:09
Train a tokenizer given one or more Parquet files. Includes pre-defined special tokens.
# train_tokenizer2.py
# Python 3.12.3

"""Train a `PreTrainedTokenizerFast` given parquet files and a vocabulary size."""

import sys
import argparse
import tempfile
import shutil
import os

from transformers import PreTrainedTokenizerFast
from logger import Log, ANSI
from tokenizers import (
    models,
    trainers,
    Tokenizer,
    pre_tokenizers,
    decoders
)
from utils import parquet_to_txt

DEFAULT_N_VOCAB = 1024
DEFAULT_OUTPUT_DIR = "./tokenizer-output"
MAX_TOKEN_LENGTH = 64

log = Log(stdout=sys.stdout, stderr=sys.stderr)
special_tokens: dict[str, int] = {
    "<|PAD|>"        : 0,
    "<|BOS|>"        : 1,
    "<|EOS|>"        : 2,
    "<|SEP|>"        : 3,
    "<|UNK|>"        : 4,
    "<|MASK|>"       : 5,
    "<|SYSTEM|>"     : 6,
    "<|/SYSTEM|>"    : 7,
    "<|USER|>"       : 8,
    "<|/USER|>"      : 9,
    "<|ASSISTANT|>"  : 10,
    "<|/ASSISTANT|>" : 11,
    "<|THINK|>"      : 12,
    "<|/THINK|>"     : 13,
    "<|SPECIAL_14|>" : 14,
    "<|SPECIAL_15|>" : 15,
    "<|SPECIAL_16|>" : 16,
    "<|SPECIAL_17|>" : 17,
    "<|SPECIAL_18|>" : 18,
    "<|SPECIAL_19|>" : 19
}
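
# NOTE: the integer IDs above are never passed to the trainer directly; trainers.BpeTrainer
# assigns IDs 0..N-1 to its `special_tokens` in the order given, so this mapping should hold
# because the dict preserves insertion order. An illustrative post-training sanity check:
#
#     for tok_str, expected_id in special_tokens.items():
#         assert _tokenizer.token_to_id(tok_str) == expected_id
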

def main(files: list[str], n_vocab: int, output_dir: str) -> int:
    _RETURN_CODE_SUCCESS = 0
    _RETURN_CODE_FAILURE = 1

    assert isinstance(files, list)
    assert all(isinstance(f, str) for f in files)
    assert isinstance(n_vocab, int)
    assert isinstance(output_dir, str)

    n_special_tokens = len(special_tokens)

    log.info(f'main start')
    log.info(f'-- files: {files!r}')
    log.info(f'-- n_vocab: {n_vocab}')
    log.info(f'-- output_dir: {output_dir!r}')
    log.info(f'-- n_special_tokens: {n_special_tokens}')

    for f_path in files:
        if not os.path.isfile(f_path):
            log.error(f"input file not found: {f_path}")
            return _RETURN_CODE_FAILURE

    # remove output directory if it exists
    if os.path.exists(output_dir):
        if os.path.isdir(output_dir) and os.listdir(output_dir):
            log.warn(f"output directory exists and is not empty - the contents will be erased!")
        else:
            log.info(f"output directory exists but is empty, so no worries")
        shutil.rmtree(output_dir)

    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as tmp_file:
        temp_txt_file_path = tmp_file.name
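
    # NOTE: delete=False keeps the temporary file on disk after the `with` block closes its
    # handle, so parquet_to_txt() can fill it below and the trainer can read it back; the
    # file is removed manually once training is finished.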

    # initialize a new tokenizer
    _tokenizer = Tokenizer(models.BPE())
    _tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    _tokenizer.decoder = decoders.ByteLevel()

    tok_trainer = trainers.BpeTrainer(
        vocab_size=n_vocab,
        special_tokens=list(special_tokens.keys()),
        max_token_length=MAX_TOKEN_LENGTH
    )
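    # NOTE: vocab_size here is the total target size, including the special tokens above.
    # Some byte-level BPE setups also pass initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
    # to the trainer to guarantee every byte symbol a vocab entry; whether that matters here
    # depends on how well the training data covers the byte alphabet.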

    _tokenizer.model = models.BPE()

    # train the tokenizer using the training data files
    log.info(f"converting input parquet files to a single temporary plaintext file ...")
    for file in files:
        log.info(f"processing {file} ...")
        parquet_to_txt(file, temp_txt_file_path, text_column=None)
    log.info(f"done.")

    log.info('training tokenizer ...')
    _tokenizer.train([temp_txt_file_path], trainer=tok_trainer)
    log.info('done.')

    special_token_params = {
        "pad_token": "<|PAD|>",
        "bos_token": "<|BOS|>",
        "eos_token": "<|EOS|>",
        "sep_token": "<|SEP|>",
        "unk_token": "<|UNK|>",
        "mask_token": "<|MASK|>",
        "additional_special_tokens": [
            "<|SYSTEM|>", "<|/SYSTEM|>",
            "<|USER|>", "<|/USER|>",
            "<|ASSISTANT|>", "<|/ASSISTANT|>",
            "<|THINK|>", "<|/THINK|>",
            "<|SPECIAL_14|>", "<|SPECIAL_15|>", "<|SPECIAL_16|>",
            "<|SPECIAL_17|>", "<|SPECIAL_18|>", "<|SPECIAL_19|>"
        ]
    }

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=_tokenizer,
        model_max_length=2**30,  # over one billion tokens, i.e. effectively unlimited
        **special_token_params
    )
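    # NOTE: the kwargs above register the special tokens with the transformers-side API, so
    # e.g. tokenizer.pad_token_id should resolve to 0 and tokenizer.eos_token_id to 2,
    # matching the `special_tokens` mapping at the top of this file.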

    for tok_str, tok_id in sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]):
        special = tok_id in tokenizer.all_special_ids
        log.info(f'-- token ID {tok_id:06}: {ANSI.FG_BRIGHT_YELLOW if special else ""}{tok_str!r}{ANSI.MODE_RESET_ALL}')

    log.info(f'saving trained tokenizer to {output_dir} ...')
    tokenizer.save_pretrained(output_dir)
    os.remove(temp_txt_file_path)
    log.info(f'all done.')
    return _RETURN_CODE_SUCCESS

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train a HuggingFace tokenizer given parquet files")
    parser.add_argument(
        "-n", "--n-vocab",
        help=f"The desired vocabulary size (default: {DEFAULT_N_VOCAB})",
        default=DEFAULT_N_VOCAB,
    )
    parser.add_argument(
        "-o", "--output-dir",
        help=f"Destination directory for the newly trained tokenizer (default: {DEFAULT_OUTPUT_DIR})",
        default=DEFAULT_OUTPUT_DIR
    )
    parser.add_argument(
        "--file",
        help="A training data file in Apache Parquet format. May be specified multiple times.",
        required=True,
        action='append'
    )
    args = parser.parse_args()
    sys.exit(main(files=args.file, n_vocab=int(args.n_vocab), output_dir=args.output_dir))
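
# Example usage (illustrative file names; adjust the paths to your own data):
#
#     python train_tokenizer2.py --file data/part-00000.parquet --file data/part-00001.parquet \
#         --n-vocab 32000 --output-dir ./tokenizer-output
#
# The saved directory can then be loaded back with transformers, for example:
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("./tokenizer-output")
#     print(tok.encode("<|BOS|>hello world<|EOS|>"))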