Created November 25, 2025 03:09
Train a tokenizer given one or more Parquet files. Includes pre-defined special tokens.
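Example invocation (the file paths here are hypothetical): python train_tokenizer2.py --file data/train.parquet --file data/extra.parquet -n 16384 -o ./tokenizer-output. The --file flag is required and may be repeated; -n/--n-vocab defaults to 1024 and -o/--output-dir defaults to ./tokenizer-output.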
# train_tokenizer2.py
# Python 3.12.3

"""Train a `PreTrainedTokenizerFast` given parquet files and a vocabulary size."""

import sys
import argparse
import tempfile
import shutil
import os

from transformers import PreTrainedTokenizerFast
from logger import Log, ANSI
from tokenizers import (
    models,
    trainers,
    Tokenizer,
    pre_tokenizers,
    decoders
)
from utils import parquet_to_txt

DEFAULT_N_VOCAB = 1024
DEFAULT_OUTPUT_DIR = "./tokenizer-output"
MAX_TOKEN_LENGTH = 64

log = Log(stdout=sys.stdout, stderr=sys.stderr)
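
# fixed special-token vocabulary: IDs 0-5 are the standard PAD/BOS/EOS/SEP/UNK/MASK
# tokens, IDs 6-13 mark chat roles and thinking spans, and IDs 14-19 are unassigned
# placeholder slots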
special_tokens: dict[str, int] = {
    "<|PAD|>" : 0,
    "<|BOS|>" : 1,
    "<|EOS|>" : 2,
    "<|SEP|>" : 3,
    "<|UNK|>" : 4,
    "<|MASK|>" : 5,
    "<|SYSTEM|>" : 6,
    "<|/SYSTEM|>" : 7,
    "<|USER|>" : 8,
    "<|/USER|>" : 9,
    "<|ASSISTANT|>" : 10,
    "<|/ASSISTANT|>" : 11,
    "<|THINK|>" : 12,
    "<|/THINK|>" : 13,
    "<|SPECIAL_14|>" : 14,
    "<|SPECIAL_15|>" : 15,
    "<|SPECIAL_16|>" : 16,
    "<|SPECIAL_17|>" : 17,
    "<|SPECIAL_18|>" : 18,
    "<|SPECIAL_19|>" : 19
}

def main(files: list[str], n_vocab: int, output_dir: str) -> int:
    _RETURN_CODE_SUCCESS = 0
    _RETURN_CODE_FAILURE = 1

    assert isinstance(files, list)
    assert all(isinstance(f, str) for f in files)
    assert isinstance(n_vocab, int)
    assert isinstance(output_dir, str)

    n_special_tokens = len(special_tokens)

    log.info('main start')
    log.info(f'-- files: {files!r}')
    log.info(f'-- n_vocab: {n_vocab}')
    log.info(f'-- output_dir: {output_dir!r}')
    log.info(f'-- n_special_tokens: {n_special_tokens}')

    for f_path in files:
        if not os.path.isfile(f_path):
            log.error(f"input file not found: {f_path}")
            return _RETURN_CODE_FAILURE

    # remove output directory if it exists
    if os.path.exists(output_dir):
        if os.path.isdir(output_dir) and os.listdir(output_dir):
            log.warn("output directory exists and is not empty - the contents will be erased!")
        else:
            log.info("output directory exists but is empty, so no worries")
        shutil.rmtree(output_dir)

    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as tmp_file:
        temp_txt_file_path = tmp_file.name

    # initialize a new tokenizer
    _tokenizer = Tokenizer(models.BPE())
    _tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    _tokenizer.decoder = decoders.ByteLevel()

    tok_trainer = trainers.BpeTrainer(
        vocab_size=n_vocab,
        special_tokens=list(special_tokens.keys()),
        max_token_length=MAX_TOKEN_LENGTH
    )
    _tokenizer.model = models.BPE()

    # train the tokenizer using the training data files
    log.info("converting input parquet files to a single temporary plaintext file ...")
    for file in files:
        log.info(f"processing {file} ...")
        parquet_to_txt(file, temp_txt_file_path, text_column=None)
    log.info("done.")

    log.info('training tokenizer ...')
    _tokenizer.train([temp_txt_file_path], trainer=tok_trainer)
    log.info('done.')

    special_token_params = {
        "pad_token": "<|PAD|>",
        "bos_token": "<|BOS|>",
        "eos_token": "<|EOS|>",
        "sep_token": "<|SEP|>",
        "unk_token": "<|UNK|>",
        "mask_token": "<|MASK|>",
        "additional_special_tokens": [
            "<|SYSTEM|>", "<|/SYSTEM|>",
            "<|USER|>", "<|/USER|>",
            "<|ASSISTANT|>", "<|/ASSISTANT|>",
            "<|THINK|>", "<|/THINK|>",
            "<|SPECIAL_14|>", "<|SPECIAL_15|>", "<|SPECIAL_16|>",
            "<|SPECIAL_17|>", "<|SPECIAL_18|>", "<|SPECIAL_19|>"
        ]
    }

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=_tokenizer,
        model_max_length=2**30,  # over one billion tokens, i.e. effectively unlimited
        **special_token_params
    )

    for tok_str, tok_id in sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]):
        special = tok_id in tokenizer.all_special_ids
        log.info(f'-- token ID {tok_id:06}: {ANSI.FG_BRIGHT_YELLOW if special else ""}{tok_str!r}{ANSI.MODE_RESET_ALL}')

    log.info(f'saving trained tokenizer to {output_dir} ...')
    tokenizer.save_pretrained(output_dir)

    os.remove(temp_txt_file_path)

    log.info('all done.')
    return _RETURN_CODE_SUCCESS


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train a HuggingFace tokenizer given parquet files")
    parser.add_argument(
        "-n", "--n-vocab",
        help=f"The desired vocabulary size (default: {DEFAULT_N_VOCAB})",
        default=DEFAULT_N_VOCAB,
    )
    parser.add_argument(
        "-o", "--output-dir",
        help=f"Destination directory for the newly trained tokenizer (default: {DEFAULT_OUTPUT_DIR})",
        default=DEFAULT_OUTPUT_DIR
    )
    parser.add_argument(
        "--file",
        help="A training data file in Apache Parquet format. May be specified multiple times.",
        required=True,
        action='append'
    )
    args = parser.parse_args()
    sys.exit(main(files=args.file, n_vocab=int(args.n_vocab), output_dir=args.output_dir))
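
Once training finishes, the saved tokenizer loads back through the standard transformers API. A minimal sketch, assuming the default ./tokenizer-output directory and a made-up chat string:

# load and inspect the trained tokenizer (illustrative sketch)
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("./tokenizer-output")

# the special tokens registered at training time are preserved on load
print(tok.pad_token, tok.bos_token, tok.eos_token)
print(tok.additional_special_tokens)

# special tokens encode to their reserved low IDs; ordinary text uses the learned BPE merges
text = "<|USER|>hello there<|/USER|>"
ids = tok.encode(text)
print(ids)
print(tok.decode(ids))  # should reproduce the input, special tokens included

Note that the training script does not attach a post-processor, so encoding will not automatically insert <|BOS|> or <|EOS|>; those markers have to be added by whatever builds the final training or chat sequences.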