Created
April 17, 2025 01:10
-
-
Save DraconicDragon/10ac26d0d11ea9b14a0edae5d728bc96 to your computer and use it in GitHub Desktop.
Script that downloads the JSON vocabulary files and merges.txt required for text tokenization from openai/clip-vit-large-patch14 (via the transformers library), then tokenizes the given input and prints the number of tokens along with the tokens themselves.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import CLIPTokenizer

# Download (on first run, then cached) and load the CLIP ViT-L/14 tokenizer
# files (vocab.json, merges.txt, tokenizer config) from the Hugging Face Hub.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# Text to tokenize, read interactively from the user.
text = input("Input text:\n")

# Encode without return_tensors: plain Python lists are sufficient here and
# avoid a hard dependency on PyTorch just to count ids. The id list includes
# the special start/end tokens the tokenizer adds around the input.
input_ids = tokenizer(text)["input_ids"]

print(f"Number of tokens: {len(input_ids)}")

# convert_ids_to_tokens accepts the whole id list in one call — no need to
# invoke it once per id with a one-element list as the original did.
tokens_str = tokenizer.convert_ids_to_tokens(input_ids)

print("Tokens:")
for i, tok in enumerate(tokens_str):
    print(f"{i}: {tok}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment