import json

import torch
from tokenizers import Tokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
)

# source_model = "meta-llama/Llama-3.2-1B"
# output_path = "./scrap/tinysmokellama-3.2"
source_model = "Qwen/Qwen3-30B-A3B"
output_path = "./tinysmokeqwen3moe"
quantized_output_path = "./tinysmokeqwen3moe-W4A16-first-only"
vocab_keep_items = 1024

##### Tokenizer ######
# Reduce vocabulary size, while maintaining special tokens
num_added_tokens_to_keep = 11
tokenizer = AutoTokenizer.from_pretrained(
    source_model, use_fast=True, model_max_length=2048
)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
assert tokenizer_json["model"]["type"] == "BPE"

# Keep only the first `vocab_keep_items` entries of the base vocabulary.
new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}

# Keep only merges whose two parts and merged result all survive in the reduced vocab.
merges = tokenizer_json["model"]["merges"]
new_merges = []
for merge in merges:
    a, b = merge
    new_token = "".join((a, b))
    if a in new_vocab and b in new_vocab and new_token in new_vocab:
        new_merges.append(merge)
tokenizer_json["model"]["merges"] = new_merges
tokenizer_json["model"]["vocab"] = new_vocab

# Re-id the kept special (added) tokens so they sit directly after the reduced vocab.
new_added_tokens = []
for i in range(num_added_tokens_to_keep):
    added_token = tokenizer_json["added_tokens"][i]
    added_token["id"] = vocab_keep_items + i
    new_added_tokens.append(added_token)
tokenizer_json["added_tokens"] = new_added_tokens
added_map = {token["content"]: token["id"] for token in new_added_tokens}
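
# Optional check (not part of the original gist): print the remapped special tokens;
# their new ids should start at `vocab_keep_items`.
print("remapped special tokens:", added_map)
assert min(added_map.values()) == vocab_keep_items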

# Update post_processor special tokens if they exist
# Different tokenizers have different structures
if "post_processor" in tokenizer_json and tokenizer_json["post_processor"] is not None:
    post_proc = tokenizer_json["post_processor"]
    if "processors" in post_proc:
        # Llama-style structure
        if len(post_proc["processors"]) > 0 and "special_tokens" in post_proc["processors"][-1]:
            if "<|begin_of_text|>" in post_proc["processors"][-1]["special_tokens"]:
                post_proc["processors"][-1]["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]
    elif "special_tokens" in post_proc:
        # Alternative structure (e.g., Qwen)
        if "<|begin_of_text|>" in post_proc["special_tokens"]:
            post_proc["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]

tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
# tokenizer = AutoTokenizer.from_pretrained(source_model)
tokenizer.save_pretrained(output_path)
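
# Optional sanity check (not part of the original gist): a minimal sketch verifying
# that the shrunken tokenizer still round-trips plain ASCII text. This assumes the
# lowest `vocab_keep_items` ids still cover the byte-level alphabet, which holds for
# GPT-2-style BPE vocabs where single-byte tokens occupy the lowest ids.
sample_ids = tokenizer("hello world")["input_ids"]
print("sample ids:", sample_ids, "->", tokenizer.decode(sample_ids))
assert max(sample_ids) < vocab_keep_items + num_added_tokens_to_keep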

##### Model #####
# Reduce weight sizes and copy weights from the real source model, so that the
# weight distribution of the tiny model roughly matches the original.
weight_source_llama = AutoModelForCausalLM.from_pretrained(source_model)
weight_source_llama_dict = dict(weight_source_llama.named_parameters())

new_config = Qwen3MoeConfig(
    vocab_size=vocab_keep_items + num_added_tokens_to_keep,
    hidden_size=128,
    num_attention_heads=4,  # must be a positive multiple of num_key_value_heads
    num_hidden_layers=6,
    num_key_value_heads=2,
    intermediate_size=128,
    moe_intermediate_size=128,
    shared_expert_intermediate_size=128,
    num_experts_per_tok=2,
    num_experts=8,
    tie_word_embeddings=True,
)


def rec_setattr(obj, key, value):
    """Set a dotted attribute path, e.g. 'model.embed_tokens.weight'."""
    if "." in key:
        attr, rem_key = key.split(".", 1)
        rec_setattr(getattr(obj, attr), rem_key, value)
    else:
        setattr(obj, key, value)


new_model = Qwen3MoeForCausalLM(new_config)
for w_name, w_value in list(new_model.named_parameters()):
    if w_name == "lm_head.weight":
        # lm_head is tied to the embedding weights (tie_word_embeddings=True),
        # so it is skipped here and re-tied when the saved model is loaded.
        continue
        # w_name = "model.embed_tokens.weight"
    elif w_name not in weight_source_llama_dict:
        raise ValueError(f"Couldn't find weight ref {w_name}")
    # Slice the top-left corner of the full-size weight down to the tiny shape.
    w = weight_source_llama_dict[w_name]
    slices = tuple(slice(0, n) for n in w_value.shape)
    if any(x < y for x, y in zip(w.shape, w_value.shape)):
        raise RuntimeError(f"Can't slice {w_name} to size {tuple(w_value.shape)}")
    sliced_weight = w[slices].detach().clone()
    rec_setattr(new_model, w_name, torch.nn.Parameter(sliced_weight))

new_model.save_pretrained(output_path)
# Tie lm head to embed weights
# new_model.lm_head.weight = new_model.model.embed_tokens.weight
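
# Optional smoke test (not part of the original gist): a minimal sketch running one
# forward pass through the tiny model to confirm the sliced weights and config
# produce logits of the expected shape before quantization.
with torch.no_grad():
    dummy_ids = torch.randint(0, new_config.vocab_size, (1, 8))
    logits = new_model(input_ids=dummy_ids).logits
assert logits.shape == (1, 8, new_config.vocab_size)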

###### APPLY QUANTIZATION ###########
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Quantize Linear layers to W4A16, ignoring the lm_head, the MoE router gates, and
# every decoder layer with index >= 1, so only layer 0 ends up quantized
# (hence the "-first-only" suffix on the output path).
recipe = QuantizationModifier(
    targets=["Linear"],
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate[.].*",
        "re:.*model\\.layers\\.([1-9])\\..*",
        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
    ],
)
oneshot(
    model=new_model,
    recipe=recipe,
)

###### SAVE ######
new_model.save_pretrained(quantized_output_path, save_compressed=True)
tokenizer.save_pretrained(quantized_output_path)
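
# Optional reload check (not part of the original gist): a minimal sketch reloading
# the compressed checkpoint. It assumes the `compressed-tensors` package (installed
# alongside llmcompressor) is available so transformers can handle the W4A16 weights.
reloaded_model = AutoModelForCausalLM.from_pretrained(quantized_output_path)
reloaded_tokenizer = AutoTokenizer.from_pretrained(quantized_output_path)
print("reloaded:", reloaded_model.config.model_type, reloaded_model.num_parameters())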