@HDCharles · Created November 20, 2025 19:32

import json
import torch
from tokenizers import Tokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
)
# source_model = "meta-llama/Llama-3.2-1B"
# output_path = "./scrap/tinysmokellama-3.2"
source_model = "Qwen/Qwen3-30B-A3B"
output_path = "./tinysmokeqwen3moe"
quantized_output_path = "./tinysmokeqwen3moe-W4A16-first-only"
vocab_keep_items = 1024
##### Tokenizer ######
# Reduce the vocabulary size while keeping the special tokens
num_added_tokens_to_keep = 11
tokenizer = AutoTokenizer.from_pretrained(
    source_model, use_fast=True, model_max_length=2048
)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
assert tokenizer_json["model"]["type"] == "BPE"
new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
merges = tokenizer_json["model"]["merges"]
new_merges = []
for i in range(len(merges)):
    # each merge entry is an (a, b) pair of tokens that get joined into one token
    a, b = merges[i]
    new_token = "".join((a, b))
    # keep a merge only if both halves and the merged token survive the vocab cut
    if a in new_vocab and b in new_vocab and new_token in new_vocab:
        new_merges.append(merges[i])
tokenizer_json["model"]["merges"] = new_merges
tokenizer_json["model"]["vocab"] = new_vocab
new_added_tokens = []
for i in range(num_added_tokens_to_keep):
    added_token = tokenizer_json["added_tokens"][i]
    # re-id the special tokens so they sit directly after the trimmed vocab
    added_token["id"] = vocab_keep_items + i
    new_added_tokens.append(added_token)
tokenizer_json["added_tokens"] = new_added_tokens
added_map = {token["content"]: token["id"] for token in new_added_tokens}
# Update post_processor special tokens if they exist
# Different tokenizers have different structures
if "post_processor" in tokenizer_json and tokenizer_json["post_processor"] is not None:
post_proc = tokenizer_json["post_processor"]
if "processors" in post_proc:
# Llama-style structure
if len(post_proc["processors"]) > 0 and "special_tokens" in post_proc["processors"][-1]:
if "<|begin_of_text|>" in post_proc["processors"][-1]["special_tokens"]:
post_proc["processors"][-1]["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]
elif "special_tokens" in post_proc:
# Alternative structure (e.g., Qwen)
if "<|begin_of_text|>" in post_proc["special_tokens"]:
post_proc["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
# tokenizer = AutoTokenizer.from_pretrained(source_model)
tokenizer.save_pretrained(output_path)
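# Optional sanity check (not part of the original gist): reload the trimmed tokenizer
# and confirm a short string still encodes within the reduced id range and decodes back.
# The test string below is illustrative only.
_check_tok = AutoTokenizer.from_pretrained(output_path)
_check_ids = _check_tok("hello world").input_ids
assert all(i < vocab_keep_items + num_added_tokens_to_keep for i in _check_ids)
print("round trip:", _check_tok.decode(_check_ids))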
##### Model #####
# Shrink the weights and copy slices from the real source model, so the weight distribution matches
weight_source_llama = AutoModelForCausalLM.from_pretrained(source_model)
weight_source_llama_dict = dict(weight_source_llama.named_parameters())
new_config = Qwen3MoeConfig(
    vocab_size=vocab_keep_items + num_added_tokens_to_keep,
    hidden_size=128,
    num_attention_heads=2,
    num_hidden_layers=6,
    num_key_value_heads=4,
    intermediate_size=128,
    moe_intermediate_size=128,
    shared_expert_intermediate_size=128,
    num_experts_per_tok=2,
    num_experts=8,
    tie_word_embeddings=True,
)
def rec_setattr(obj, key, value):
    # Set a dotted attribute path, e.g. "model.layers.0.self_attn.q_proj.weight"
    if "." in key:
        attr, rem_key = key.split(".", 1)
        rec_setattr(getattr(obj, attr), rem_key, value)
    else:
        setattr(obj, key, value)
new_model = Qwen3MoeForCausalLM(new_config)
for w_name, w_value in list(new_model.named_parameters()):
    if w_name == "lm_head.weight":
        # skipped: with tie_word_embeddings=True it shares model.embed_tokens.weight
        continue
    elif w_name not in weight_source_llama_dict:
        raise ValueError(f"Couldn't find weight ref {w_name}")
    w = weight_source_llama_dict[w_name]
    # take the leading slice of the full-size weight along every dimension
    slices = tuple(slice(0, n) for n in w_value.shape)
    if any(x < y for x, y in zip(w.shape, w_value.shape)):
        raise RuntimeError(f"Can't slice to size {w_name}")
    sliced_weight = w[slices].detach().clone()
    rec_setattr(new_model, w_name, torch.nn.Parameter(sliced_weight))
new_model.save_pretrained(output_path)
# Tie lm head to embed weights (already handled by tie_word_embeddings=True in the config)
# new_model.lm_head.weight = new_model.model.embed_tokens.weight
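# Optional sanity check (not part of the original gist): report the parameter count
# to confirm the shrunken model really is smoke-test sized before quantizing.
n_params = sum(p.numel() for p in new_model.parameters())
print(f"tiny model parameter count: {n_params / 1e6:.2f}M")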
###### APPLY QUANTIZATION ###########
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
recipe = QuantizationModifier(
    targets=["Linear"],
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate[.].*",
        # skip every decoder layer except layer 0, so only the first layer gets quantized
        "re:.*model\\.layers\\.([1-9])\\..*",
        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
    ],
)
oneshot(
    model=new_model,
    recipe=recipe,
)
###### SAVE ######
new_model.save_pretrained(quantized_output_path, save_compressed=True)
tokenizer.save_pretrained(quantized_output_path)
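# Optional check (not part of the original gist): the compressed checkpoint's config.json
# should now carry a quantization_config entry describing the compressed-tensors scheme;
# this just prints its top-level keys rather than assuming any particular layout.
import os
with open(os.path.join(quantized_output_path, "config.json")) as f:
    saved_config = json.load(f)
print("quantization_config keys:", list(saved_config.get("quantization_config", {}).keys()))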