@HDCharles · Created November 20, 2025 19:32

import json
import torch
from tokenizers import Tokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
)
# source_model = "meta-llama/Llama-3.2-1B"
# output_path = "./scrap/tinysmokellama-3.2"
source_model = "Qwen/Qwen3-30B-A3B"
output_path = "./tinysmokeqwen3moe"
quantized_output_path = "./tinysmokeqwen3moe-W4A16-first-only"
vocab_keep_items = 1024
##### Tokenizer ######
# Reduce the vocabulary size while keeping the special tokens
num_added_tokens_to_keep = 11
tokenizer = AutoTokenizer.from_pretrained(
    source_model, use_fast=True, model_max_length=2048
)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
assert tokenizer_json["model"]["type"] == "BPE"
new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
merges = tokenizer_json["model"]["merges"]
new_merges = []
for i in range(len(merges)):
    # each merge entry is an (a, b) pair of tokens that get joined into one token
    a, b = merges[i]
    new_token = "".join((a, b))
    # keep a merge only if both halves and the merged token survive the vocab cut
    if a in new_vocab and b in new_vocab and new_token in new_vocab:
        new_merges.append(merges[i])
tokenizer_json["model"]["merges"] = new_merges
tokenizer_json["model"]["vocab"] = new_vocab
new_added_tokens = []
for i in range(num_added_tokens_to_keep):
    added_token = tokenizer_json["added_tokens"][i]
    # re-id the special tokens so they sit directly after the trimmed vocab
    added_token["id"] = vocab_keep_items + i
    new_added_tokens.append(added_token)
tokenizer_json["added_tokens"] = new_added_tokens
added_map = {token["content"]: token["id"] for token in new_added_tokens}
# Update post_processor special tokens if they exist
# Different tokenizers have different structures
if "post_processor" in tokenizer_json and tokenizer_json["post_processor"] is not None:
post_proc = tokenizer_json["post_processor"]
if "processors" in post_proc:
# Llama-style structure
if len(post_proc["processors"]) > 0 and "special_tokens" in post_proc["processors"][-1]:
if "<|begin_of_text|>" in post_proc["processors"][-1]["special_tokens"]:
post_proc["processors"][-1]["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]
elif "special_tokens" in post_proc:
# Alternative structure (e.g., Qwen)
if "<|begin_of_text|>" in post_proc["special_tokens"]:
post_proc["special_tokens"]["<|begin_of_text|>"]["ids"] = [vocab_keep_items]
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
# tokenizer = AutoTokenizer.from_pretrained(source_model)
tokenizer.save_pretrained(output_path)
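# Optional sanity check (not part of the original gist): reload the trimmed tokenizer
# and confirm a short string still encodes within the reduced id range and decodes back.
# The test string below is illustrative only.
_check_tok = AutoTokenizer.from_pretrained(output_path)
_check_ids = _check_tok("hello world").input_ids
assert all(i < vocab_keep_items + num_added_tokens_to_keep for i in _check_ids)
print("round trip:", _check_tok.decode(_check_ids))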
##### Model #####
# Shrink the weights and copy slices from the real source model, so the weight distribution matches
weight_source_llama = AutoModelForCausalLM.from_pretrained(source_model)
weight_source_llama_dict = dict(weight_source_llama.named_parameters())
new_config = Qwen3MoeConfig(
    vocab_size=vocab_keep_items + num_added_tokens_to_keep,
    hidden_size=128,
    num_attention_heads=2,
    num_hidden_layers=6,
    num_key_value_heads=4,
    intermediate_size=128,
    moe_intermediate_size=128,
    shared_expert_intermediate_size=128,
    num_experts_per_tok=2,
    num_experts=8,
    tie_word_embeddings=True,
)
def rec_setattr(obj, key, value):
    # Set a dotted attribute path, e.g. "model.layers.0.self_attn.q_proj.weight"
    if "." in key:
        attr, rem_key = key.split(".", 1)
        rec_setattr(getattr(obj, attr), rem_key, value)
    else:
        setattr(obj, key, value)
new_model = Qwen3MoeForCausalLM(new_config)
for w_name, w_value in list(new_model.named_parameters()):
    if w_name == "lm_head.weight":
        # skipped: with tie_word_embeddings=True it shares model.embed_tokens.weight
        continue
    elif w_name not in weight_source_llama_dict:
        raise ValueError(f"Couldn't find weight ref {w_name}")
    w = weight_source_llama_dict[w_name]
    # take the leading slice of the full-size weight along every dimension
    slices = tuple(slice(0, n) for n in w_value.shape)
    if any(x < y for x, y in zip(w.shape, w_value.shape)):
        raise RuntimeError(f"Can't slice to size {w_name}")
    sliced_weight = w[slices].detach().clone()
    rec_setattr(new_model, w_name, torch.nn.Parameter(sliced_weight))
new_model.save_pretrained(output_path)
# Tie lm head to embed weights (already handled by tie_word_embeddings=True in the config)
# new_model.lm_head.weight = new_model.model.embed_tokens.weight
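# Optional sanity check (not part of the original gist): report the parameter count
# to confirm the shrunken model really is smoke-test sized before quantizing.
n_params = sum(p.numel() for p in new_model.parameters())
print(f"tiny model parameter count: {n_params / 1e6:.2f}M")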
###### APPLY QUANTIZATION ###########
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
recipe = QuantizationModifier(
    targets=["Linear"],
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate[.].*",
        # skip every decoder layer except layer 0, so only the first layer gets quantized
        "re:.*model\\.layers\\.([1-9])\\..*",
        "re:.*model\\.layers\\.([1-9][0-9])\\..*",
    ],
)
oneshot(
    model=new_model,
    recipe=recipe,
)
###### SAVE ######
new_model.save_pretrained(quantized_output_path, save_compressed=True)
tokenizer.save_pretrained(quantized_output_path)
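# Optional check (not part of the original gist): the compressed checkpoint's config.json
# should now carry a quantization_config entry describing the compressed-tensors scheme;
# this just prints its top-level keys rather than assuming any particular layout.
import os
with open(os.path.join(quantized_output_path, "config.json")) as f:
    saved_config = json.load(f)
print("quantization_config keys:", list(saved_config.get("quantization_config", {}).keys()))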