Demo AWQ Mapping vs Targets
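A small standalone script that checks what AWQModifier does with a layer that appears in the AWQ smoothing mappings but is excluded from the quantization targets: layer 10 of TinyLlama is covered by the default Llama mappings, while the config_groups target regexes only match layers 0-9. The script snapshots weights before and after oneshot to see whether layer 10 ends up smoothed, quantized, both, or neither.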
| """ | |
| Test to verify what happens when layers are in mappings but not in targets for AWQ. | |
| Adapted from https://github.com/vllm-project/llm-compressor/tree/main/examples/awq | |
| """ | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.awq import AWQModifier | |
| # Select model and load it - using TinyLlama for faster testing | |
| MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" | |
| model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto") | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) | |
# Save original weights for comparison
print("=" * 80)
print("Saving original weights for comparison...")
print("=" * 80)

# Let's check layer 0 (will be in targets) and layer 10 (will NOT be in targets)
layer0_qproj_weight = model.model.layers[0].self_attn.q_proj.weight.data.clone()
layer0_input_ln_weight = model.model.layers[0].input_layernorm.weight.data.clone()
layer10_qproj_weight = model.model.layers[10].self_attn.q_proj.weight.data.clone()
layer10_input_ln_weight = model.model.layers[10].input_layernorm.weight.data.clone()

print(f"Layer 0 q_proj weight norm: {layer0_qproj_weight.norm():.6f}")
print(f"Layer 0 input_layernorm weight norm: {layer0_input_ln_weight.norm():.6f}")
print(f"Layer 10 q_proj weight norm: {layer10_qproj_weight.norm():.6f}")
print(f"Layer 10 input_layernorm weight norm: {layer10_input_ln_weight.norm():.6f}")
print()
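# Note (added for clarity): AWQ smoothing, as implemented in llm-compressor, folds a
# per-channel scale into each mapped pair - roughly dividing the "smooth" module
# (e.g. input_layernorm) by the scale and multiplying the projections it feeds
# (e.g. q_proj) by the same scale - so tracking both tensors lets us detect smoothing
# even on a layer that is never quantized.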
# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Using fewer samples for faster testing
NUM_CALIBRATION_SAMPLES = 16
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)
# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )
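# The upstream AWQ example tokenizes the calibration set before calling oneshot; the
# same step is added here so the tokenize() helper defined above is actually used
# (assumption: oneshot expects the pre-tokenized dataset, as in that example).
ds = ds.map(tokenize, remove_columns=ds.column_names)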
# Configure the quantization algorithm to run.
# Key test:
#   - Mappings explicitly include BOTH layer 0 and layer 10
#   - Targets only include layers 0-9 (NOT layer 10)
print("=" * 80)
print("Configuring AWQ:")
print(" - Mappings: Explicitly set to include layers 0-10 (all layers)")
print(" - Targets: Only match layers 0-9 (layer 10 excluded from quantization)")
print(" - This tests: layer 10 in mappings but NOT in targets")
print("=" * 80)
print()

from llmcompressor.modifiers.awq.mappings import get_layer_mappings_from_architecture

# Get default mappings for LlamaForCausalLM (will cover all layers)
default_mappings = get_layer_mappings_from_architecture("LlamaForCausalLM")
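# For reference, the Llama-family defaults pair each "smooth" module with the
# projections it feeds, roughly: input_layernorm -> (q|k|v)_proj, v_proj -> o_proj,
# post_attention_layernorm -> (gate|up)_proj, and up_proj -> down_proj. (Exact
# regexes may differ between llm-compressor versions; treat this as a sketch.)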
recipe = [
    AWQModifier(
        mappings=default_mappings,  # This will smooth ALL layers including layer 10
        config_groups={
            "group_0": {
                # Only quantize Linear layers in layers 0-9, NOT layer 10.
                # The pattern matches q_proj, k_proj, v_proj, o_proj, gate_proj,
                # up_proj, down_proj in layers 0-9 only.
                "targets": [
                    "re:model.layers.[0-9].self_attn.(q|k|v|o)_proj$",
                    "re:model.layers.[0-9].mlp.(gate|up|down)_proj$",
                ],
                "weights": {
                    "num_bits": 4,
                    "type": "int",
                    "symmetric": False,
                    "strategy": "group",
                    "group_size": 128,
                },
            }
        },
        ignore=["lm_head"],  # Only ignore lm_head, not layer 10
    ),
]
# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Check results
print("\n\n")
print("=" * 80)
print("RESULTS: Checking if weights changed and quantization status")
print("=" * 80)

# Check if weights changed
layer0_qproj_changed = not torch.allclose(
    model.model.layers[0].self_attn.q_proj.weight.data.cpu(),
    layer0_qproj_weight.cpu(),
    rtol=1e-5,
)
layer0_ln_changed = not torch.allclose(
    model.model.layers[0].input_layernorm.weight.data.cpu(),
    layer0_input_ln_weight.cpu(),
    rtol=1e-5,
)
layer10_qproj_changed = not torch.allclose(
    model.model.layers[10].self_attn.q_proj.weight.data.cpu(),
    layer10_qproj_weight.cpu(),
    rtol=1e-5,
)
layer10_ln_changed = not torch.allclose(
    model.model.layers[10].input_layernorm.weight.data.cpu(),
    layer10_input_ln_weight.cpu(),
    rtol=1e-5,
)
| print("\nWeight changes:") | |
| print(f" Layer 0 q_proj changed: {layer0_qproj_changed}") | |
| print(f" Layer 0 input_layernorm changed: {layer0_ln_changed}") | |
| print(f" Layer 10 q_proj changed: {layer10_qproj_changed}") | |
| print(f" Layer 10 input_layernorm changed: {layer10_ln_changed}") | |
| # Check quantization status | |
| layer0_has_scale = hasattr(model.model.layers[0].self_attn.q_proj, "weight_scale") | |
| layer10_has_scale = hasattr(model.model.layers[10].self_attn.q_proj, "weight_scale") | |
| print("\nQuantization status:") | |
| print(f" Layer 0 q_proj has weight_scale (quantized): {layer0_has_scale}") | |
| print(f" Layer 10 q_proj has weight_scale (quantized): {layer10_has_scale}") | |
| print("\n" + "=" * 80) | |
| print("INTERPRETATION") | |
| print("=" * 80) | |
| if layer10_ln_changed and layer10_qproj_changed: | |
| print("\n✓ CONFIRMED: Layer 10 (in mappings but not in targets) WAS SMOOTHED") | |
| print(" - Both input_layernorm and q_proj weights were modified") | |
| if layer10_has_scale: | |
| print(" ✗ Layer 10 was ALSO quantized (unexpected!)") | |
| else: | |
| print(" ✓ Layer 10 was NOT quantized (as expected)") | |
| else: | |
| print("\n✗ Layer 10 was NOT smoothed (unexpected)") | |
| print(f" - input_layernorm changed: {layer10_ln_changed}") | |
| print(f" - q_proj changed: {layer10_qproj_changed}") | |
| if layer0_has_scale: | |
| print("\n✓ Layer 0 (in targets) was quantized as expected") | |
| else: | |
| print("\n✗ Layer 0 was NOT quantized (unexpected!)") | |
| print("\n" + "=" * 80) |