Demo AWQ Mapping vs Targets
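A small standalone script that checks what AWQModifier does with a layer that appears in the AWQ smoothing mappings but is excluded from the quantization targets: layer 10 of TinyLlama is covered by the default Llama mappings, while the config_groups target regexes only match layers 0-9. The script snapshots weights before and after oneshot to see whether layer 10 ends up smoothed, quantized, both, or neither.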
| """ | |
| Test to verify what happens when layers are in mappings but not in targets for AWQ. | |
| Adapted from https://github.com/vllm-project/llm-compressor/tree/main/examples/awq | |
| """ | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.awq import AWQModifier | |
| # Select model and load it - using TinyLlama for faster testing | |
| MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" | |
| model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto") | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) | |
# Save original weights for comparison
print("=" * 80)
print("Saving original weights for comparison...")
print("=" * 80)

# Let's check layer 0 (will be in targets) and layer 10 (will NOT be in targets)
layer0_qproj_weight = model.model.layers[0].self_attn.q_proj.weight.data.clone()
layer0_input_ln_weight = model.model.layers[0].input_layernorm.weight.data.clone()
layer10_qproj_weight = model.model.layers[10].self_attn.q_proj.weight.data.clone()
layer10_input_ln_weight = model.model.layers[10].input_layernorm.weight.data.clone()

print(f"Layer 0 q_proj weight norm: {layer0_qproj_weight.norm():.6f}")
print(f"Layer 0 input_layernorm weight norm: {layer0_input_ln_weight.norm():.6f}")
print(f"Layer 10 q_proj weight norm: {layer10_qproj_weight.norm():.6f}")
print(f"Layer 10 input_layernorm weight norm: {layer10_input_ln_weight.norm():.6f}")
print()
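# Note (added for clarity): AWQ smoothing, as implemented in llm-compressor, folds a
# per-channel scale into each mapped pair - roughly dividing the "smooth" module
# (e.g. input_layernorm) by the scale and multiplying the projections it feeds
# (e.g. q_proj) by the same scale - so tracking both tensors lets us detect smoothing
# even on a layer that is never quantized.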
# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Using fewer samples for faster testing
NUM_CALIBRATION_SAMPLES = 16
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)
# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )
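# The upstream AWQ example tokenizes the calibration set before calling oneshot; the
# same step is added here so the tokenize() helper defined above is actually used
# (assumption: oneshot expects the pre-tokenized dataset, as in that example).
ds = ds.map(tokenize, remove_columns=ds.column_names)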
# Configure the quantization algorithm to run.
# Key test:
#   - Mappings explicitly include BOTH layer 0 and layer 10
#   - Targets only include layers 0-9 (NOT layer 10)
print("=" * 80)
print("Configuring AWQ:")
print(" - Mappings: Explicitly set to include layers 0-10 (all layers)")
print(" - Targets: Only match layers 0-9 (layer 10 excluded from quantization)")
print(" - This tests: layer 10 in mappings but NOT in targets")
print("=" * 80)
print()

from llmcompressor.modifiers.awq.mappings import get_layer_mappings_from_architecture

# Get default mappings for LlamaForCausalLM (will cover all layers)
default_mappings = get_layer_mappings_from_architecture("LlamaForCausalLM")
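# For reference, the Llama-family defaults pair each "smooth" module with the
# projections it feeds, roughly: input_layernorm -> (q|k|v)_proj, v_proj -> o_proj,
# post_attention_layernorm -> (gate|up)_proj, and up_proj -> down_proj. (Exact
# regexes may differ between llm-compressor versions; treat this as a sketch.)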
recipe = [
    AWQModifier(
        mappings=default_mappings,  # This will smooth ALL layers including layer 10
        config_groups={
            "group_0": {
                # Only quantize Linear layers in layers 0-9, NOT layer 10.
                # The pattern matches q_proj, k_proj, v_proj, o_proj, gate_proj,
                # up_proj, down_proj in layers 0-9 only.
                "targets": [
                    "re:model.layers.[0-9].self_attn.(q|k|v|o)_proj$",
                    "re:model.layers.[0-9].mlp.(gate|up|down)_proj$",
                ],
                "weights": {
                    "num_bits": 4,
                    "type": "int",
                    "symmetric": False,
                    "strategy": "group",
                    "group_size": 128,
                },
            }
        },
        ignore=["lm_head"],  # Only ignore lm_head, not layer 10
    ),
]
# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Check results
print("\n\n")
print("=" * 80)
print("RESULTS: Checking if weights changed and quantization status")
print("=" * 80)

# Check if weights changed
layer0_qproj_changed = not torch.allclose(
    model.model.layers[0].self_attn.q_proj.weight.data.cpu(),
    layer0_qproj_weight.cpu(),
    rtol=1e-5,
)
layer0_ln_changed = not torch.allclose(
    model.model.layers[0].input_layernorm.weight.data.cpu(),
    layer0_input_ln_weight.cpu(),
    rtol=1e-5,
)
layer10_qproj_changed = not torch.allclose(
    model.model.layers[10].self_attn.q_proj.weight.data.cpu(),
    layer10_qproj_weight.cpu(),
    rtol=1e-5,
)
layer10_ln_changed = not torch.allclose(
    model.model.layers[10].input_layernorm.weight.data.cpu(),
    layer10_input_ln_weight.cpu(),
    rtol=1e-5,
)
| print("\nWeight changes:") | |
| print(f" Layer 0 q_proj changed: {layer0_qproj_changed}") | |
| print(f" Layer 0 input_layernorm changed: {layer0_ln_changed}") | |
| print(f" Layer 10 q_proj changed: {layer10_qproj_changed}") | |
| print(f" Layer 10 input_layernorm changed: {layer10_ln_changed}") | |
| # Check quantization status | |
| layer0_has_scale = hasattr(model.model.layers[0].self_attn.q_proj, "weight_scale") | |
| layer10_has_scale = hasattr(model.model.layers[10].self_attn.q_proj, "weight_scale") | |
| print("\nQuantization status:") | |
| print(f" Layer 0 q_proj has weight_scale (quantized): {layer0_has_scale}") | |
| print(f" Layer 10 q_proj has weight_scale (quantized): {layer10_has_scale}") | |
| print("\n" + "=" * 80) | |
| print("INTERPRETATION") | |
| print("=" * 80) | |
| if layer10_ln_changed and layer10_qproj_changed: | |
| print("\n✓ CONFIRMED: Layer 10 (in mappings but not in targets) WAS SMOOTHED") | |
| print(" - Both input_layernorm and q_proj weights were modified") | |
| if layer10_has_scale: | |
| print(" ✗ Layer 10 was ALSO quantized (unexpected!)") | |
| else: | |
| print(" ✓ Layer 10 was NOT quantized (as expected)") | |
| else: | |
| print("\n✗ Layer 10 was NOT smoothed (unexpected)") | |
| print(f" - input_layernorm changed: {layer10_ln_changed}") | |
| print(f" - q_proj changed: {layer10_qproj_changed}") | |
| if layer0_has_scale: | |
| print("\n✓ Layer 0 (in targets) was quantized as expected") | |
| else: | |
| print("\n✗ Layer 0 was NOT quantized (unexpected!)") | |
| print("\n" + "=" * 80) |