- skip SDPA (scaled dot-product attention)
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
    "scale_format": "const",
    "allowlist": {
        "types": [],
        "names": []
    }
}
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                 router_weights,
                 permuted_weights=True,
                 activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if not enable_moe_chunk:
#!/bin/bash
#
# https://docs.docker.com/build/buildkit/
# https://github.com/docker/buildx/releases/
# https://github.com/docker/buildx
## docker builder prune --all
## docker buildx du --verbose
## For Ubuntu 24.04 try: sudo apt install docker-buildx
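## Hypothetical example build for these notes (image tag and platform list
## are assumptions, not from the original):
## docker buildx build --platform linux/amd64,linux/arm64 -t myimage:dev --push .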
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                 router_weights,
                 permuted_weights=True,
                 activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if enable_moe_chunk:
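
Both patch variants gate the MoE forward on an enable_moe_chunk flag read from the wrapped module; the chunked path presumably splits the token dimension and runs the fused op per chunk. A toy sketch of that idea (the helper and chunk size are hypothetical, not the INC implementation):

import torch

def moe_forward_chunked(moe_op, hidden_states: torch.Tensor, chunk_size: int = 512) -> torch.Tensor:
    # Hypothetical helper: run the fused MoE op on bounded-size token chunks
    # and stitch the results back together along the token dimension.
    chunks = torch.split(hidden_states, chunk_size, dim=0)
    return torch.cat([moe_op(c) for c in chunks], dim=0)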
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
else
    model_path="$1"
fi
tp_size=1
model_name=$(basename "${model_path}")
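# The original script is truncated here. A plausible continuation that uses
# these variables to launch a vLLM server (command and flags are assumptions):
vllm serve "${model_path}" --tensor-parallel-size "${tp_size}" --served-model-name "${model_name}"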
import ctypes
import time

import torch


def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer

    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result: int) -> str:
        # nvrtcGetErrorString(nvrtcResult) returns the message as a const char*.
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode()
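    # --- The original snippet is truncated above. What follows is a hedged
    # sketch of the standard NVRTC flow (the program name and the empty
    # option list are assumptions, not recovered code). ---
    prog = c_void_p()
    status = libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None)
    if status != 0:
        raise RuntimeError(f"nvrtcCreateProgram failed: {get_error_string(status)}")
    status = libnvrtc.nvrtcCompileProgram(prog, 0, None)
    if status != 0:
        raise RuntimeError(f"nvrtcCompileProgram failed: {get_error_string(status)}")
    ptx_size = c_size_t()
    libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size))
    ptx = create_string_buffer(ptx_size.value)
    libnvrtc.nvrtcGetPTX(prog, ptx)
    return ptx.value.decode()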
Run 1:
| Auto-configed device: cuda | |
| WARNING:sglang.srt.server_args:Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel. | |
| WARNING:sglang.srt.server_args:TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from None to 64. | |
| [2025-09-06 08:26:09] server_args=ServerArgs(model_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=8400, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.93, max_running_requests=None, max_queued_requests=9223372036854775807, max_total_tokens=None, chunked_prefill_size=16384, max_p |
"names": []| from triton.testing import do_bench | |
| import torch | |
| from test_packing import _create_random_e2m1_tensor, pack_fp4_to_uint8_old | |
| from auto_round.export.export_to_autoround.qlinear_fp import FLOAT_TO_E2M1, pack_fp4_to_uint8 | |
| from dataclasses import dataclass | |
| from typing import List, Dict | |
| import json | |
| @dataclass | |
| class MoeOpInfo: | |
| num_inputs: int = 0 | |
| num_outputs: int = 0 |
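
The imports point at a micro-benchmark of the old versus new FP4 packing routines; a plausible harness is sketched below (the tensor shape and the helper's signature are assumptions, not recovered code):

if __name__ == "__main__":
    # Shape and helper signature are assumptions for illustration only.
    e2m1 = _create_random_e2m1_tensor((4096, 4096)).cuda()
    ms_old = do_bench(lambda: pack_fp4_to_uint8_old(e2m1))
    ms_new = do_bench(lambda: pack_fp4_to_uint8(e2m1))
    print(f"pack_fp4_to_uint8_old: {ms_old:.3f} ms, pack_fp4_to_uint8: {ms_new:.3f} ms")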