- skip SDPA (scaled dot-product attention)
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
    "scale_format": "const",
    "allowlist": {
        "types": [],
        "names": []
    }
}
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                 router_weights,
                 permuted_weights=True,
                 activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if not enable_moe_chunk:
#!/bin/bash
#
# https://docs.docker.com/build/buildkit/
# https://github.com/docker/buildx/releases/
# https://github.com/docker/buildx
## docker builder prune --all
## docker buildx du --verbose
## For Ubuntu 24.04 try: sudo apt install docker-buildx
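## Hypothetical example build for these notes (image tag and platform list
## are assumptions, not from the original):
## docker buildx build --platform linux/amd64,linux/arm64 -t myimage:dev --push .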
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                 router_weights,
                 permuted_weights=True,
                 activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if enable_moe_chunk:
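
Both patch variants gate the MoE forward on an enable_moe_chunk flag read from the wrapped module; the chunked path presumably splits the token dimension and runs the fused op per chunk. A toy sketch of that idea (the helper and chunk size are hypothetical, not the INC implementation):

import torch

def moe_forward_chunked(moe_op, hidden_states: torch.Tensor, chunk_size: int = 512) -> torch.Tensor:
    # Hypothetical helper: run the fused MoE op on bounded-size token chunks
    # and stitch the results back together along the token dimension.
    chunks = torch.split(hidden_states, chunk_size, dim=0)
    return torch.cat([moe_op(c) for c in chunks], dim=0)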
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
else
    model_path="$1"
fi
tp_size=1
model_name=$(basename "${model_path}")
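# The original script is truncated here. A plausible continuation that uses
# these variables to launch a vLLM server (command and flags are assumptions):
vllm serve "${model_path}" --tensor-parallel-size "${tp_size}" --served-model-name "${model_name}"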
import ctypes
import time

import torch


def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer

    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result: int) -> str:
        # nvrtcGetErrorString(nvrtcResult) returns the message as a const char*.
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode()
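    # --- The original snippet is truncated above. What follows is a hedged
    # sketch of the standard NVRTC flow (the program name and the empty
    # option list are assumptions, not recovered code). ---
    prog = c_void_p()
    status = libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None)
    if status != 0:
        raise RuntimeError(f"nvrtcCreateProgram failed: {get_error_string(status)}")
    status = libnvrtc.nvrtcCompileProgram(prog, 0, None)
    if status != 0:
        raise RuntimeError(f"nvrtcCompileProgram failed: {get_error_string(status)}")
    ptx_size = c_size_t()
    libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size))
    ptx = create_string_buffer(ptx_size.value)
    libnvrtc.nvrtcGetPTX(prog, ptx)
    return ptx.value.decode()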
Run 1:
| Auto-configed device: cuda | |
| WARNING:sglang.srt.server_args:Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel. | |
| WARNING:sglang.srt.server_args:TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from None to 64. | |
| [2025-09-06 08:26:09] server_args=ServerArgs(model_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=8400, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.93, max_running_requests=None, max_queued_requests=9223372036854775807, max_total_tokens=None, chunked_prefill_size=16384, max_p |
"names": []| from triton.testing import do_bench | |
| import torch | |
| from test_packing import _create_random_e2m1_tensor, pack_fp4_to_uint8_old | |
| from auto_round.export.export_to_autoround.qlinear_fp import FLOAT_TO_E2M1, pack_fp4_to_uint8 | |
| from dataclasses import dataclass | |
| from typing import List, Dict | |
| import json | |
| @dataclass | |
| class MoeOpInfo: | |
| num_inputs: int = 0 | |
| num_outputs: int = 0 |
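
The imports point at a micro-benchmark of the old versus new FP4 packing routines; a plausible harness is sketched below (the tensor shape and the helper's signature are assumptions, not recovered code):

if __name__ == "__main__":
    # Shape and helper signature are assumptions for illustration only.
    e2m1 = _create_random_e2m1_tensor((4096, 4096)).cuda()
    ms_old = do_bench(lambda: pack_fp4_to_uint8_old(e2m1))
    ms_new = do_bench(lambda: pack_fp4_to_uint8(e2m1))
    print(f"pack_fp4_to_uint8_old: {ms_old:.3f} ms, pack_fp4_to_uint8: {ms_new:.3f} ms")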