Yi Liu (yiliu30)
🌐 Working on site
  • AI Frameworks Engineer @intel
  • SH
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                router_weights,
                permuted_weights=True,
                activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if not enable_moe_chunk:
yiliu30 / install-buildkit.sh
Created November 12, 2025 01:00 — forked from jniltinho/install-buildkit.sh
Enable Docker BuildKit on Linux distributions (Debian, Ubuntu, Fedora)
#!/bin/bash
#
# https://docs.docker.com/build/buildkit/
# https://github.com/docker/buildx/releases/
# https://github.com/docker/buildx
## docker builder prune --all
## docker buildx du --verbose
## For Ubuntu 24.04 try: sudo apt install docker-buildx
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                router_weights,
                permuted_weights=True,
                activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if enable_moe_chunk:
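The added lines in these two diffs gate the fused-MoE call on an optional enable_moe_chunk attribute of the wrapped module, falling back to the unchunked path when the flag is absent or false. A standalone sketch of that chunk-along-the-token-dimension pattern, with hypothetical names and chunk size (not the neural-compressor implementation):

import torch

def moe_forward_maybe_chunked(expert_fn, hidden_states, orig_mod, chunk_size=4096):
    # Opt-in flag lives on the wrapped (original) module, mirroring the diff above.
    enable_moe_chunk = getattr(orig_mod, "enable_moe_chunk", False)
    if not enable_moe_chunk:
        return expert_fn(hidden_states)
    # Process the token batch in chunks to bound peak activation memory.
    chunks = torch.split(hidden_states, chunk_size, dim=0)
    return torch.cat([expert_fn(chunk) for chunk in chunks], dim=0)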
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
else
    model_path="$1"
fi
tp_size=1
model_name=$(basename ${model_path})
import ctypes
import torch
import time
def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result) -> str:
        # nvrtcGetErrorString takes the result code and returns a const char* message.
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode()
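The gist preview stops here. For reference, a minimal self-contained sketch of how the rest of an NVRTC compile via ctypes typically proceeds (create the program, compile it, fetch the PTX); this is an assumed continuation, not the remainder of the gist:

from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer

def nvrtc_compile_sketch(source: str, arch: str = "sm_80") -> str:
    libnvrtc = CDLL("libnvrtc.so")
    prog = c_void_p()
    # nvrtcCreateProgram(&prog, src, name, numHeaders, headers, includeNames)
    ret = libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None)
    assert ret == 0, f"nvrtcCreateProgram failed: {ret}"
    opts = (c_char_p * 1)(f"--gpu-architecture={arch}".encode())
    ret = libnvrtc.nvrtcCompileProgram(prog, 1, opts)
    assert ret == 0, f"nvrtcCompileProgram failed: {ret}"
    ptx_size = c_size_t()
    libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size))
    ptx = create_string_buffer(ptx_size.value)
    libnvrtc.nvrtcGetPTX(prog, ptx)
    libnvrtc.nvrtcDestroyProgram(byref(prog))
    return ptx.value.decode()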
Run 1:
Auto-configed device: cuda
WARNING:sglang.srt.server_args:Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel.
WARNING:sglang.srt.server_args:TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from None to 64.
[2025-09-06 08:26:09] server_args=ServerArgs(model_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=8400, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.93, max_running_requests=None, max_queued_requests=9223372036854775807, max_total_tokens=None, chunked_prefill_size=16384, max_p
  1. skip sdpa
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
    "scale_format": "const",
    "allowlist": {
        "types": [],
        "names": []
from triton.testing import do_bench
import torch
from test_packing import _create_random_e2m1_tensor, pack_fp4_to_uint8_old
from auto_round.export.export_to_autoround.qlinear_fp import FLOAT_TO_E2M1, pack_fp4_to_uint8
"""
------------------------------------------------------------------------------
out shape: torch.Size([4096, 7168])
out shape: torch.Size([4096, 7168])
out shape: torch.Size([8192, 7168])
out shape: torch.Size([8192, 7168])
out shape: torch.Size([16384, 7168])
out shape: torch.Size([16384, 7168])
out shape: torch.Size([4096, 7168])
out shape: torch.Size([4096, 7168])
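The snippet above benchmarks two E2M1 (FP4) packing implementations from auto_round with triton's do_bench. For context, a minimal sketch of what such packing does, i.e. map each value to its nearest 4-bit E2M1 code and store two codes per uint8 byte; this is an illustrative version, not the auto_round kernel being measured:

import torch

# The 16 representable E2M1 (FP4) values; the index is the 4-bit code.
E2M1_VALUES = torch.tensor(
    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]
)

def pack_e2m1_to_uint8_sketch(x: torch.Tensor) -> torch.Tensor:
    assert x.shape[-1] % 2 == 0, "need an even last dimension to pack pairs"
    # Nearest-code lookup (naive; fast kernels use sign/exponent bit tricks instead).
    codes = (x.unsqueeze(-1) - E2M1_VALUES.to(x.device)).abs().argmin(dim=-1).to(torch.uint8)
    # Two 4-bit codes per byte: even positions in the low nibble, odd in the high nibble.
    lo, hi = codes[..., 0::2], codes[..., 1::2]
    return lo | (hi << 4)

Packing two values per byte halves the last dimension of the output relative to the input.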
from dataclasses import dataclass
from typing import List, Dict
import json
@dataclass
class MoeOpInfo:
    num_inputs: int = 0
    num_outputs: int = 0
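The preview cuts the class off after its first two fields; the json import suggests the records are serialized. A small self-contained usage sketch under that assumption, using only the two fields shown (field values are illustrative):

from dataclasses import dataclass, asdict
import json

@dataclass
class MoeOpInfo:
    num_inputs: int = 0
    num_outputs: int = 0

info = MoeOpInfo(num_inputs=8, num_outputs=8)
print(json.dumps(asdict(info), indent=2))  # {"num_inputs": 8, "num_outputs": 8}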