MoE Benchmark Results
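Decode benchmarks for a Mixture-of-Experts checkpoint run through generate.py under several --moe_quant settings, comparing the fused grouped-matmul path (GROUPED_MM) against a decomposed per-expert path (--decompose_grouped_mm). Each run reports average tokens/sec, peak memory, and model size (GB); for batched runs, the "including batches" line is the per-position rate multiplied by the batch size, i.e. aggregate throughput.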
################## RESULTS

# ######### GROUPED_MM #######
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile
# Average tokens/sec: 24.15
# Average tokens/sec including batches: 193.18
# Memory used: 95.25 GB
# model size: 93.62 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile --compile_mode "max-autotune"
# Average tokens/sec: 23.97
# Average tokens/sec including batches: 191.80
# Memory used: 97.13 GB
# model size: 93.62 GB
# scaled_grouped_mm
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant fp8dq-base --compile
# Average tokens/sec: 39.45
# Average tokens/sec including batches: 315.63
# Memory used: 72.73 GB
# model size: 48.56 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant fp8dq-base --compile --compile_mode "max-autotune"
# Average tokens/sec: 42.83
# Average tokens/sec including batches: 342.68
# Memory used: 72.73 GB
# model size: 48.56 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant noquant
# Average tokens/sec: 34.19
# Memory used: 95.28 GB
# model size: 93.43 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant noquant --compile
# Average tokens/sec: 68.29
# Memory used: 95.28 GB
# model size: 93.43 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant noquant --compile --compile_mode "max-autotune"
# Average tokens/sec: 69.39
# Memory used: 97.15 GB
# model size: 93.43 GB
# scaled_grouped_mm
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8dq-base
# Average tokens/sec: 21.48
# Memory used: 72.77 GB
# model size: 48.37 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8dq-base --compile
# Average tokens/sec: 86.88
# Memory used: 72.76 GB
# model size: 48.37 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8dq-base --compile --compile_mode "max-autotune"
# Average tokens/sec: 100.74
# Memory used: 72.76 GB
# model size: 48.37 GB
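
# The GROUPED_MM runs above route all experts' tokens through one fused grouped
# matmul instead of a Python loop over experts. A minimal reference for what
# that op computes (names here are illustrative assumptions, not generate.py
# internals: tokens are pre-sorted by expert, offs holds each expert's end offset):

import torch

def grouped_mm_reference(tokens, weights, offs):
    """tokens: (T, D) sorted by expert; weights: (E, D, H), one matrix per
    expert; offs: (E,) cumulative end index of each expert's token slice.
    Loop equivalent of a fused grouped matmul."""
    outs, start = [], 0
    for e, end in enumerate(offs.tolist()):
        outs.append(tokens[start:end] @ weights[e])  # one standard GEMM per expert
        start = end
    return torch.cat(outs)  # (T, H), still expert-sorted

# Toy usage: 3 experts, 10 tokens routed 4/3/3.
out = grouped_mm_reference(torch.randn(10, 64), torch.randn(3, 64, 128),
                           torch.tensor([4, 7, 10]))  # -> (10, 128)

# Recent PyTorch builds ship prototype fused ops for this pattern (plus an fp8
# variant, which is what the scaled_grouped_mm label above marks); exact op
# names and signatures are version-dependent, so treat the loop above as the
# semantic reference only.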
# ######### SINGLE TOKEN #######
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --compile
# Average tokens/sec: 74.93
# Memory used: 97.80 GB
# model size: 93.43 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant noquant --compile --decompose_grouped_mm
# Average tokens/sec: 76.84
# Memory used: 95.28 GB
# model size: 93.43 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int8wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 112.89
# Memory used: 57.74 GB
# model size: 48.42 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int4wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 80.17
# Memory used: 40.86 GB
# model size: 27.20 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 5.49
# Memory used: 72.76 GB
# model size: 48.37 GB
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8dq-base --compile --decompose_grouped_mm
# Average tokens/sec: 45.55
# Memory used: 72.76 GB
# model size: 48.37 GB
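
# --decompose_grouped_mm swaps the fused op for ordinary per-expert matmuls,
# which is what lets the standard torchao weight-only configs (int8wo-base,
# int4wo-base, fp8wo-base) apply to each expert's weights. A hedged sketch on
# a toy expert, assuming a recent torchao release (config class names have
# moved around across versions):

import torch
from torchao.quantization import quantize_, Int8WeightOnlyConfig

# Toy stand-in for a single expert's feed-forward; the real model holds one
# such weight set per expert.
expert = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.GELU(),
    torch.nn.Linear(4096, 1024),
).to(torch.bfloat16)

quantize_(expert, Int8WeightOnlyConfig())  # int8 weight-only, as in int8wo-base
out = expert(torch.randn(8, 1024, dtype=torch.bfloat16))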
# ######### MULTI TOKEN #######
# noquant
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile --decompose_grouped_mm
# Average tokens/sec: 16.72
# Average tokens/sec including batches: 133.72
# Memory used: 95.25 GB
# model size: 93.62 GB
# int8wo-base
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 5.09
# Average tokens/sec including batches: 40.70
# Memory used: 57.71 GB
# model size: 48.61 GB
# int8dq-base
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8dq-base --compile --decompose_grouped_mm
# no results: needs balanced tokens per expert due to minimum matmul sizes
# int4wo-base
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int4wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 11.86
# Average tokens/sec including batches: 94.91
# Memory used: 40.83 GB
# model size: 27.39 GB
# fp8wo-base
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant fp8wo-base --compile --decompose_grouped_mm
# Average tokens/sec: 1.43
# Average tokens/sec including batches: 11.44
# Memory used: 72.73 GB
# model size: 48.56 GB
# fp8dq-base
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant fp8dq-base --compile --decompose_grouped_mm
# Average tokens/sec: 5.99
# Average tokens/sec including batches: 47.91
# Memory used: 72.74 GB
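
# The fp8dq runs dynamically quantize activations and weights to float8 at
# matmul time, which is why model size roughly halves versus noquant. A
# self-contained sketch of the per-tensor fp8 (e4m3) dynamic quantization step
# (illustrative names, not the torchao kernels; the real path feeds the
# quantized operands and their scales into a scaled grouped matmul):

import torch

def fp8_quantize(x):
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = x.abs().amax().clamp(min=1e-12) / finfo.max  # dynamic per-tensor scale
    xq = (x / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return xq, scale

x = torch.randn(16, 64)
xq, scale = fp8_quantize(x)
x_hat = xq.to(torch.float32) * scale    # dequantized reference
print((x - x_hat).abs().max())          # small error from the 3-bit mantissa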