
@lunzima
Created February 18, 2025 12:14
llama-quant.cpp dynamic quantization mod
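
This patch modifies llama_tensor_get_type() in llama.cpp's src/llama-quant.cpp to change the per-tensor quantization choices for DeepSeek-V3-style MoE models (the *_shexp shared-expert tensors and the attn_kv_a_mqa / attn_kv_b / attn_q_a / attn_q_b MLA projections). For the low-bit ftypes (IQ1_S/IQ1_M, the IQ2 family, and Q2_K), dense-layer and shared-expert FFN weights are held at Q4_K/Q5_K, the MLA projections at Q4_K to Q6_K, and attn_output.weight is pinned at Q4_K, while the routed-expert ffn_down tensors stay at 2-3 bit types. A standalone sketch of the name-matching pattern, and a note on applying the patch, follow the diff.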
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..cfa73700 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -187,6 +187,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+ bool is_one_bit = (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S);
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -195,19 +196,76 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
+ else if (name.find("ffn_down.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ new_type = GGML_TYPE_Q5_K;
+ ++qs.i_ffn_down;
+ }
+ else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+ // Shared experts
+ new_type = GGML_TYPE_Q5_K;
+ ++qs.i_ffn_down;
+ }
else if (name.find("ffn_down") != std::string::npos) {
- if (qs.i_ffn_down < qs.n_ffn_down/8) {
- new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ // Layers 0, 1, 2 are dense and were already handled above (Q5_K)
+ // only the routed-expert tensors (layer 3 onwards) reach this point
+ if (is_one_bit) {
+ // 3, 4, 5, 6, 7, 8 left as 2.06 bpw
+ if (i_layer < 9) new_type = GGML_TYPE_IQ2_XXS; // 2.06 bpw
+ }
+ else {
+ if (i_layer < 6) new_type = GGML_TYPE_Q3_K;
+ else new_type = GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
+ else if (name.find("ffn_gate.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_gate;
+ }
+ else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+ // Shared experts
+ new_type = GGML_TYPE_Q5_K;
+ ++qs.i_ffn_gate;
+ }
+ else if (name.find("ffn_up.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_up;
+ }
+ else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+ // Shared experts
+ new_type = GGML_TYPE_Q5_K;
+ ++qs.i_ffn_up;
+ }
+ else if (name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+ // MLA projection matrices for KV
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if (name.find("attn_kv_b.weight") != std::string::npos) {
+ // MLA projection matrices for KV
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if (name.find("attn_q_a.weight") != std::string::npos) {
+ // MLA projection matrices for Q
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (name.find("attn_q_b.weight") != std::string::npos) {
+ // MLA projection matrices for Q
+ new_type = GGML_TYPE_Q4_K;
+ }
else if (name.find("attn_output.weight") != std::string::npos) {
- if (qs.model.hparams.n_expert == 8) {
- new_type = GGML_TYPE_Q5_K;
- } else {
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
- }
+ // Keep at 4-bit (the original n_expert/ftype logic is preserved below, commented out)
+ new_type = GGML_TYPE_Q4_K;
+ // if (qs.model.hparams.n_expert == 8) {
+ // new_type = GGML_TYPE_Q5_K;
+ // } else {
+ // if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+ // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+ // }
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -266,10 +324,33 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ ++qs.i_ffn_down;
+ } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+ // Shared experts
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ ++qs.i_ffn_down;
} else if (name.find("ffn_down") != std::string::npos) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+ // Layers 0, 1, 2 are dense and were already handled above (Q5_K/Q6_K)
+ // expert layers 3, 4, 5 get Q3_K, later layers drop to Q2_K
+ if (i_layer < 6) new_type = GGML_TYPE_Q3_K;
+ else new_type = GGML_TYPE_Q2_K;
+ }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
@@ -311,6 +392,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_ffn_down;
+ } else if (name.find("attn_output.weight") != std::string::npos) {
+ // Keep at 4-bit; note this shadows the original attn_output branch below, which becomes unreachable
+ new_type = GGML_TYPE_Q4_K;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
@@ -337,8 +421,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
- }
- else if (name.find("ffn_gate") != std::string::npos) {
+ } else if (name.find("ffn_gate.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_gate;
+ } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+ // Shared experts
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_gate;
+ } else if (name.find("ffn_gate") != std::string::npos) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
@@ -346,6 +437,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
}
++qs.i_ffn_gate;
}
+ else if (name.find("ffn_up.weight") != std::string::npos) {
+ // Dense layers (the first 3 layers in DeepSeek-V3/R1)
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_up;
+ }
+ else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+ // Shared experts
+ new_type = GGML_TYPE_Q4_K;
+ ++qs.i_ffn_up;
+ }
else if (name.find("ffn_up") != std::string::npos) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
@@ -354,6 +455,32 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
}
++qs.i_ffn_up;
}
+ else if (name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+ // MLA projection matrices for KV
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (name.find("attn_kv_b.weight") != std::string::npos) {
+ // MLA projection matrices for KV
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (name.find("attn_q_a.weight") != std::string::npos) {
+ // MLA projection matrices for Q
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (name.find("attn_q_b.weight") != std::string::npos) {
+ // MLA projection matrices for Q
+ new_type = GGML_TYPE_Q4_K;
+ }
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
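
For reference, below is a minimal self-contained sketch of the dispatch pattern the patch relies on. The enum, rule table, and pick_type() helper are invented for illustration and are not llama.cpp code; the point is that the first matching substring wins, so specific names such as ffn_down_shexp.weight and ffn_down.weight must be tested before the bare ffn_down catch-all that ends up handling the routed-expert tensors.

// Minimal standalone sketch (illustration only, not llama.cpp code) of the
// name-substring dispatch used by the patch. The enum and rule table are
// invented stand-ins for ggml_type and the if/else chain.
#include <cstdio>
#include <string>
#include <vector>

enum sketch_type { T_Q2_K, T_Q3_K, T_Q4_K, T_Q5_K, T_IQ2_XXS };

struct rule {
    const char * substr;  // substring looked up in the tensor name
    sketch_type  type;    // quant type forced when it matches
};

// Order matters, exactly as in the patch: "ffn_down_shexp.weight" and
// "ffn_down.weight" are tested before the bare "ffn_down" catch-all,
// which is what ends up handling the routed experts (ffn_down_exps.weight).
static sketch_type pick_type(const std::string & name, sketch_type base) {
    static const std::vector<rule> rules = {
        { "ffn_down_shexp.weight", T_Q5_K },  // shared experts
        { "ffn_down.weight",       T_Q5_K },  // dense layers
        { "ffn_down",              T_Q2_K },  // routed experts (catch-all)
        { "attn_kv_a_mqa.weight",  T_Q5_K },  // MLA KV projection
        { "attn_output.weight",    T_Q4_K },
    };
    for (const auto & r : rules) {
        if (name.find(r.substr) != std::string::npos) {
            return r.type;  // first match wins
        }
    }
    return base;  // no override: keep the ftype's default choice
}

int main() {
    // Dense layer: the specific "ffn_down.weight" rule fires, not the catch-all.
    std::printf("%d\n", pick_type("blk.0.ffn_down.weight", T_IQ2_XXS));       // 3 == T_Q5_K
    // Routed experts: only the bare "ffn_down" catch-all matches.
    std::printf("%d\n", pick_type("blk.7.ffn_down_exps.weight", T_IQ2_XXS));  // 0 == T_Q2_K
    // No rule matches: the base type is kept.
    std::printf("%d\n", pick_type("blk.7.ffn_norm.weight", T_IQ2_XXS));       // 4 == T_IQ2_XXS
    return 0;
}

To try the patch itself, save the diff to a file, apply it inside a llama.cpp checkout with git apply (it is written against llama-quant.cpp at index fb798265), rebuild, and quantize as usual; the quantization binary is called llama-quantize in recent llama.cpp builds, but the name has changed across versions, so check your own build.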