Created February 18, 2025 12:14
llama-quant.cpp dynamic quantization mod: per-tensor quantization-type overrides for the low-bit ftypes (IQ1/IQ2/Q2_K and friends), keeping the dense layers, shared experts, and MLA projection tensors of DeepSeek-style MoE models at higher precision.
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..cfa73700 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -187,6 +187,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        bool is_one_bit = (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S);
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -195,19 +196,76 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (name.find("ffn_down.weight") != std::string::npos) {
+            // first 3 layers (dense)
+            new_type = GGML_TYPE_Q5_K;
+            ++qs.i_ffn_down;
+        }
+        else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+            // shared experts
+            new_type = GGML_TYPE_Q5_K;
+            ++qs.i_ffn_down;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+            int i_layer = info.first, n_layer = info.second;
+            // dense layers 0, 1, 2 were handled above
+            if (is_one_bit) {
+                // layers 3..8 bumped to 2.06 bpw
+                if (i_layer < 9) new_type = GGML_TYPE_IQ2_XXS; // 2.06 bpw
+            }
+            else {
+                // layers 3, 4, 5 bumped to Q3_K, the rest left as Q2_K
+                if (i_layer < 6) new_type = GGML_TYPE_Q3_K;
+                else new_type = GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
+        else if (name.find("ffn_gate.weight") != std::string::npos) {
+            // first 3 layers (dense)
+            new_type = GGML_TYPE_Q4_K;
+            ++qs.i_ffn_gate;
+        }
+        else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+            // shared experts
+            new_type = GGML_TYPE_Q5_K;
+            ++qs.i_ffn_gate;
+        }
+        else if (name.find("ffn_up.weight") != std::string::npos) {
+            // first 3 layers (dense)
+            new_type = GGML_TYPE_Q4_K;
+            ++qs.i_ffn_up;
+        }
+        else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+            // shared experts
+            new_type = GGML_TYPE_Q5_K;
+            ++qs.i_ffn_up;
+        }
+        else if (name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+            // MLA projection matrices for KV
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (name.find("attn_kv_b.weight") != std::string::npos) {
+            // MLA projection matrices for KV
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (name.find("attn_q_a.weight") != std::string::npos) {
+            // MLA projection matrices for Q
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (name.find("attn_q_b.weight") != std::string::npos) {
+            // MLA projection matrices for Q
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
-                new_type = GGML_TYPE_Q5_K;
-            } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
-            }
+            // leave as 4-bit
+            new_type = GGML_TYPE_Q4_K;
+            // if (qs.model.hparams.n_expert == 8) {
+            //     new_type = GGML_TYPE_Q5_K;
+            // } else {
+            //     if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+            //     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+            // }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -266,10 +324,33 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        // first 3 layers (dense)
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        ++qs.i_ffn_down;
+    } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+        // shared experts
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        ++qs.i_ffn_down;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            // dense layers 0, 1, 2 were handled above
+            // layers 3, 4, 5 bumped to Q3_K, the rest left as Q2_K
+            if (i_layer < 6) new_type = GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q2_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
@@ -311,6 +392,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
         ++qs.i_ffn_down;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        // leave as 4-bit (shadows the original attn_output branch below)
+        new_type = GGML_TYPE_Q4_K;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
@@ -337,8 +421,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-    }
-    else if (name.find("ffn_gate") != std::string::npos) {
+    } else if (name.find("ffn_gate.weight") != std::string::npos) {
+        // first 3 layers (dense)
+        new_type = GGML_TYPE_Q4_K;
+        ++qs.i_ffn_gate;
+    } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+        // shared experts
+        new_type = GGML_TYPE_Q4_K;
+        ++qs.i_ffn_gate;
+    } else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
@@ -346,6 +437,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_ffn_gate;
     }
+    else if (name.find("ffn_up.weight") != std::string::npos) {
+        // first 3 layers (dense)
+        new_type = GGML_TYPE_Q4_K;
+        ++qs.i_ffn_up;
+    }
+    else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+        // shared experts
+        new_type = GGML_TYPE_Q4_K;
+        ++qs.i_ffn_up;
+    }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -354,6 +455,32 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_ffn_up;
     }
+    else if (name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+        // MLA projection matrices for KV
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    }
+    else if (name.find("attn_kv_b.weight") != std::string::npos) {
+        // MLA projection matrices for KV
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    }
+    else if (name.find("attn_q_a.weight") != std::string::npos) {
+        // MLA projection matrices for Q
+        new_type = GGML_TYPE_Q4_K;
+    }
+    else if (name.find("attn_q_b.weight") != std::string::npos) {
+        // MLA projection matrices for Q
+        new_type = GGML_TYPE_Q4_K;
+    }
     // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
     //}
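The whole patch follows one pattern: a first-match-wins chain of substring tests on the GGUF tensor name, so ordering carries the logic. The dense-layer form ("ffn_down.weight") and the shared-expert form ("ffn_down_shexp.weight") are tested before the generic "ffn_down", which therefore only catches the routed-expert tensors ("ffn_down_exps.weight"). The sketch below is a minimal standalone illustration of that dispatch, not the llama.cpp API; the rule table, the pick_type() helper, and the string type names are all made up for the example.

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the patch's if/else chain: the first rule whose
// pattern is a substring of the tensor name wins, so specific patterns
// ("ffn_down.weight", "ffn_down_shexp.weight") must precede generic ones.
static std::string pick_type(const std::string & name, const std::string & fallback) {
    static const std::vector<std::pair<std::string, std::string>> rules = {
        { "ffn_down.weight",       "Q5_K" }, // dense layers (expert tensors are named ffn_down_exps.weight)
        { "ffn_down_shexp.weight", "Q5_K" }, // shared experts
        { "ffn_down",              "Q2_K" }, // routed experts, caught by the generic pattern
        { "attn_kv_a_mqa.weight",  "Q5_K" }, // MLA KV projection
        { "attn_q_b.weight",       "Q4_K" }, // MLA Q projection
    };
    for (const auto & rule : rules) {
        if (name.find(rule.first) != std::string::npos) {
            return rule.second;
        }
    }
    return fallback; // no override: keep the ftype's default
}

int main() {
    const char * names[] = {
        "blk.0.ffn_down.weight",        // dense layer        -> Q5_K
        "blk.10.ffn_down_shexp.weight", // shared expert      -> Q5_K
        "blk.10.ffn_down_exps.weight",  // routed experts     -> Q2_K
        "blk.10.attn_kv_a_mqa.weight",  // MLA KV projection  -> Q5_K
        "blk.10.attn_norm.weight",      // no rule matches    -> fallback
    };
    for (const char * n : names) {
        std::printf("%-30s -> %s\n", n, pick_type(n, "IQ1_S").c_str());
    }
    return 0;
}
```

Because matching is positional, a specific pattern placed after a generic one becomes dead code. The patch itself has one such spot: the second attn_output.weight branch in the @@ -311,6 +392,9 @@ hunk is shadowed by the identical test added just above it, so the original Falcon/MoE handling there is no longer reachable. After rebuilding llama.cpp with the patch applied, a normal llama-quantize run picks these overrides up automatically for the affected low-bit ftypes.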