From bebf0da983632f47c8dff5cae5a578992c31ff26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Nov 2023 16:18:24 +0200 Subject: [PATCH] quantize : add support for K-quant types --- bindings/ios | 2 +- examples/common-ggml.cpp | 39 ++++++++++++++++----------------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/bindings/ios b/bindings/ios index b5a163d..c9d5095 160000 --- a/bindings/ios +++ b/bindings/ios @@ -1 +1 @@ -Subproject commit b5a163decd5290a99806957905639c4456de97f5 +Subproject commit c9d5095f0c64455b201f1cd0b547efcf093ee7c3 diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 33ae03a..e69bd51 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = { {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, + {"q2_k", GGML_FTYPE_MOSTLY_Q2_K}, + {"q3_k", GGML_FTYPE_MOSTLY_Q3_K}, + {"q4_k", GGML_FTYPE_MOSTLY_Q4_K}, + {"q5_k", GGML_FTYPE_MOSTLY_Q5_K}, + {"q6_k", GGML_FTYPE_MOSTLY_Q6_K}, }; void ggml_print_ftypes(FILE * fp) { @@ -48,15 +53,15 @@ bool ggml_common_quantize_0( case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break; + case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break; + case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break; + case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break; + case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_UNKNOWN: case GGML_FTYPE_ALL_F32: case GGML_FTYPE_MOSTLY_F16: case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: { fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); return 
false; @@ -167,24 +172,17 @@ bool ggml_common_quantize_0( switch ((ggml_type) ttype) { case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data()); } break; case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -192,11 +190,6 @@ bool ggml_common_quantize_0( case GGML_TYPE_I16: case GGML_TYPE_I32: case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: case GGML_TYPE_COUNT: {