From 245fc3c37da5ac5963f9f11a9f4f2ac08d96afc6 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 9 Jun 2023 10:39:59 +0300 Subject: [PATCH] metal : faster q4_0 (#1775) * metal : 8% faster q4_0 Avoid copying into local uchar4 and float4. * metal : 17% faster Q4_0 Use 64 threads in a thread group. --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 2 +- ggml-metal.metal | 34 +++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index ac4f1346c..54cbaf860 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -526,7 +526,7 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne12 == 1); nth0 = 8; - nth1 = 4; + nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; } break; case GGML_TYPE_Q2_K: diff --git a/ggml-metal.metal b/ggml-metal.metal index 43814ed09..8e730eb9c 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -267,6 +267,8 @@ kernel void kernel_mul_mat_q4_0_f32( uint2 tptg[[threads_per_threadgroup]]) { const int nb = ne00/QK4_0; + const int8_t m8 = 8; + const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; @@ -276,33 +278,34 @@ kernel void kernel_mul_mat_q4_0_f32( const uint nth = tptg.x*tptg.y; const uint ith = tptg.y*tpitg.x + tpitg.y; - sum[ith] = 0.0f; + const int ix = tpitg.y/4; // 0 or 1 + const int iy = tpitg.y - 4*ix; // 0...3 - for (int i = tpitg.x; i < nb; i += tptg.x) { - device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs; - device const float4 * y0p = (device const float4 *) (y + i*QK4_0); + const int first = 4 * iy; - const float d = (float)((x + i)->d); + float sumf = 0; - const uchar4 x0v = *(x0p + tpitg.y); - const float4 y0v = *(y0p + tpitg.y + 0); - const float4 y1v = *(y0p + tpitg.y + 4); + for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - float acc = 0.0f; + const float d = (float)x[i].d; + + device const uint8_t * xl = x[i].qs + first; + device const float * yl = y + i * QK4_0 + 
first; + + float2 acc = {0.0f, 0.0f}; for (int j = 0; j < 4; ++j) { - const int x0 = x0v[j] & 0x0F; - const int x1 = x0v[j] >> 4; - const float y0 = y0v[j]; - const float y1 = y1v[j]; + acc[0] += yl[j+ 0] * ((int8_t)(xl[j] & 0xF) - m8); + acc[1] += yl[j+16] * ((int8_t)(xl[j] >> 4) - m8); - acc += (x0 - 8)*y0 + (x1 - 8)*y1; } - sum[ith] += acc*d; + sumf += d * (acc[0] + acc[1]); } + sum[ith] = sumf; + // // Accumulate the sum from all threads in the threadgroup // This version is slightly faster than the commented out one below, @@ -357,6 +360,7 @@ kernel void kernel_mul_mat_f16_f32( uint3 tpig[[thread_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]]) { + const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; const int64_t im = tgpig.z;