From 95e9d8a780cdf957ce2bf8e405de5f761fe46f2a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 Dec 2023 13:41:33 +0200 Subject: [PATCH] sync : ggml (part 3, Metal) --- ggml-metal.h | 2 + ggml-metal.m | 666 ++++++++++++++++++++++++++++++----------- ggml-metal.metal | 755 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 1099 insertions(+), 324 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index be2731f8b..14ab5029c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -98,6 +98,8 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); +GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); + GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); #ifdef __cplusplus diff --git a/ggml-metal.m b/ggml-metal.m index be4ab0f2e..214464456 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -62,6 +62,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + GGML_METAL_DECL_KERNEL(div); + GGML_METAL_DECL_KERNEL(div_row); GGML_METAL_DECL_KERNEL(scale); GGML_METAL_DECL_KERNEL(scale_4); GGML_METAL_DECL_KERNEL(silu); @@ -112,10 +114,24 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_DECL_KERNEL(rope_f32); GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(im2col_f16); + GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); + GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); @@ -126,6 +142,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(cpy_f16_f16); GGML_METAL_DECL_KERNEL(concat); GGML_METAL_DECL_KERNEL(sqr); + GGML_METAL_DECL_KERNEL(sum_rows); #undef GGML_METAL_DECL_KERNEL }; @@ -169,12 +186,10 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ } } - - struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); - id device; + id device; NSString * s; #if TARGET_OS_OSX @@ -250,6 +265,29 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { } } +#if TARGET_OS_OSX + // print MTL GPU family: + GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + break; + } + } + + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + if (ctx->device.maxTransferRate != 0) { + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); + } else { + GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + } +#endif + // load kernels { NSError * error = nil; @@ -271,6 +309,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul_row); + GGML_METAL_ADD_KERNEL(div); + GGML_METAL_ADD_KERNEL(div_row); GGML_METAL_ADD_KERNEL(scale); GGML_METAL_ADD_KERNEL(scale_4); GGML_METAL_ADD_KERNEL(silu); @@ -322,11 +362,25 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); } GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(im2col_f16); + GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); + GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); @@ -337,33 +391,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(cpy_f16_f16); GGML_METAL_ADD_KERNEL(concat); GGML_METAL_ADD_KERNEL(sqr); + GGML_METAL_ADD_KERNEL(sum_rows); #undef GGML_METAL_ADD_KERNEL } -#if TARGET_OS_OSX - // print MTL GPU family: - GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); - - // determine max supported GPU family - // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { - if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); - break; - } - } - - GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - if (ctx->device.maxTransferRate != 0) { - GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); - } else { - GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); - } -#endif - return ctx; } @@ -377,6 +409,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(add_row); GGML_METAL_DEL_KERNEL(mul); GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(div); + GGML_METAL_DEL_KERNEL(div_row); GGML_METAL_DEL_KERNEL(scale); GGML_METAL_DEL_KERNEL(scale_4); GGML_METAL_DEL_KERNEL(silu); @@ -428,11 +462,25 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); } GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(im2col_f16); + GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); + GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); @@ -443,6 +491,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(cpy_f16_f16); GGML_METAL_DEL_KERNEL(concat); GGML_METAL_DEL_KERNEL(sqr); + GGML_METAL_DEL_KERNEL(sum_rows); #undef GGML_METAL_DEL_KERNEL @@ -486,6 +535,13 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { return ctx->concur_list; } +// temporarily defined here for compatibility between ggml-backend and the old API +struct ggml_backend_metal_buffer_context { + void * data; + + id metal; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -495,8 +551,17 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru const int64_t tsize = ggml_nbytes(t); - if (t->buffer && t->buffer->backend && t->buffer->backend->context) { - ctx = t->buffer->backend->context; + // compatibility with ggml-backend + if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) { + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context; + + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + + GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + + *offs = (size_t) ioffs; + + return buf_ctx->metal; } // find the view that contains the tensor fully @@ -721,6 +786,52 @@ void ggml_metal_graph_find_concurrency( } } +static bool ggml_metal_supports_op(const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU: + return true; + default: + return false; + } + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_CONCAT: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SUM_ROWS: + case GGML_OP_SOFT_MAX: + case GGML_OP_RMS_NORM: + case GGML_OP_NORM: + case GGML_OP_ALIBI: + case GGML_OP_ROPE: + case GGML_OP_IM2COL: + case GGML_OP_ARGSORT: + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return true; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_GET_ROWS: + { + return op->ne[0] % 4 == 0; + } break; + default: + return false; + } +} void ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @@ -791,6 +902,8 @@ void ggml_metal_graph_compute( } break; } + GGML_ASSERT(ggml_metal_supports_op(dst)); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -883,6 +996,8 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: { GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); @@ -896,11 +1011,21 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne11 == 1); nb = ne00 / 4; - [encoder setComputePipelineState:ctx->pipeline_add_row]; + switch (dst->op) { + case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break; + case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break; + case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break; + default: GGML_ASSERT(false); + } bcast_row = true; } else { - [encoder setComputePipelineState:ctx->pipeline_add]; + switch (dst->op) { + case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break; + case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break; + case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break; + default: GGML_ASSERT(false); + } } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -941,31 +1066,6 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; - case GGML_OP_MUL: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - // utilize float4 - GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; - - if (ggml_nelements(src1) == ne10) { - // src1 is a row - GGML_ASSERT(ne11 == 1); - [encoder setComputePipelineState:ctx->pipeline_mul_row]; - } else { - [encoder setComputePipelineState:ctx->pipeline_mul]; - } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = ggml_nelements(dst)/4; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; case GGML_OP_SCALE: { GGML_ASSERT(ggml_is_contiguous(src0)); @@ -1038,6 +1138,40 @@ void ggml_metal_graph_compute( const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_SUM_ROWS: + { + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + + [encoder setComputePipelineState:ctx->pipeline_sum_rows]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_OP_SOFT_MAX: { int nth = 32; // SIMD width @@ -1092,13 +1226,17 @@ void ggml_metal_graph_compute( case GGML_OP_MUL_MAT: { GGML_ASSERT(ne00 == ne10); - GGML_ASSERT(ne03 == ne13); - const uint gqa = ne12/ne02; + // TODO: assert that dim2 and dim3 are contiguous + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16; + int ne11_mm_min = 1; #if 0 // the numbers below are measured on M2 Ultra for 7B and 13B models @@ -1159,9 +1297,10 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; @@ -1197,90 +1336,60 @@ void ggml_metal_graph_compute( } break; case GGML_TYPE_Q4_0: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; } break; case GGML_TYPE_Q4_1: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; case GGML_TYPE_Q5_0: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; } break; case GGML_TYPE_Q5_1: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; } break; case GGML_TYPE_Q8_0: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; } break; case GGML_TYPE_Q2_K: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; } break; case GGML_TYPE_Q3_K: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; } break; case GGML_TYPE_Q4_K: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 4; //1; nth1 = 8; //32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; } break; case GGML_TYPE_Q5_K: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; } break; case GGML_TYPE_Q6_K: { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; @@ -1309,34 +1418,127 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q3_K) { #ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } } break; + case GGML_OP_MUL_MAT_ID: + { + //GGML_ASSERT(ne00 == ne10); + //GGML_ASSERT(ne03 == ne13); + + GGML_ASSERT(src0t == GGML_TYPE_I32); + + const int n_as = ne00; + + // TODO: make this more general + GGML_ASSERT(n_as <= 8); + + struct ggml_tensor * src2 = gf->nodes[i]->src[2]; + + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; + const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); + + const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); + + const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); + + GGML_ASSERT(!ggml_is_transposed(src2)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(ne20 % 32 == 0); + // !!!!!!!!! TODO: this assert is probably required but not sure! + //GGML_ASSERT(ne20 >= 64); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + const uint r2 = ne12/ne22; + const uint r3 = ne13/ne23; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 0; + + const int idx = ((int32_t *) dst->op_params)[0]; + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne11 > ne11_mm_min) { + switch (src2->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break; + case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break; + case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; + default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:15]; + // TODO: how to make this an array? read Metal docs + for (int j = 0; j < n_as; ++j) { + struct ggml_tensor * src_cur = dst->src[2 + j]; + + size_t offs_src_cur = 0; + id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j]; + } + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } + } break; case GGML_OP_GET_ROWS: { switch (src0->type) { @@ -1560,6 +1762,27 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; } break; + case GGML_OP_ARGSORT: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + switch (order) { + case GGML_SORT_ASC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc]; break; + case GGML_SORT_DESC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break; + default: GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; + } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: @@ -1655,6 +1878,132 @@ void ggml_metal_graph_compute( // backend interface +static id g_backend_device = nil; +static int g_backend_device_ref_count = 0; + +static id ggml_backend_metal_get_device(void) { + if (g_backend_device == nil) { + g_backend_device = MTLCreateSystemDefaultDevice(); + } + + g_backend_device_ref_count++; + + return g_backend_device; +} + +static void ggml_backend_metal_free_device(void) { + assert(g_backend_device_ref_count > 0); + + g_backend_device_ref_count--; + + if (g_backend_device_ref_count == 0) { + [g_backend_device release]; + g_backend_device = nil; + } +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->data; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + [ctx->metal release]; + ggml_backend_metal_free_device(); + + free(ctx->data); + free(ctx); + + UNUSED(buffer); +} + +static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(buffer); +} + +static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(buffer); +} + +static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(buffer); +} + +static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .init_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, + /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, +}; + +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + ctx->data = ggml_metal_host_malloc(size); + ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + + return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); +} + +static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 32; + UNUSED(buft); +} + +static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); + + GGML_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, + }, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_metal; +} + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -1667,69 +2016,12 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)buffer->context; -} - -static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); - UNUSED(buffer); -} - -static struct ggml_backend_buffer_i metal_backend_buffer_i = { - /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, - /* .get_base = */ ggml_backend_metal_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .init_tensor = */ NULL, // no initialization required - /* .free_tensor = */ NULL, // no cleanup required -}; - -static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; - - void * data = ggml_metal_host_malloc(size); - - // TODO: set proper name of the buffers - ggml_metal_add_buffer(ctx, "backend", data, size, 0); - - return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); -} - -static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { - return 32; - UNUSED(backend); -} - -static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy((char *)tensor->data + offset, data, size); - - UNUSED(backend); -} - -static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy(data, (const char *)tensor->data + offset, size); - - UNUSED(backend); -} - static void ggml_backend_metal_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - - UNUSED(backend); -} - -static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); +static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_metal_buffer_type(); UNUSED(backend); } @@ -1741,32 +2033,43 @@ static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml } static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return true; + return ggml_metal_supports_op(op); + UNUSED(backend); - UNUSED(op); } static struct ggml_backend_i metal_backend_i = { - /* .get_name = */ ggml_backend_metal_name, - /* .free = */ ggml_backend_metal_free, - /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_get_alignment, - /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, - /* .synchronize = */ ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm - /* .graph_plan_free = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_metal_graph_compute, - /* .supports_op = */ ggml_backend_metal_supports_op, + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ ggml_backend_metal_supports_op, }; -ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); +// TODO: make a common log callback for all backends in ggml-backend +static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); - ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + UNUSED(level); + UNUSED(user_data); +} + +ggml_backend_t ggml_backend_metal_init(void) { + ggml_metal_log_set_callback(ggml_backend_log_callback, NULL); + + struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + + if (ctx == NULL) { + return NULL; + } ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); @@ -1783,7 +2086,18 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; ggml_metal_set_n_cb(ctx, n_cb); } + +static ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { + return ggml_backend_metal_init(); + + GGML_UNUSED(params); + GGML_UNUSED(user_data); +} + +GGML_BACKEND_REGISTER("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL) diff --git a/ggml-metal.metal b/ggml-metal.metal index 9f5ffcbaf..2f8ea22d6 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4,6 +4,7 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; } #define QK4_0 32 #define QR4_0 2 @@ -42,8 +43,13 @@ typedef struct { #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 -// general-purpose kernel for addition of two tensors -// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +enum ggml_sort_order { + GGML_SORT_ASC, + GGML_SORT_DESC, +}; + +// general-purpose kernel for addition, multiplication and division of two tensors +// pros: works for non-contiguous tensors, supports broadcast across all dims // cons: not very efficient kernel void kernel_add( device const char * src0, @@ -84,16 +90,111 @@ kernel void kernel_add( const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10)); + } +} - src0_ptr += ntg.x*nb00; - src1_ptr += ntg.x*nb10; - dst_ptr += ntg.x*nb0; +kernel void kernel_mul( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10)); + } +} + +kernel void kernel_div( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10)); } } @@ -108,25 +209,24 @@ kernel void kernel_add_row( dst[tpig] = src0[tpig] + src1[tpig % nb]; } -kernel void kernel_mul( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig]; -} - -// assumption: src1 is a row -// broadcast src1 into src0 kernel void kernel_mul_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig % nb]; } +kernel void kernel_div_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb [[buffer(27)]], + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] / src1[tpig % nb]; +} + kernel void kernel_scale( device const float * src0, device float * dst, @@ -165,6 +265,54 @@ kernel void kernel_sqr( dst[tpig] = src0[tpig] * src0[tpig]; } +kernel void kernel_sum_rows( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tpig[[thread_position_in_grid]]) { + int64_t i3 = tpig.z; + int64_t i2 = tpig.y; + int64_t i1 = tpig.x; + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; + } + + device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + device float * dst_row = (device float *) ((device char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + + float row_sum = 0; + + for (int64_t i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; + } + + dst_row[0] = row_sum; +} + constant float GELU_COEF_A = 0.044715f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -583,9 +731,20 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre // giard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. template -void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, - int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, - uint3 tgpig, uint tiisg, uint sgitg) { +void mul_vec_q_n_f32( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; @@ -594,7 +753,10 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); device const block_q_type * x = (device const block_q_type *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -644,13 +806,14 @@ kernel void kernel_mul_mv_q4_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -662,13 +825,14 @@ kernel void kernel_mul_mv_q4_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -680,13 +844,14 @@ kernel void kernel_mul_mv_q5_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -698,13 +863,14 @@ kernel void kernel_mul_mv_q5_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } @@ -719,9 +885,10 @@ kernel void kernel_mul_mv_q8_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -733,8 +900,14 @@ kernel void kernel_mul_mv_q8_0_f32( const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -792,6 +965,8 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -799,7 +974,12 @@ kernel void kernel_mul_mv_f32_f32( const int64_t rb = tgpig.y*N_F32_F32; const int64_t im = tgpig.z; - device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const float * x = (device const float *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F32_F32; ++row) { @@ -865,6 +1045,8 @@ kernel void kernel_mul_mv_f16_f16( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -872,7 +1054,12 @@ kernel void kernel_mul_mv_f16_f16( const int64_t rb = tgpig.y*N_F16_F16; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F16_F16; ++row) { @@ -936,6 +1123,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -943,7 +1132,12 @@ kernel void kernel_mul_mv_f16_f32_1row( const int64_t r1 = tgpig.y; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); float sumf = 0; @@ -990,6 +1184,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -997,7 +1193,12 @@ kernel void kernel_mul_mv_f16_f32( const int64_t rb = tgpig.y*N_F16_F32; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F16_F32; ++row) { @@ -1062,6 +1263,8 @@ kernel void kernel_mul_mv_f16_f32_l4( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1069,7 +1272,12 @@ kernel void kernel_mul_mv_f16_f32_l4( const int64_t r0 = tgpig.x; const int64_t im = tgpig.z; - device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half4 * x4 = (device const half4 *) (src0 + offset0); for (int r1 = 0; r1 < nrows; ++r1) { device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); @@ -1121,17 +1329,21 @@ kernel void kernel_alibi_f32( const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + const int64_t k = i3*ne3 + i2; - device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float m_k; - if (i2 < n_heads_log2_floor) { - m_k = pow(m0, i2 + 1); + if (k < n_heads_log2_floor) { + m_k = pow(m0, k + 1); } else { - m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1); + m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); } + + device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1; + device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01; for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { - device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); - dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + const float src_v = *(device float *)(src_row + i00*nb00); + device float * dst_v = (device float *)(dst_row + i00*nb0); + *dst_v = i00 * m_k + src_v; } } @@ -1336,6 +1548,58 @@ kernel void kernel_im2col_f16( } } +// bitonic sort implementation following the CUDA kernels as reference +typedef void (argsort_t)( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); + +template +kernel void kernel_argsort_f32_i32( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]) { + // bitonic sort + int col = tpitg[0]; + int row = tgpig[1]; + + if (col >= ncols) return; + + device const float * x_row = x + row * ncols; + device int32_t * dst_row = dst + row * ncols; + + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + } +} + +template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, @@ -1809,23 +2073,30 @@ kernel void kernel_mul_mv_q2_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -1834,11 +2105,11 @@ kernel void kernel_mul_mv_q2_K_f32( #if QK_K == 256 const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int is = (8*ir)/16;// 0 or 1 - device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; for (int ib = ix; ib < nb; ib += 4) { @@ -1850,8 +2121,8 @@ kernel void kernel_mul_mv_q2_K_f32( yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; } - device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*iq + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -1938,7 +2209,7 @@ kernel void kernel_mul_mv_q2_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1953,9 +2224,10 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1964,12 +2236,17 @@ kernel void kernel_mul_mv_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float yl[32]; @@ -2091,7 +2368,7 @@ kernel void kernel_mul_mv_q3_K_f32( } if (tiisg == 0) { for (int row = 0; row < 2; ++row) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row]; } } } @@ -2105,26 +2382,33 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const int ix = tiisg/4; const int il = 4 * (tiisg%4);// 0, 4, 8, 12 - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 float2 sum = {0.f, 0.f}; @@ -2144,7 +2428,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; for (int l = 0; l < 4; l += 2) { - const uint16_t hm = h[l/2] >> im; + const uint16_t hm = h[l/2] >> iq; sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64)) @@ -2160,7 +2444,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -2178,10 +2462,11 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne12 [[buffer(11)]], constant int64_t & ne0 [[buffer(15)]], constant int64_t & ne1 [[buffer(16)]], - constant uint & gqa [[buffer(17)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; @@ -2189,26 +2474,32 @@ kernel void kernel_mul_mv_q4_K_f32( const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int first_row = r0 * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; const int step = sizeof(block_q4_K) * nb / 2; - device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; uint16_t sc16[4]; thread const uint8_t * sc8 = (thread const uint8_t *)sc16; @@ -2223,8 +2514,8 @@ kernel void kernel_mul_mv_q4_K_f32( yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; } - device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; - device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -2268,7 +2559,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -2282,9 +2573,10 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2295,12 +2587,18 @@ kernel void kernel_mul_mv_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[8]; float yh[8]; float sumf[N_DST]={0.f}, all_sum; @@ -2356,7 +2654,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -2371,9 +2669,10 @@ kernel void kernel_mul_mv_q5_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2382,12 +2681,17 @@ kernel void kernel_mul_mv_q5_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf[2]={0.f}; @@ -2403,15 +2707,15 @@ kernel void kernel_mul_mv_q5_K_f32( const int tid = tiisg/4; const int ix = tiisg%4; - const int im = tid/4; + const int iq = tid/4; const int ir = tid%4; const int n = 8; const int l0 = n*ir; - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; + const int q_offset = 32*iq + l0; + const int y_offset = 64*iq + l0; - const uint8_t hm1 = 1u << (2*im); + const uint8_t hm1 = 1u << (2*iq); const uint8_t hm2 = hm1 << 1; const uint8_t hm3 = hm1 << 4; const uint8_t hm4 = hm2 << 4; @@ -2426,7 +2730,7 @@ kernel void kernel_mul_mv_q5_K_f32( device const uint8_t * q1 = x[i].qs + q_offset; device const uint8_t * qh = x[i].qh + l0; device const half * dh = &x[i].d; - device const uint16_t * a = (device const uint16_t *)x[i].scales + im; + device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; device const float * y2 = y1 + 128; float4 sumy = {0.f, 0.f, 0.f, 0.f}; @@ -2482,7 +2786,7 @@ kernel void kernel_mul_mv_q5_K_f32( const int il = 4 * (tiisg/8); // 0, 4, 8, 12 const int ix = tiisg%8; - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 device const float * y = yy + ix*QK_K + il; @@ -2507,7 +2811,7 @@ kernel void kernel_mul_mv_q5_K_f32( float2 acc = {0.f, 0.f}; for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> im; + const uint8_t hl = h[l] >> iq; acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) @@ -2529,7 +2833,7 @@ kernel void kernel_mul_mv_q5_K_f32( for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } @@ -2544,9 +2848,10 @@ kernel void kernel_mul_mv_q6_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2560,12 +2865,17 @@ kernel void kernel_mul_mv_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf = 0; @@ -2631,7 +2941,7 @@ kernel void kernel_mul_mv_q6_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -2941,24 +3251,25 @@ kernel void kernel_get_rows( // each block_q contains 16*nl weights template -kernel void kernel_mul_mm(device const uchar * src0, - device const uchar * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar * shared_memory [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { +void kernel_mul_mm_impl(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup half * sa = (threadgroup half *)(shared_memory); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); @@ -2984,7 +3295,10 @@ kernel void kernel_mul_mm(device const uchar * src0, short il = (tiitg % THREAD_PER_ROW); - uint offset0 = im/gqa*nb02; + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); ushort offset1 = il/nl; device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; @@ -3068,14 +3382,116 @@ kernel void kernel_mul_mm(device const uchar * src0, } } +template +kernel void kernel_mul_mm(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mm_impl( + src0, + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + +template +kernel void kernel_mul_mm_id( + device const int32_t * ids, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const uchar * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + kernel_mul_mm_impl( + src0[ids[idx]], + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + #if QK_K == 256 #define QK_NL 16 #else #define QK_NL 4 #endif -typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ - constant uint64_t &, constant uint64_t &, uint, uint, uint); +typedef void (get_rows_t)( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint, uint, uint); template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; @@ -3104,8 +3520,10 @@ typedef void (mat_mm_t)( constant int64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar *, uint3, uint, uint); + constant uint & r2, + constant uint & r3, + threadgroup uchar *, + uint3, uint, uint); template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -3119,3 +3537,44 @@ template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; + +typedef void (mat_mm_id_t)( + device const int32_t * ids, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar *, + uint3, uint, uint); + +template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id;