From b39809668a75ba5e44e7356a5450c21e6d4f4f83 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Sep 2023 17:58:31 +0300 Subject: [PATCH] sync : ggml (HBM + Metal + style) (#1264) --- ggml-metal.m | 2 +- ggml-metal.metal | 49 +++++++++++++++++------------------------------- ggml.c | 30 +++++++++++++++++++++++------ 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 521ca18..7e2355c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: diff --git a/ggml-metal.metal b/ggml-metal.metal index 119fcbe..5070561 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -220,14 +220,10 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + const float mean = sum[0] / ne00; // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); device float * y = dst + tgpig*ne00; sum[tpitg] = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -235,12 +231,6 @@ kernel void kernel_norm( sum[tpitg] += y[i00] * y[i00]; } - //// VARIANCE - //// parallel sum - //sum[tpitg] = 0.0f; - //for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - // sum[tpitg] += y[i00] * y[i00]; - //} // reduce threadgroup_barrier(mem_flags::mem_threadgroup); for (uint i = ntg/2; i > 0; i /= 2) { @@ -249,12 +239,7 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float variance = sum[0]; + const float variance = sum[0] / ne00; const float scale = 1.0f/sqrt(variance + eps); for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -262,7 +247,6 @@ kernel void kernel_norm( } } - kernel void kernel_rms_norm( device const void * src0, device float * dst, @@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32( } } } - } kernel void kernel_alibi_f32( @@ -699,25 +682,27 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - uint3 tpig[[thread_position_in_grid]]) { - const int64_t i3 = tpig[2]; - const int64_t i2 = tpig[1]; - const int64_t i1 = tpig[0]; + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = freq_scale * (float)p; + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -729,12 +714,12 @@ kernel void kernel_rope( } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - const int64_t i0 = ib*n_dims + ic/2; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); diff --git a/ggml.c b/ggml.c index 0fe07b2..3f72379 100644 --- a/ggml.c +++ b/ggml.c @@ -106,6 +106,9 @@ typedef void * thread_ret_t; #include #include +#endif +#ifdef GGML_USE_CPU_HBM +#include #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -195,8 +198,14 @@ typedef void * thread_ret_t; #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } void * aligned_memory = NULL; -#ifdef GGML_USE_METAL +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); #else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); @@ -218,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) { return aligned_memory; } #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif +#endif #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -4571,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); *ctx = (struct ggml_context) { @@ -4773,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; - if (view_src == NULL && ctx->no_alloc == false) { + if (view_src == NULL && !ctx->no_alloc) { if (ctx->scratch.data != NULL) { // allocate tensor data in the scratch buffer if (ctx->scratch.offs + data_size > ctx->scratch.size) { @@ -5474,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5517,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -19961,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p struct ggml_tensor * data = NULL; - if (params.no_alloc == false) { + if (!params.no_alloc) { data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); ok = ok && data != NULL; @@ -20002,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // point the data member to the appropriate location in the binary blob using the tensor infos - if (params.no_alloc == false) { + if (!params.no_alloc) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data }