diff --git a/CMakeLists.txt b/CMakeLists.txt index ba008bcc6..1eae2d670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,7 +301,7 @@ if (LLAMA_METAL) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/Makefile b/Makefile index e60821dd5..a64374e7d 100644 --- a/Makefile +++ b/Makefile @@ -305,7 +305,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif # LLAMA_HIPBLAS ifdef LLAMA_METAL - CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG + CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o diff --git a/ggml-metal.m b/ggml-metal.m index ad2ee8cf5..e929c4b07 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,6 +11,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG #define metal_printf(...) #else @@ -113,7 +114,7 @@ static NSString * const msl_library_source = @"see metal.metal"; @end struct ggml_metal_context * ggml_metal_init(int n_cb) { - fprintf(stderr, "%s: allocating\n", __func__); + metal_printf("%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -132,7 +133,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -146,11 +147,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); + metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -162,7 +163,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -174,11 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -230,19 +231,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); if (ctx->device.maxTransferRate != 0) { - fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); + metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); } return ctx; } void ggml_metal_free(struct ggml_metal_context * ctx) { - fprintf(stderr, "%s: deallocating\n", __func__); + metal_printf("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -311,7 +312,7 @@ void * ggml_metal_host_malloc(size_t n) { void * data = NULL; const int result = posix_memalign((void **) &data, getpagesize(), n); if (result != 0) { - fprintf(stderr, "%s: error: posix_memalign failed\n", __func__); + metal_printf("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -339,7 +340,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -350,13 +351,13 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + metal_printf("%s: error: buffer is nil\n", __func__); return nil; } @@ -368,7 +369,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - fprintf(stderr, "%s: too many buffers\n", __func__); + metal_printf("%s: too many buffers\n", __func__); return false; } @@ -378,7 +379,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -399,11 +400,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -423,27 +424,27 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - fprintf(stderr, "\n"); + metal_printf("\n"); } ++ctx->n_buffers; } } - fprintf(stderr, ", (%8.2f / %8.2f)", + metal_printf(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); + metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); } else { - fprintf(stderr, "\n"); + metal_printf("\n"); } } @@ -453,8 +454,6 @@ bool ggml_metal_add_buffer( void ggml_metal_set_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); - size_t offs; id id_dst = ggml_metal_get_buffer(ctx, t, &offs); @@ -464,8 +463,6 @@ void ggml_metal_set_tensor( void ggml_metal_get_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - size_t offs; id id_src = ggml_metal_get_buffer(ctx, t, &offs); @@ -560,15 +557,13 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); + metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); } } void ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { - metal_printf("%s: evaluating graph\n", __func__); - @autoreleasepool { // if there is ctx->concur_list, dispatch concurrently @@ -616,7 +611,7 @@ void ggml_metal_graph_compute( continue; } - metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -764,7 +759,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -923,7 +918,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "Asserting on type %d\n",(int)src0t); + metal_printf("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1161,7 +1156,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1186,7 +1181,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); + metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } }