diff --git a/ggml-alloc.c b/ggml-alloc.c
index 4121f3dba..8de28cf9d 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;

 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }

+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int i = 0; i < gf->n_nodes; i++) {
+        for (int ind = 0; ind < gf->n_nodes; ind++) {
+            int i;
+            if (alloc->has_parse_seq) {
+                i = alloc->parse_seq[ind];
+            } else {
+                i = ind;
+            }
             struct ggml_tensor * node = gf->nodes[i];

             // allocate parents (leafs)
diff --git a/ggml-alloc.h b/ggml-alloc.h
index a5ec8f87a..14a4350ac 100644
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
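Not part of the patch, but for context: a minimal sketch of how the new allocator API is intended to be driven. The helper name, the tiny graph, and the order array are made up for illustration; the -1 entries mimic the barrier markers in ggml-metal's concur_list and are simply skipped by ggml_allocr_set_parse_seq, as the implementation above shows.

```c
#include "ggml.h"
#include "ggml-alloc.h"

// build a tiny graph with no_alloc = true so that tensor data comes from ggml-alloc
static void example_parse_seq(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    // two independent mat-muls feeding an add
    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * t0 = ggml_mul_mat(ctx, a, x);   // node 0
    struct ggml_tensor * t1 = ggml_mul_mat(ctx, b, x);   // node 1
    struct ggml_tensor * y  = ggml_add(ctx, t0, t1);     // node 2
    struct ggml_cgraph gf = ggml_build_forward(y);

    // out-of-order schedule: node 1 before node 0; -1 acts as a barrier marker
    // and is filtered out by ggml_allocr_set_parse_seq
    int order[] = { 1, 0, -1, 2 };

    struct ggml_allocr * alloc = ggml_allocr_new_measure(/*alignment =*/ 32);
    ggml_allocr_set_parse_seq(alloc, order, (int) (sizeof(order)/sizeof(order[0])));

    // measure the memory needed when the nodes are visited in this order
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, &gf);
    (void) alloc_size;

    ggml_allocr_free(alloc);
    ggml_free(ctx);
}
```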
diff --git a/ggml-metal.h b/ggml-metal.h
index 16f1a0caa..bf3f9a6a8 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *

 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);

-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
diff --git a/ggml-metal.m b/ggml-metal.m
index e13cb4b3c..32c6e4869 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }

-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
 }

 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -383,7 +384,7 @@ void ggml_metal_get_tensor(

 void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
     int nodes_unused[GGML_MAX_CONCUR];

@@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
                 }
             }
         }
-        if (exe_flag) {
+        if (exe_flag && check_mem) {
            // check if nodes[i]'s data will be overwritten by a node before nodes[i].
            // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
            int64_t data_start = (int64_t) gf->nodes[i]->data;
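Again purely illustrative (hypothetical helper name, not patch content): querying the reworked Metal API. ggml_metal_if_optimized now returns the concur_list length (0 if no reordering was found) and ggml_metal_get_concur_list exposes the list itself, where -1 entries are assumed to separate groups that can be dispatched concurrently.

```c
#ifdef GGML_USE_METAL
#include <stdio.h>
#include "ggml.h"
#include "ggml-metal.h"

// ask the Metal backend for a concurrency-friendly node order;
// check_mem == false skips the overlapping-memory check, which is what the
// llama.cpp change below does because no tensor data has been placed yet
static void show_concurrency(struct ggml_cgraph * gf) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1);
    if (ctx_metal == NULL) {
        return;
    }

    ggml_metal_graph_find_concurrency(ctx_metal, gf, /*check_mem =*/ false);

    const int   n    = ggml_metal_if_optimized(ctx_metal);   // 0 if no reordering was found
    const int * list = ggml_metal_get_concur_list(ctx_metal);

    for (int i = 0; i < n; i++) {
        if (list[i] == -1) {
            printf("| ");            // barrier between concurrent groups
        } else {
            printf("%d ", list[i]);  // graph node index
        }
    }
    printf("\n");

    ggml_metal_free(ctx_metal);
}
#endif
```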
diff --git a/llama.cpp b/llama.cpp
index a161f1566..345243990 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)

-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(

 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

@@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(

             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
-
-        if (!ctx->ctx_metal) {
-            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-            llama_free(ctx);
-            return NULL;
-        }

         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
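Finally, a sketch of the end-to-end flow that the llama.cpp changes implement, written as a standalone helper rather than the literal llama.cpp code; the function name, the use of ggml_metal_host_malloc for the backing buffer, and the 32-byte alignment are assumptions for illustration (llama.cpp keeps the buffer in ctx->buf_alloc and registers it with Metal as "alloc" in place of the old scr0/scr1 scratch buffers).

```c
#ifdef GGML_USE_METAL
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

// two-pass setup mirroring llama_new_context_with_model above:
//   1) measure the graph with the Metal-provided parse order
//   2) back the allocator with a real buffer of that size, re-apply the order,
//      and register the buffer with Metal
static struct ggml_allocr * setup_metal_allocator(struct ggml_metal_context * ctx_metal,
                                                  struct ggml_cgraph * gf) {
    const size_t tensor_alignment = 32;

    // reorder for concurrency before any data pointers exist, hence check_mem == false
    ggml_metal_graph_find_concurrency(ctx_metal, gf, false);
    int * concur   = ggml_metal_get_concur_list(ctx_metal);
    int   concur_n = ggml_metal_if_optimized(ctx_metal);

    // pass 1: measure how much memory the graph needs with this parse order
    struct ggml_allocr * measure = ggml_allocr_new_measure(tensor_alignment);
    ggml_allocr_set_parse_seq(measure, concur, concur_n);
    const size_t alloc_size = ggml_allocr_alloc_graph(measure, gf) + tensor_alignment;
    ggml_allocr_free(measure);

    // pass 2: real allocator on a page-aligned buffer that Metal can wrap
    void * buf = ggml_metal_host_malloc(alloc_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);
    ggml_allocr_set_parse_seq(alloc, concur, concur_n);

    ggml_metal_add_buffer(ctx_metal, "alloc", buf, alloc_size, 0);
    return alloc;
}
#endif
```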