From 3cd8dde0d1357b7f11bdd25c45d5bf5e97e284a0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 24 Mar 2023 06:22:28 +0200
Subject: [PATCH] Revert "Fix memory allocation issues and seg faults"

This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df.

Will provide the correct fix later
---
 llama.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cf796cce3..d55219256 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -102,9 +102,6 @@ struct llama_context {
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
     bool logits_all = false;
-
-    // work buffer for transformer evaluation
-    std::vector<uint8_t> buf_eval;
 };
 
 struct llama_context_params llama_context_default_params() {
@@ -630,19 +627,27 @@ static bool llama_eval_internal(
     const int n_rot   = hparams.n_embd/hparams.n_head;
 
     auto & mem_per_token = lctx.mem_per_token;
-    auto & buf_eval      = lctx.buf_eval;
 
-    if (mem_per_token*(n_past + N + 16) > buf_eval.size()) {
-        const size_t buf_size_new = 1.618*buf_eval.size();
+    // TODO: fix this hardcoded size
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
 
-        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new);
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
-        buf_eval.resize(buf_size_new);
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
     }
 
     struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_eval.size(),
-        /*.mem_buffer =*/ buf_eval.data(),
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
@@ -827,11 +832,10 @@ static bool llama_eval_internal(
         memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
     }
 
-    if (N == 1) {
-        mem_per_token = ggml_used_mem(ctx0)/(n_past + N);
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
     }
-
-    //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024);
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
     ggml_free(ctx0);
 
@@ -1412,8 +1416,6 @@ struct llama_context * llama_init_from_file(
         return nullptr;
     }
 
-    ctx->buf_eval.resize(512u*1024u*1024u);
-
     return ctx;
 }
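
For context, a minimal standalone sketch (not part of the patch) of the allocation
strategy the revert restores: a process-wide scratch buffer that starts at a hardcoded
512 MiB and is regrown with realloc once the measured per-token cost suggests the next
batch will not fit. The helper name scratch_ensure and the main() driver are
illustrative assumptions, not llama.cpp API:

    // Sketch of the realloc-on-demand scratch buffer restored by this revert.
    // scratch_ensure() is a hypothetical name, not part of llama.cpp.
    #include <cstdio>
    #include <cstdlib>

    static size_t buf_size = 512u*1024*1024; // initial hardcoded capacity, as in the patch
    static void * buf      = nullptr;

    // Grow the scratch buffer when the estimated need exceeds the current capacity.
    // Returns false on allocation failure, mirroring llama_eval_internal.
    static bool scratch_ensure(size_t mem_per_token, int N) {
        if (buf == nullptr) {
            buf = malloc(buf_size);
            if (buf == nullptr) return false;
        }
        if (mem_per_token > 0 && mem_per_token*N > buf_size) {
            // add 30% headroom for ggml object overhead, as in the restored code
            const size_t buf_size_new = 1.3*(mem_per_token*N);
            void * buf_new = realloc(buf, buf_size_new);
            if (buf_new == nullptr) {
                fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size_new);
                return false;
            }
            buf      = buf_new;
            buf_size = buf_size_new;
        }
        return true;
    }

    int main() {
        // First call: mem_per_token is still 0, so the initial 512 MiB is used as-is.
        if (!scratch_ensure(0, 8)) return 1;
        // Later calls pass the measured per-token cost; a large batch triggers growth.
        if (!scratch_ensure(1u*1024*1024, 640)) return 1;
        printf("scratch capacity: %zu bytes\n", buf_size);
        free(buf);
        return 0;
    }

Compared with the reverted std::vector<uint8_t> buf_eval (1.618x golden-ratio growth
keyed on n_past + N + 16), the restored scheme grows only when the measured
mem_per_token*N estimate exceeds capacity, at the cost of a hardcoded initial size.
The sketch also keeps the realloc result in a temporary so the old buffer is not
leaked on failure, a hardening the code in the patch itself omits.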