diff --git a/ggml.c b/ggml.c
index 0e4b1466c..800390a88 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _POSIX_C_SOURCE 199309L
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE
 
 #include "ggml.h"
 
@@ -10,6 +10,7 @@
 #endif
 
 #include <assert.h>
+#include <errno.h>
 #include <time.h>
 #include <math.h>
 #include <stdlib.h>
@@ -31,7 +32,6 @@
 #else
 // ref: https://github.com/ggerganov/whisper.cpp/issues/168
 #include <windows.h>
-#include <errno.h>
 #endif
 
 typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
+#define GGML_MLOCK_SUPPORT 0
+
+#ifdef __has_include
+    #if __has_include(<sys/mman.h>)
+        #undef GGML_MLOCK_SUPPORT
+        #define GGML_MLOCK_SUPPORT 1
+        #include <sys/mman.h>
+    #endif
+#endif
+
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -2344,6 +2355,7 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
+    bool   mem_buffer_mlocked;
 
     int    n_objects;
 
@@ -2619,16 +2631,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
 
     *ctx = (struct ggml_context) {
-        /*.mem_size         =*/ params.mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
-        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-        /*.n_objects        =*/ 0,
-        /*.objects_begin    =*/ NULL,
-        /*.objects_end      =*/ NULL,
-        /*.scratch          =*/ { 0, 0, NULL, },
-        /*.scratch_save     =*/ { 0, 0, NULL, },
+        /*.mem_size           =*/ params.mem_size,
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
+        /*.mem_buffer_mlocked =*/ false,
+        /*.n_objects          =*/ 0,
+        /*.objects_begin      =*/ NULL,
+        /*.objects_end        =*/ NULL,
+        /*.scratch            =*/ { 0, 0, NULL, },
+        /*.scratch_save       =*/ { 0, 0, NULL, },
     };
 
+    GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+
     ggml_assert_aligned(ctx->mem_buffer);
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2666,14 @@ void ggml_free(struct ggml_context * ctx) {
             GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
+#if GGML_MLOCK_SUPPORT
+            if (ctx->mem_buffer_mlocked) {
+                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
+                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
+                }
+            }
+#endif
+
             if (ctx->mem_buffer_owned) {
                 free(ctx->mem_buffer);
             }
@@ -2679,6 +2702,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+bool ggml_mlock_supported(void) {
+    return GGML_MLOCK_SUPPORT;
+}
+
+#if GGML_MLOCK_SUPPORT
+#ifdef __APPLE__
+    #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
+                             "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
+#else
+    #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
+#endif
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+    if (ctx->mem_buffer_mlocked) {
+        return true;
+    }
+    if (mlock(ctx->mem_buffer, ctx->mem_size)) {
+        int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+                           ctx->mem_size, strerror(errno));
+        GGML_ASSERT(ret >= 0);
+        return false;
+    }
+    ctx->mem_buffer_mlocked = true;
+    return true;
+}
+#else // GGML_MLOCK_SUPPORT
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+    *err_p = strdup("can't mlock because it's not supported on this system");
+    return false;
+}
+#endif // GGML_MLOCK_SUPPORT
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(
diff --git a/ggml.h b/ggml.h
index c7e6814a8..ddb97318b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
+bool ggml_mlock_supported(void);
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
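For reference, a minimal sketch of how a caller might drive the new ggml API; the `try_mlock` helper is hypothetical and not part of the patch:

```c
// Illustrative caller (not from the patch): pin a ggml context's buffer in RAM.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

static bool try_mlock(struct ggml_context * ctx) {
    if (!ggml_mlock_supported()) {
        fprintf(stderr, "mlock is not supported on this system\n");
        return false;
    }
    char * err = NULL;
    if (!ggml_mlock(ctx, &err)) {
        fprintf(stderr, "%s\n", err);
        free(err);   // ggml_mlock allocates the message with asprintf (or strdup)
        return false;
    }
    return true;     // pages stay resident until munlock runs in ggml_free()
}
```

Note the ownership contract: on failure `ggml_mlock` hands the caller a heap-allocated error string, so `free(err)` is required.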
+#endif +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + if (ctx->mem_buffer_mlocked) { + return true; + } + if (mlock(ctx->mem_buffer, ctx->mem_size)) { + int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, + ctx->mem_size, strerror(errno)); + GGML_ASSERT(ret >= 0); + return false; + } + ctx->mem_buffer_mlocked = true; + return true; +} +#else // GGML_MLOCK_SUPPORT +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + *err_p = strdup("can't mlock because it's not supported on this system"); + return false; +} +#endif // GGML_MLOCK_SUPPORT + //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( diff --git a/ggml.h b/ggml.h index c7e6814a8..ddb97318b 100644 --- a/ggml.h +++ b/ggml.h @@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); +bool ggml_mlock_supported(void); +bool ggml_mlock(struct ggml_context * ctx, char ** err_p); + struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/llama.cpp b/llama.cpp index d8c771529..5d56cc90e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -115,6 +115,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mlock =*/ false, /*.embedding =*/ false, }; @@ -1428,11 +1429,22 @@ struct llama_context * llama_init_from_file( ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, + params.vocab_only)) { fprintf(stderr, "%s: failed to load model\n", __func__); delete ctx; return nullptr; } + + if (params.use_mlock) { + char *err; + if (!ggml_mlock(ctx->model.ctx, &err)) { + fprintf(stderr, "%s\n", err); + free(err); + delete ctx; + return nullptr; + } + } // reserve memory for context buffers { diff --git a/llama.h b/llama.h index 209b4dbe8..9943d96ba 100644 --- a/llama.h +++ b/llama.h @@ -53,6 +53,7 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only }; diff --git a/main.cpp b/main.cpp index 46a80ff87..39dfc575b 100644 --- a/main.cpp +++ b/main.cpp @@ -199,6 +199,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); diff --git a/utils.cpp b/utils.cpp index 0df89af0b..10673fb82 100644 --- a/utils.cpp +++ b/utils.cpp @@ -1,3 +1,5 @@ +#include "ggml.h" + #include "utils.h" #include @@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.instruct = true; } else if (arg == "--color") { params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --n_parts N number of model parts 
diff --git a/utils.cpp b/utils.cpp
index 0df89af0b..10673fb82 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -1,3 +1,5 @@
+#include "ggml.h"
+
 #include "utils.h"
 
 #include <cassert>
@@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.instruct = true;
         } else if (arg == "--color") {
             params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+    if (ggml_mlock_supported()) {
+        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+    }
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
diff --git a/utils.h b/utils.h
index 8120c123b..cf914990c 100644
--- a/utils.h
+++ b/utils.h
@@ -46,6 +46,7 @@ struct gpt_params {
     bool instruct   = false; // instruction mode (used for Alpaca models)
    bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt
+    bool use_mlock  = false; // use mlock to keep model in memory
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
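Since MLOCK_SUGGESTION points users at RLIMIT_MLOCK, a caller that wants to fail fast could inspect the limit before opting in. A sketch assuming a POSIX system; this check is not part of the patch:

```c
// Illustrative pre-flight check (not from the patch): warn when RLIMIT_MLOCK
// is smaller than the buffer we intend to lock.
#include <stdio.h>
#include <sys/resource.h>

static void warn_if_mlock_limited(size_t need_bytes) {
    struct rlimit lim;
    if (getrlimit(RLIMIT_MLOCK, &lim) == 0 &&
            lim.rlim_cur != RLIM_INFINITY && lim.rlim_cur < need_bytes) {
        fprintf(stderr, "warning: RLIMIT_MLOCK is %zu bytes but %zu are needed; try 'ulimit -l'\n",
                (size_t) lim.rlim_cur, need_bytes);
    }
}
```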