diff --git a/llama.cpp b/llama.cpp
index 4a7d01b32..e597f5048 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -925,21 +925,21 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed                        =*/ -1,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
-        /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 1241ba6c0..0de530d45 100644
--- a/llama.h
+++ b/llama.h
@@ -71,28 +71,27 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
+    struct llama_context_params {
+        int seed;                              // RNG seed, -1 for random
         int n_ctx;                             // text context
         int n_batch;                           // prompt processing batch size
         int n_gpu_layers;                      // number of layers to store in VRAM
         int main_gpu;                          // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        int seed;        // RNG seed, -1 for random
-
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
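
Usage note (not part of the patch): the header comment states the motivation, keeping the bool members contiguous avoids padding-sensitive misalignment when the struct is copied by value, for example across an FFI boundary. The reorder stays source-compatible because callers build the struct by field name, not by position. A minimal sketch of the unchanged call pattern, assuming only the header above:

    #include "llama.h"

    int main(void) {
        // obtain the defaults, then override fields by name; only the
        // in-memory field offsets change, not this construction pattern
        struct llama_context_params params = llama_context_default_params();
        params.seed  = 42;    // seed now leads the struct
        params.n_ctx = 2048;  // larger text context
        return 0;
    }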