diff --git a/common/common.cpp b/common/common.cpp index 6e5d5b4d5..afc9b8a55 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { #else fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif + } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers_draft = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { @@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); diff --git a/common/common.h b/common/common.h index 012bf5e13..238635ae3 100644 --- a/common/common.h +++ b/common/common.h @@ -38,6 +38,7 @@ struct gpt_params { int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 822d7b529..2cd153f9a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -42,6 +42,7 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); // tokenize the prompt