diff --git a/common/common.cpp b/common/common.cpp
index 18289755c..bf1ed8a66 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1281,7 +1281,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_batch           = params.n_batch;
     cparams.n_threads         = params.n_threads;
     cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.mul_mat_q         = params.mul_mat_q;
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embedding         = params.embedding;
@@ -1725,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
diff --git a/common/common.h b/common/common.h
index 25003df26..ab62bdb82 100644
--- a/common/common.h
+++ b/common/common.h
@@ -115,7 +115,6 @@ struct gpt_params {

     bool kl_divergence    = false; // compute KL-divergence

-    bool mul_mat_q        = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt    = false; // do not randomize prompt if none provided
     bool use_color        = false; // use color to distinguish generations and inputs
     bool interactive      = false; // interactive mode
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b4b8a38e1..19aff18ae 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,16 +32,15 @@ int main(int argc, char ** argv) {
     gpt_params params;

     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf("  <PP>, <TG> and <PL> are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
     }

     int n_kv_max     = 2048;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
-    int mmq          = 0;

     std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
     std::vector<int> n_tg = { 128, 256, };
@@ -65,19 +64,15 @@ int main(int argc, char ** argv) {
     }

     if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
     }

     if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
+        n_tg = parse_list(argv[6]);
     }

     if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_pl = parse_list(argv[8]);
+        n_pl = parse_list(argv[7]);
     }

     // init LLM
@@ -106,7 +101,6 @@ int main(int argc, char ** argv) {
     ctx_params.seed      = 1234;
     ctx_params.n_ctx     = n_kv_max;
     ctx_params.n_batch   = 512;
-    ctx_params.mul_mat_q = mmq;

     ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -159,7 +153,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 374e40a7d..10f37b441 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -35,7 +35,6 @@ options:
   -mg, --main-gpu <i>                   (default: 0)
   -nkvo, --no-kv-offload <0|1>          (default: 0)
   -mmp, --mmap <0|1>                    (default: 1)
-  -mmq, --mul-mat-q <0|1>               (default: 1)
   -ts, --tensor_split <ts0/ts1/..>      (default: 0)
   -r, --repetitions <n>                 (default: 5)
   -o, --output <csv|json|md|sql>        (default: md)
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 8fec3d43d..c2155b2ac 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -176,7 +176,6 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
-    std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
-   /* mul_mat_q     */ {true},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* reps          */ 5,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -mg, --main-gpu <i>                   (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>          (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmp, --mmap <0|1>                    (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>               (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>      (default: 0)\n");
     printf("  -r, --repetitions <n>                 (default: %d)\n", cmd_params_defaults.reps);
     printf("  -o, --output <csv|json|md|sql>        (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
-    bool mul_mat_q;
     std::vector<float> tensor_split;
     bool use_mmap;

@@ -518,7 +506,6 @@ struct cmd_params_instance {
         cparams.n_batch = n_batch;
         cparams.type_k  = type_k;
         cparams.type_v  = type_v;
-        cparams.mul_mat_q   = mul_mat_q;
         cparams.offload_kqv = !no_kv_offload;

         return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -616,7 +600,6 @@ struct test {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
-    bool mul_mat_q;
     std::vector<float> tensor_split;
     bool use_mmap;
     int n_prompt;
@@ -639,7 +622,6 @@ struct test {
         split_mode = inst.split_mode;
         main_gpu = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
-        mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
         n_prompt = inst.n_prompt;
@@ -713,7 +695,7 @@ struct test {
             "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split", "use_mmap",
+            "tensor_split", "use_mmap",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -733,7 +715,7 @@ struct test {
         }
         if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q" || field == "use_mmap") {
+            field == "use_mmap") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -767,7 +749,7 @@ struct test {
             std::to_string(n_batch), std::to_string(n_threads),
             ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+            tensor_split_str, std::to_string(use_mmap),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -931,9 +913,6 @@ struct markdown_printer : public printer {
         if (field == "n_threads") {
             return "threads";
         }
-        if (field == "mul_mat_q") {
-            return "mmq";
-        }
         if (field == "no_kv_offload") {
             return "nkvo";
         }
@@ -974,9 +953,6 @@ struct markdown_printer : public printer {
         if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
             fields.emplace_back("split_mode");
         }
-        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-            fields.emplace_back("mul_mat_q");
-        }
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index eea987966..2b2f4a0f4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2390,14 +2390,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
-        {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-            params.mul_mat_q = false;
-#else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")
diff --git a/llama.cpp b/llama.cpp
index a35f07aa4..073fd3b70 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1645,7 +1645,6 @@ struct llama_cparams {
     float yarn_beta_slow;
     float defrag_thold;

-    bool mul_mat_q;
     bool offload_kqv;
     bool do_pooling;

@@ -11633,7 +11632,6 @@ struct llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data =*/ nullptr,
         /*.type_k            =*/ GGML_TYPE_F16,
         /*.type_v            =*/ GGML_TYPE_F16,
-        /*.mul_mat_q         =*/ true,
         /*.logits_all        =*/ false,
         /*.embedding         =*/ false,
         /*.offload_kqv       =*/ true,
@@ -11785,7 +11783,6 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.defrag_thold   = params.defrag_thold;
-    cparams.mul_mat_q      = params.mul_mat_q;
     cparams.offload_kqv    = params.offload_kqv;
     cparams.do_pooling     = params.do_pooling;
diff --git a/llama.h b/llama.h
index 4d0ebe37d..ed51f478a 100644
--- a/llama.h
+++ b/llama.h
@@ -255,7 +255,6 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache

         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 70737f976..39c3e52e5 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -31,7 +31,7 @@ PRETTY_NAMES = {
     "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
     "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
     "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-    "mul_mat_q": "MMQ", "tensor_split": "Tensor split"
+    "tensor_split": "Tensor split"
 }

 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
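Note for downstream users of the C API: after this patch, `llama_context_params` no longer carries a `mul_mat_q` field, so any caller that still assigns to it will fail to compile; the mul_mat_q kernels are simply used where applicable. Below is a minimal migration sketch. It is hypothetical caller code, not part of this patch: the model path is a placeholder, and the surrounding calls (`llama_backend_init`, `llama_load_model_from_file`, `llama_free_model`) are assumed to match the llama.h of this revision.

```cpp
// Hypothetical caller code illustrating the API change; not part of this patch.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = 2048;
    cparams.n_batch = 512;
    // cparams.mul_mat_q = true; // removed by this patch: the field no longer
    //                           // exists and must be deleted from caller code

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```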