diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 7a2811584..d0fe6d90d 100755
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -441,6 +441,8 @@ struct test {
     static const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
+    uint64_t model_size;
+    uint64_t model_n_params;
     int n_batch;
     int n_threads;
     bool f32_kv;
@@ -457,8 +459,10 @@ struct test {
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
         char buf[128];
-        llama_model_type(lmodel, buf, sizeof(buf));
+        llama_model_desc(lmodel, buf, sizeof(buf));
         model_type = buf;
+        model_size = llama_model_size(lmodel);
+        model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
         f32_kv = inst.f32_kv;
@@ -524,7 +528,7 @@ struct test {
             "build_commit", "build_number",
             "cuda", "opencl", "metal", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
-            "model_filename", "model_type",
+            "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "f16_kv",
             "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
             "n_prompt", "n_gen", "test_time",
@@ -538,6 +542,7 @@ struct test {
 
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
             field == "avg_ns" || field == "stddev_ns") {
@@ -573,7 +578,7 @@ struct test {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
-            model_filename, model_type,
+            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
@@ -709,8 +714,15 @@ struct markdown_printer : public printer {
             return -30;
         }
         if (field == "t/s") {
-            return 15;
+            return 16;
         }
+        if (field == "size" || field == "params") {
+            return 10;
+        }
+        if (field == "n_gpu_layers") {
+            return 3;
+        }
+
         int width = std::max((int)field.length(), 10);
 
         if (test::get_field_type(field) == test::STRING) {
@@ -719,9 +731,28 @@ struct markdown_printer : public printer {
         return width;
     }
 
+    static std::string get_field_display_name(const std::string & field) {
+        if (field == "n_gpu_layers") {
+            return "ngl";
+        }
+        if (field == "n_threads") {
+            return "threads";
+        }
+        if (field == "mul_mat_q") {
+            return "mmq";
+        }
+        if (field == "tensor_split") {
+            return "ts";
+        }
+        return field;
+    }
+
     void print_header(const cmd_params & params) override {
         // select fields to print
-        fields = { "model", "backend" };
+        fields.push_back("model");
+        fields.push_back("size");
+        fields.push_back("params");
+        fields.push_back("backend");
         bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
         if (!is_cpu_backend) {
             fields.push_back("n_gpu_layers");
@@ -752,7 +783,7 @@ struct markdown_printer : public printer {
 
         fprintf(fout, "|");
         for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
         }
         fprintf(fout, "\n");
         fprintf(fout, "|");
@@ -769,12 +800,26 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
             std::string value;
+            char buf[128];
             if (field == "model") {
                 value = t.model_type;
+            } else if (field == "size") {
+                if (t.model_size < 1024*1024*1024) {
+                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
+                }
+                value = buf;
+            } else if (field == "params") {
+                if (t.model_n_params < 1000*1000*1000) {
+                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
+                }
+                value = buf;
             } else if (field == "backend") {
                 value = test::get_backend();
             } else if (field == "test") {
-                char buf[128];
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
@@ -785,7 +830,6 @@ struct markdown_printer : public printer {
                 }
                 value = buf;
             } else if (field == "t/s") {
-                char buf[128];
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
             } else if (vmap.find(field) != vmap.end()) {
diff --git a/llama.cpp b/llama.cpp
index d12b6d1cb..4529ac822 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5297,13 +5297,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
 
+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
diff --git a/llama.h b/llama.h
index 2bcf94e0f..d47468172 100644
--- a/llama.h
+++ b/llama.h
@@ -254,7 +254,11 @@ extern "C" {
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);
 
     // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(