From 1290fc64572f434f2f36721d2e2b0913cec0178a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Jan 2023 20:31:46 +0200 Subject: [PATCH] bench : add memcpy and ggml_mul_mat benchmarks --- Makefile | 4 +- examples/bench/bench.cpp | 164 +++++++++++++++++++++++++++++++++++++-- extra/bench-all.sh | 13 +++- ggml.c | 15 +++- 4 files changed, 182 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index b7edea8..56c3793 100644 --- a/Makefile +++ b/Makefile @@ -133,8 +133,8 @@ ifdef WHISPER_OPENBLAS LDFLAGS += -lopenblas endif ifdef WHISPER_GPROF - CFLAGS += -pg - CXXFLAGS += -pg + CFLAGS += -pg + CXXFLAGS += -pg endif ifneq ($(filter aarch64%,$(UNAME_M)),) endif diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp index 2fd2423..5f99774 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -1,12 +1,16 @@ +#include "ggml.h" #include "whisper.h" #include +#include #include #include +#include // command-line parameters struct whisper_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat std::string model = "models/ggml-base.en.bin"; }; @@ -23,6 +27,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { } else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } + else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -41,16 +46,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -h, --help [default] show this help message and exit\n"); fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); fprintf(stderr, " -m FNAME, --model FNAME 
[%-7s] model path\n", params.model.c_str()); + fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what); + fprintf(stderr, " %-7s 0 - whisper encoder\n", ""); + fprintf(stderr, " %-7s 1 - memcpy\n", ""); + fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", ""); fprintf(stderr, "\n"); } -int main(int argc, char ** argv) { - whisper_params params; - - if (whisper_params_parse(argc, argv, params) == false) { - return 1; - } - +int bench_whisper_encoder(const whisper_params & params) { // whisper init struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); @@ -92,3 +95,150 @@ int main(int argc, char ** argv) { return 0; } + +int bench_memcpy(const whisper_params & params) { + size_t n = 50; + size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations + + // 1 GB array + const size_t size = arr*1024llu*1024llu; + + char * src = (char *) malloc(size); + char * dst = (char *) malloc(size); + + for (size_t i = 0; i < size; i++) src[i] = i; + + memcpy(dst, src, size); // heat-up + + double tsum = 0.0; + + for (size_t i = 0; i < n; i++) { + const int64_t t0 = ggml_time_us(); + + memcpy(dst, src, size); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0)*1e-6; + + src[0] = rand(); + } + + fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu)); + + // needed to prevent the compiler from optimizing the memcpy away + { + double sum = 0.0; + + for (size_t i = 0; i < size; i++) sum += dst[i]; + + fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? 
"ok" : "error"); + } + + free(src); + free(dst); + + return 0; +} + +int bench_ggml_mul_mat(const whisper_params & params) { + const int n_max = 128; + + const std::vector sizes = { + 64, 128, 256, 512, 1024, 2048, 4096, + }; + + const size_t N_max = sizes.back(); + + // a: N*N*sizeof(float) + // b: N*N*sizeof(float) + // c: N*N*sizeof(float) + // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) + std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*256); + + for (size_t i = 0; i < buf.size(); i++) buf[i] = i; + + for (int j = 0; j < (int) sizes.size(); j++) { + int n_fp16 = 0; + int n_fp32 = 0; + + // GFLOPS/s + double s_fp16 = 0.0; + double s_fp32 = 0.0; + + const size_t N = sizes[j]; + + for (int k = 0; k < 2; ++k) { + const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32; + + double & s = k == 0 ? s_fp16 : s_fp32; + int & n = k == 0 ? n_fp16 : n_fp32; + + struct ggml_init_params gparams = { + /*.mem_size =*/ buf.size(), + /*.mem_buffer =*/ buf.data(), + }; + + struct ggml_context * ctx0 = ggml_init(gparams); + + struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); + + struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b); + + struct ggml_cgraph gf = ggml_build_forward(c); + + gf.n_threads = params.n_threads; + + double tsum = 0.0; + + // heat-up + ggml_graph_compute(ctx0, &gf); + + for (int i = 0; i < n_max; ++i) { + const int64_t t0 = ggml_time_us(); + + ggml_graph_compute(ctx0, &gf); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0)*1e-6; + n++; + + if (tsum > 1.0 && n >= 3) { + break; + } + } + + ggml_free(ctx0); + + s = ((2.0*N*N*N*n)/tsum)*1e-9; + } + + fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); + } + + return 0; +} + +int main(int argc, char ** argv) { + whisper_params params; + + if (whisper_params_parse(argc, argv, params) == 
false) { + return 1; + } + + ggml_time_init(); + + int ret = -1; + + switch (params.what) { + case 0: ret = bench_whisper_encoder(params); break; + case 1: ret = bench_memcpy(params); break; + case 2: ret = bench_ggml_mul_mat(params); break; + default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break; + } + + return ret; +} diff --git a/extra/bench-all.sh b/extra/bench-all.sh index fbdc4c2..bfb3764 100755 --- a/extra/bench-all.sh +++ b/extra/bench-all.sh @@ -12,6 +12,18 @@ fi models=( "tiny" "base" "small" "medium" "large" ) +printf "\n" +printf "Running memcpy benchmark with 1 thread\n" +printf "\n" + +./bench -w 1 -t 1 2>&1 + +printf "\n" +printf "Running ggml_mul_mat benchmark with " $n_threads " threads\n" +printf "\n" + +./bench -w 2 -t $n_threads 2>&1 + printf "\n" printf "Running benchmark for all models\n" printf "This can take a while!\n" @@ -56,4 +68,3 @@ for model in "${models[@]}"; do printf "| | | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n" done - diff --git a/ggml.c b/ggml.c index c59ee64..16f0f85 100644 --- a/ggml.c +++ b/ggml.c @@ -4373,7 +4373,9 @@ static void ggml_compute_forward_mul_mat_f32( if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - if (params->ith != 0) return; + if (params->ith != 0) { + return; + } if (params->type == GGML_TASK_INIT) { return; @@ -4616,7 +4618,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - if (params->ith != 0) return; + if (params->ith != 0) { + return; + } if (params->type == GGML_TASK_INIT) { return; @@ -7054,7 +7058,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg #ifdef __APPLE__ //#include - +// //typedef os_unfair_lock ggml_lock_t; // //#define ggml_lock_init(x) UNUSED(x) @@ -7161,6 +7165,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if 
(state->params.ith < state->params.nth) { ggml_compute_forward(&state->params, state->node); } + state->node = NULL; } else { break; @@ -7205,6 +7210,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) .node = NULL, .shared = &state_shared, }; + int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); assert(rc == 0); UNUSED(rc); @@ -7273,7 +7279,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]); } else { cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);