diff --git a/ggml.c b/ggml.c index b124f14cc..62f0f18ef 100644 --- a/ggml.c +++ b/ggml.c @@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + //const int64_t ne00 = src0->ne[0]; //const int64_t ne01 = src0->ne[1]; @@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat( #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (ggml_compute_forward_mul_mat_use_blas(dst)) { if (params->ith != 0) { return; } @@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { //n_tasks = MIN(n_threads, MAX(1, nr0/128)); //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); - -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#endif } break; case GGML_OP_MUL_MAT_ID: { @@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { state->shared->node_n += 1; return (thread_ret_t) GGML_EXIT_ABORTED; } + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } else { // wait for other threads to finish const int last = node_n; + + const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT; + while (true) { // TODO: this sched_yield can have significant impact on the performance - either positive or negative // depending on the workload and the operating system. // since it is not clear what is the best approach, it should potentially become user-configurable // ref: https://github.com/ggerganov/ggml/issues/291 -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - sched_yield(); -#endif + // UPD: adding the do_yield flag seems to resolve the issue universally + if (do_yield) { + sched_yield(); + } node_n = atomic_load(&state->shared->node_n); if (node_n != last) break; @@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + if (ggml_compute_forward_mul_mat_use_blas(node)) { if (node->src[0]->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);