diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dc14f2f5d..9f873035a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat( int64_t row_low[GGML_CUDA_MAX_DEVICES]; int64_t row_high[GGML_CUDA_MAX_DEVICES]; + int used_devices = 0; + for (int64_t id = 0; id < g_device_count; ++id) { // by default, use all rows row_low[id] = 0; @@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat( continue; } + used_devices++; + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; @@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat( // if multiple devices are used they need to wait for the main device // here an event is recorded that signals that the main device has finished calculating the input data - if (split && g_device_count > 1) { + if (split && used_devices > 1) { CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); } - const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; @@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat( } for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } CUDA_CHECK(ggml_cuda_set_device(id)); // free buffers again when done @@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_set_device(g_main_device)); for (int64_t id = 0; id < g_device_count; ++id) { + if (row_low[id] == row_high[id]) { + continue; + } for (int64_t is = 0; is < is_max; ++is) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); } @@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU) && + (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && (src1->backend == GGML_BACKEND_GPU) && ( dst->backend == GGML_BACKEND_GPU);