From 664a238adfd581dd1b2e6ed94592ba956ae117dc Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Mon, 15 Oct 2018 20:08:29 +0200
Subject: [PATCH] Fixed a bug in the XaxpyFaster kernel for specific parameters

---
 CHANGELOG                       | 1 +
 src/kernels/level1/xaxpy.opencl | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index bc856357..18c9051d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ Development (next version)
 - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
   * SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 74e49930..2829237e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -43,10 +43,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
                   __global realV* ygm) {
   const real alpha = GetRealArg(arg_alpha);
 
-  if (get_global_id(0) < n / (VW)) {
+  const int num_worker_threads = n / (VW * WPT);
+  if (get_global_id(0) < num_worker_threads) {
     #pragma unroll
     for (int _w = 0; _w < WPT; _w += 1) {
-      const int id = _w*get_global_size(0) + get_global_id(0);
+      const int id = _w*num_worker_threads + get_global_id(0);
       realV xvalue = xgm[id];
       realV yvalue = ygm[id];
       ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);