Merge pull request #325 from CNugteren/CLBlast-321-axpy-faster-kernel-bug

Fixed a bug in the XaxpyFaster kernel for specific parameters
2018-10-16 21:06:57 +02:00 · 2018-10-16 21:06:57 +02:00 · e33542acdd
parent 634b2bc75c 664a238adf
commit e33542acdd
2 changed files with 4 additions and 2 deletions
--- a/1
+++ b/1
@@ -7,6 +7,7 @@ Development (next version)
 - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue with AXPY's 'XaxpyFaster' kernel for certain parameter values
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
  * SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -43,10 +43,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
                 __global realV* ygm) {
  const real alpha = GetRealArg(arg_alpha);

-  if (get_global_id(0) < n / (VW)) {
+  const int num_worker_threads = n / (VW * WPT);
+  if (get_global_id(0) < num_worker_threads) {
    #pragma unroll
    for (int _w = 0; _w < WPT; _w += 1) {
-      const int id = _w*get_global_size(0) + get_global_id(0);
+      const int id = _w*num_worker_threads + get_global_id(0);
      realV xvalue = xgm[id];
      realV yvalue = ygm[id];
      ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);