Merge pull request #325 from CNugteren/CLBlast-321-axpy-faster-kernel-bug
Fixed a bug in the XaxpyFaster kernel for specific parameters
pull/329/head
commit
e33542acdd
|
@ -7,6 +7,7 @@ Development (next version)
|
|||
- Fixed an issue with conjugate transpose not being executed in certain cases for, among others, XOMATCOPY
|
||||
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
|
||||
- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
|
||||
- Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
|
||||
- Various minor fixes and enhancements
|
||||
- Added non-BLAS routines:
|
||||
* SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
|
||||
|
|
|
@ -43,10 +43,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
|
|||
__global realV* ygm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
if (get_global_id(0) < n / (VW)) {
|
||||
const int num_worker_threads = n / (VW * WPT);
|
||||
if (get_global_id(0) < num_worker_threads) {
|
||||
#pragma unroll
|
||||
for (int _w = 0; _w < WPT; _w += 1) {
|
||||
const int id = _w*get_global_size(0) + get_global_id(0);
|
||||
const int id = _w*num_worker_threads + get_global_id(0);
|
||||
realV xvalue = xgm[id];
|
||||
realV yvalue = ygm[id];
|
||||
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
|
||||
|
|
Loading…
Reference in New Issue