From 57f1aa76857cf0566e05b43b9b2a98a3a6139c8b Mon Sep 17 00:00:00 2001
From: "D. Van Assche"
Date: Thu, 18 Aug 2016 17:33:13 +0200
Subject: [PATCH] Adapt OpenCL files for OpenCL 1.1

In OpenCL 1.1, __kernel has to come before __attribute__, at least with
the Vivante compiler.
---
 src/kernels/level1/xamax.opencl              |  8 ++++----
 src/kernels/level1/xasum.opencl              |  8 ++++----
 src/kernels/level1/xaxpy.opencl              |  8 ++++----
 src/kernels/level1/xcopy.opencl              |  8 ++++----
 src/kernels/level1/xdot.opencl               |  8 ++++----
 src/kernels/level1/xnrm2.opencl              |  8 ++++----
 src/kernels/level1/xscal.opencl              |  8 ++++----
 src/kernels/level1/xswap.opencl              |  8 ++++----
 src/kernels/level2/xgemv.opencl              |  4 ++--
 src/kernels/level2/xgemv_fast.opencl         |  8 ++++----
 src/kernels/level2/xger.opencl               |  4 ++--
 src/kernels/level2/xher.opencl               |  4 ++--
 src/kernels/level2/xher2.opencl              |  4 ++--
 src/kernels/level3/convert_hermitian.opencl  |  8 ++++----
 src/kernels/level3/convert_symmetric.opencl  |  8 ++++----
 src/kernels/level3/convert_triangular.opencl |  8 ++++----
 src/kernels/level3/copy_fast.opencl          |  4 ++--
 src/kernels/level3/copy_pad.opencl           |  8 ++++----
 src/kernels/level3/transpose_fast.opencl     |  4 ++--
 src/kernels/level3/transpose_pad.opencl      |  8 ++++----
 src/kernels/level3/xgemm_part2.opencl        | 12 ++++++------
 21 files changed, 74 insertions(+), 74 deletions(-)

diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..763c1c0e 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,8 +30,8 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global singlereal* maxgm, __global unsigned int* imaxgm) {
   __local singlereal maxlm[WGS1];
@@ -95,8 +95,8 @@ __kernel void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
                             const __global unsigned int* restrict imaxgm,
                             __global unsigned int* imax, const int imax_offset) {
   __local singlereal maxlm[WGS2];
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1542a187 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,8 +30,8 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* output) {
   __local real lm[WGS1];
@@ -74,8 +74,8 @@ __kernel void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
                             __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index e0efadc1..73a4a535 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,8 +22,8 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const __constant real* restrict arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
   const real alpha = arg_alpha[0];
@@ -40,8 +40,8 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
                         const __global realV* restrict xgm,
                         __global realV* ygm) {
   const real alpha = arg_alpha[0];
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..c96c33ac 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,8 +22,8 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
 
@@ -38,8 +38,8 @@ __kernel void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
                         const __global realV* restrict xgm,
                         __global realV* ygm) {
   #pragma unroll
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..210aee94 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,8 +30,8 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* restrict ygm, const int y_offset, const int y_inc,
                    __global real* output, const int do_conjugate) {
@@ -73,8 +73,8 @@ __kernel void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
                            __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..3633efea 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,8 +30,8 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* output) {
   __local real lm[WGS1];
@@ -72,8 +72,8 @@ __kernel void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
                             __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..61ff5bb6 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,8 +22,8 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real alpha,
                     __global real* xgm, const int x_offset, const int x_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
@@ -40,8 +40,8 @@ __kernel void Xscal(const int n, const real alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real alpha,
                         __global realV* xgm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {

[... diffs for src/kernels/level1/xswap.opencl and src/kernels/level2/xgemv.opencl (the same reordering) and the header of the src/kernels/level2/xgemv_fast.opencl diff are missing from this excerpt ...]

 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
                         const __constant real* restrict arg_alpha,
                         const __constant real* restrict arg_beta,
                         const int a_rotated,
@@ -196,8 +196,8 @@ __kernel void XgemvFast(const int m, const int n,
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
                            const __constant real* restrict arg_alpha,
                            const __constant real* restrict arg_beta,
                            const int a_rotated,
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index 63817afb..21744799 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,8 +18,8 @@ R"(
 // =================================================================================================
 
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
                    const __constant real* restrict arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* ygm, const int y_offset, const int y_inc,
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index fc635f2e..4b304a9f 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,8 +18,8 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
                    const __constant real* restrict arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index a66f255f..8d05f020 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,8 +18,8 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
                     const __constant real* restrict arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     const __global real* restrict ygm, const int y_offset, const int y_inc,
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..272905eb 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,8 +20,8 @@ R"(
 
 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
@@ -59,8 +59,8 @@ __kernel void HermLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..ea6f7dbd 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,8 +20,8 @@ R"(
 
 // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
@@ -53,8 +53,8 @@ __kernel void SymmLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..858228bb 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,8 +20,8 @@ R"(
 
 // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
@@ -55,8 +55,8 @@ __kernel void TriaLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_dim,
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 09e54e6d..54f9d987 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,8 +35,8 @@ R"(
 
 // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
 // COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
                              __global const realC* restrict src,
                              __global realC* dest,
                              const __constant real* restrict arg_alpha) {
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d276cc60..92279ecf 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,8 +24,8 @@ R"(
 // Copies a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
 // value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
                             const int src_ld, const int src_offset,
                             __global const real* restrict src,
                             const int dest_one, const int dest_two,
@@ -65,8 +65,8 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
 // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
 // writes only the actual data back to the destination matrix. Again, the ld value and offset can
 // be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
                          const int src_ld, const int src_offset,
                          __global const real* restrict src,
                          const int dest_one, const int dest_two,
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index d5c46a30..a2007408 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,8 +36,8 @@ R"(
 
 // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
 // offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
                                   __global const realT* restrict src,
                                   __global realT* dest,
                                   const __constant real* restrict arg_alpha) {
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2de0c7bd..63cc6e9a 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,8 +24,8 @@ R"(
 
 // Transposes a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_one, const int dest_two,
@@ -88,8 +88,8 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
 // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
 // padded source matrix, but only the actual data is written back to the transposed destination
 // matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
                               const int src_ld, const int src_offset,
                               __global const real* restrict src,
                               const int dest_one, const int dest_two,
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 42c1127c..60e38c06 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -268,8 +268,8 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
 
 // Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
                          const __constant real* restrict arg_alpha,
                          const __constant real* restrict arg_beta,
                          const __global realM* restrict agm,
@@ -308,8 +308,8 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
 }
 
 // Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
                          const __constant real* restrict arg_alpha,
                          const __constant real* restrict arg_beta,
                          const __global realM* restrict agm,
@@ -352,8 +352,8 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
 #else
 
 // Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
                     const __constant real* restrict arg_alpha,
                     const __constant real* restrict arg_beta,
                     const __global realM* restrict agm,
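
For reference, every hunk above applies the same pure reordering of the kernel
qualifiers; nothing else in the signatures or attributes changes. A minimal
before/after sketch, using a hypothetical kernel named Example rather than any
of the kernels in this patch:

    // Rejected by the Vivante OpenCL 1.1 compiler: attribute before __kernel
    __attribute__((reqd_work_group_size(64, 1, 1)))
    __kernel void Example(__global float* data) {
      data[get_global_id(0)] *= 2.0f;
    }

    // Accepted: __kernel first, the order the OpenCL specification uses in
    // its own examples
    __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
    void Example(__global float* data) {
      data[get_global_id(0)] *= 2.0f;
    }

Note that reqd_work_group_size(64, 1, 1) still requires the host to enqueue
the kernel with a local work size of exactly 64; the attribute's meaning is
untouched by this patch.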