implemented changes to boost Adreno performance according to https://jira-dc.qualcomm.com/jira/browse/OSR-8731

parent 03cffa83c5
commit 4f394608a2
@@ -365,6 +365,14 @@ class Device {
     return false;
   }
 
+  // Returns the Qualcomm Adreno GPU version (e.g. a650, a730 or a740)
+  std::string AdrenoVersion() const {
+    if (IsQualcomm()) {
+      return GetInfoString(CL_DEVICE_OPENCL_C_VERSION);
+    }
+    else { return std::string{""}; }
+  }
+
   // Retrieves the above extra information (if present)
   std::string GetExtraInfo() const {
     if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
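Adreno drivers report the GPU model inside the CL_DEVICE_OPENCL_C_VERSION string (for example "OpenCL C 2.0 Adreno(TM) 650"), which is why AdrenoVersion() can simply reuse GetInfoString(). A minimal sketch of how a caller might pull out the bare model number; ExtractAdrenoModel is a hypothetical helper, not part of this commit, and it assumes that exact string layout:

    #include <string>

    // Hypothetical helper: turns "OpenCL C 2.0 Adreno(TM) 650" into "650".
    // Returns an empty string when the marker is absent (non-Adreno devices).
    std::string ExtractAdrenoModel(const std::string& opencl_c_version) {
      const std::string marker = "Adreno(TM) ";
      const auto pos = opencl_c_version.find(marker);
      if (pos == std::string::npos) { return std::string{}; }
      return opencl_c_version.substr(pos + marker.size());
    }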
@@ -135,6 +135,7 @@ const DatabaseEntry CopySingle = {
         { Name{"GeForce GTX 670 "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         { Name{"GeForce GTX 680 "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        { Name{"Quadro K600 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
       } },
       { "SM3.5", {
@@ -371,6 +371,25 @@ std::vector<std::string> PreprocessDefinesAndComments(const std::string& source,
       defines_string.emplace(name, value);
     }
 
+    // Detect #undef macros
+    // When USE_SUBGROUP_SHUFFLING is set but the kernel parameters do not satisfy the conditions
+    // for subgroup shuffling, USE_SUBGROUP_SHUFFLING needs to be unset during preprocessing
+    // to avoid GEMM kernel errors. See src/kernels/level3/xgemm_part1.opencl line 142.
+    // In this preprocessor, macros are never redefined because of the behaviour of std::map::emplace
+    const auto undef_pos = line.find("#undef ");
+    if (undef_pos != std::string::npos) {
+      const auto undef = line.substr(undef_pos + 7); // length of "#undef "
+      // Checks whether the definition is present in defines_int and/or defines_string, then removes it
+      auto int_undef = defines_int.find(undef);
+      if (int_undef != defines_int.end()) {
+        defines_int.erase(int_undef);
+      }
+      auto string_undef = defines_string.find(undef);
+      if (string_undef != defines_string.end()) {
+        defines_string.erase(string_undef);
+      }
+    }
+
     // Detect #ifndef blocks
     const auto ifndef_pos = line.find("#ifndef ");
     if (ifndef_pos != std::string::npos) {
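The remark about std::map::emplace is the key detail here: emplace never overwrites an existing key, so without this #undef handling a macro could never change its value once defined. A small self-contained C++ illustration of that behaviour (plain standard-library code, not taken from CLBlast):

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, int> defines_int;
      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 1);
      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 0);  // ignored: emplace does not overwrite
      std::cout << defines_int.at("USE_SUBGROUP_SHUFFLING") << "\n";  // prints 1

      // Emulates '#undef USE_SUBGROUP_SHUFFLING': after erasing, a later
      // '#define' (another emplace) can take effect again.
      defines_int.erase("USE_SUBGROUP_SHUFFLING");
      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 0);
      std::cout << defines_int.at("USE_SUBGROUP_SHUFFLING") << "\n";  // prints 0
      return 0;
    }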
@@ -132,6 +132,12 @@ R"(
   #define USE_CL_MAD 0
 #endif
 
+// By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size
+// requirement results in worse performance, so there it is disabled (see src/utilities/compile.cpp)
+#ifndef RELAX_WORKGROUP_SIZE
+  #define RELAX_WORKGROUP_SIZE 0
+#endif
+
 // Sets a variable to zero
 #if PRECISION == 3232 || PRECISION == 6464
   #define SetToZero(a) a.x = ZERO; a.y = ZERO
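Note that reqd_work_group_size is a hard contract: a kernel carrying the attribute fails with CL_INVALID_WORK_GROUP_SIZE when enqueued with any other local size, while removing it does not change the tuned sizes CLBlast actually launches with, which is why relaxing it is correctness-neutral. The dozens of kernel hunks below repeat the same five-line guard at every definition site; a hypothetical alternative (not what this commit does) would centralise the choice in a helper macro next to the default above:

    // Hypothetical helper macro, assuming the same RELAX_WORKGROUP_SIZE switch:
    // kernels would then start with 'KERNEL_WGS(WGS, 1, 1) void Xaxpy(...)'
    // instead of repeating the #if/#else guard before each definition.
    #if RELAX_WORKGROUP_SIZE == 1
      #define KERNEL_WGS(x, y, z) __kernel
    #else
      #define KERNEL_WGS(x, y, z) __kernel __attribute__((reqd_work_group_size(x, y, z)))
    #endif

The per-kernel guard chosen by the commit keeps every kernel self-contained, at the cost of repetition.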
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xamax(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global singlereal* maxgm, __global unsigned int* imaxgm) {
@@ -96,7 +100,11 @@ void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XamaxEpilogue(const __global singlereal* restrict maxgm,
                    const __global unsigned int* restrict imaxgm,
                    __global unsigned int* imax, const int imax_offset) {
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xasum(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -73,7 +77,11 @@ void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XasumEpilogue(const __global real* restrict input,
                    __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xaxpy(const int n, const real_arg arg_alpha,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses but with an if-statement. Also
 // assumes that 'n' is divisible by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFaster(const int n, const real_arg arg_alpha,
                  const __global realV* restrict xgm,
                  __global realV* ygm) {
@@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // divisible by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFastest(const int n, const real_arg arg_alpha,
                   const __global realV* restrict xgm,
                   __global realV* ygm) {
@@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses: batched version
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
                   const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
                   __global real* ygm, const __constant int* y_offsets, const int y_inc) {
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xcopy(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // divisible by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XcopyFast(const int n,
                const __global realV* restrict xgm,
                __global realV* ygm) {
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xdot(const int n,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -72,7 +76,11 @@ void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XdotEpilogue(const __global real* restrict input,
                   __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
@@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses but with an if-statement. Also
 // assumes that 'n' is divisible by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                 const __global realV* restrict xgm, const __global realV* restrict ygm,
                 __global realV* zgm) {
@@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // divisible by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                  const __global realV* restrict xgm, const __global realV* restrict ygm,
                  __global realV* zgm) {
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xnrm2(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -71,7 +75,11 @@ void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void Xnrm2Epilogue(const __global real* restrict input,
                    __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xscal(const int n, const real_arg arg_alpha,
            __global real* xgm, const int x_offset, const int x_inc) {
   const real alpha = GetRealArg(arg_alpha);
@@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // divisible by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XscalFast(const int n, const real_arg arg_alpha,
                __global realV* xgm) {
   const real alpha = GetRealArg(arg_alpha);
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xswap(const int n,
            __global real* xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -39,7 +43,11 @@ void Xswap(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // divisible by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XswapFast(const int n,
                __global realV* xgm,
                __global realV* ygm) {
@@ -210,7 +210,11 @@ INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, con
 // =================================================================================================
 
 // Full version of the kernel
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xgemv(const int m, const int n,
            const real_arg arg_alpha,
            const real_arg arg_beta,
@@ -88,7 +88,11 @@ INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int
 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XgemvFast(const int m, const int n,
                const real_arg arg_alpha,
                const real_arg arg_beta,
@@ -191,7 +195,11 @@ void XgemvFast(const int m, const int n,
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
-__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+#endif
 void XgemvFastRot(const int m, const int n,
                   const real_arg arg_alpha,
                   const real_arg arg_beta,
@@ -18,7 +18,11 @@ R"(
 // =================================================================================================
 
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
 void Xger(const int max1, const int max2,
           const real_arg arg_alpha,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
@@ -18,7 +18,11 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
 void Xher(const int n,
           const real_arg arg_alpha,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
@@ -18,7 +18,11 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
 void Xher2(const int n,
           const real_arg arg_alpha,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
@@ -39,7 +39,11 @@ void FillVector(const int n, const int inc, const int offset,
 
 // =================================================================================================
 
-__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#endif
 void trsv_forward(int n,
                   const __global real *A, const int a_offset, int a_ld,
                   __global real *b, const int b_offset, int b_inc,
@@ -87,7 +91,11 @@ void trsv_forward(int n,
   }
 }
 
-__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#endif
 void trsv_backward(int n,
                    const __global real *A, const int a_offset, int a_ld,
                    __global real *b, const int b_offset, int b_inc,
@@ -21,7 +21,11 @@ R"(
 
 // Kernel to populate a square hermitian matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void HermLowerToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix's data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void HermUpperToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -20,7 +20,11 @@ R"(
 
 // Kernel to populate a square symmetric matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void SymmLowerToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix's data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void SymmUpperToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -20,7 +20,11 @@ R"(
 
 // Kernel to populate a square triangular matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void TriaLowerToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix's data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void TriaUpperToSquared(const int src_dim,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -35,7 +35,11 @@ R"(
 
 // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
 // COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
 void CopyMatrixFast(const int ld,
                     __global const realC* restrict src,
                     __global realC* dest,
@@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
 }
 
 // Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyPadMatrix(const int src_one, const int src_two,
                    const int src_ld, const int src_offset,
                    __global const real* restrict src,
@@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
 }
 
 // Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyMatrix(const int src_one, const int src_two,
                 const int src_ld, const int src_offset,
                 __global const real* restrict src,
@@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two,
 #if defined(ROUTINE_GEMMBATCHED)
 
 // Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyPadMatrixBatched(const int src_one, const int src_two,
                           const int src_ld, const __constant int* src_offsets,
                           __global const real* restrict src,
@@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two,
 }
 
 // Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyMatrixBatched(const int src_one, const int src_two,
                        const int src_ld, const __constant int* src_offsets,
                        __global const real* restrict src,
@@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two,
 #if defined(ROUTINE_GEMMSTRIDEDBATCHED)
 
 // Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
                                  const int src_ld, const int src_offset,
                                  const int src_stride, __global const real* restrict src,
@@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
 }
 
 // Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
 void CopyMatrixStridedBatched(const int src_one, const int src_two,
                               const int src_ld, const int src_offset,
                               const int src_stride, __global const real* restrict src,
@@ -82,7 +82,11 @@ R"(
 // =================================================================================================
 
 // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
-__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#endif
 void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
                          __global real* restrict dest, const int outer_block_size,
                          const int unit_diagonal, const int is_upper)
@@ -36,7 +36,11 @@ R"(
 
 // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
 // offset. A more general version is available in 'padtranspose.opencl'.
-__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#endif
 void TransposeMatrixFast(const int ld,
                          __global const realT* restrict src,
                          __global realT* dest,
@@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
 }
 
 // Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposePadMatrix(const int src_one, const int src_two,
                         const int src_ld, const int src_offset,
                         __global const real* restrict src,
@@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile,
 }
 
 // Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposeMatrix(const int src_one, const int src_two,
                      const int src_ld, const int src_offset,
                      __global const real* restrict src,
@@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two,
 #if defined(ROUTINE_GEMMBATCHED)
 
 // Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposePadMatrixBatched(const int src_one, const int src_two,
                                const int src_ld, const __constant int* src_offsets,
                                __global const real* restrict src,
@@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two,
 }
 
 // Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposeMatrixBatched(const int src_one, const int src_two,
                             const int src_ld, const __constant int* src_offsets,
                             __global const real* restrict src,
@@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two,
 #if defined(ROUTINE_GEMMSTRIDEDBATCHED)
 
 // Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
                                       const int src_ld, const int src_offset,
                                       const int src_stride, __global const real* restrict src,
@@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
 }
 
 // Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
 void TransposeMatrixStridedBatched(const int src_one, const int src_two,
                                    const int src_ld, const int src_offset,
                                    const int src_stride, __global const real* restrict src,
@@ -19,7 +19,11 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_GEMMBATCHED)
 
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
 void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
                   const __constant real_arg* arg_alphas,
                   const __constant real_arg* arg_betas,
@@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
 // =================================================================================================
 #if defined(ROUTINE_GEMMSTRIDEDBATCHED)
 
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
 void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK,
                          const real_arg arg_alpha, const real_arg arg_beta,
                          const __global realM* restrict agm, const int a_one, const int a_two,
@@ -20,7 +20,11 @@ R"(
 #if defined(ROUTINE_GEMMBATCHED)
 
 // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
                           const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
                           const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
                           const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
                           const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
                           const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
                           const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
                           const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
                           const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
 #if defined(ROUTINE_GEMMSTRIDEDBATCHED)
 
 // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
                                  const real_arg arg_alpha, const real_arg arg_beta,
                                  const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k
 }
 
 // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
                                  const real_arg arg_alpha, const real_arg arg_beta,
                                  const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k
 }
 
 // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
                                  const real_arg arg_alpha, const real_arg arg_beta,
                                  const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k
 }
 
 // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
                                  const real_arg arg_alpha, const real_arg arg_beta,
                                  const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
 // =================================================================================================
 
 // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real_arg arg_alpha, const real_arg arg_beta,
                    const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real_arg arg_alpha, const real_arg arg_beta,
                    const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real_arg arg_alpha, const real_arg arg_beta,
                    const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real_arg arg_alpha, const real_arg arg_beta,
                    const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -19,7 +19,11 @@ R"(
 #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
 
 // Main entry point of the kernel. This is the upper-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
 void XgemmUpper(const int kSizeN, const int kSizeK,
                 const real_arg arg_alpha,
                 const real_arg arg_beta,
@@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
 }
 
 // Main entry point of the kernel. This is the lower-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
 void XgemmLower(const int kSizeN, const int kSizeK,
                 const real_arg arg_alpha,
                 const real_arg arg_beta,
@@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK,
 #else
 
 // Main entry point of the kernel. This is the regular full version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
 void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
            const real_arg arg_alpha,
            const real_arg arg_beta,
@@ -92,7 +92,11 @@ INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channel
 // =================================================================================================
 
 // Kernel flip version of the Xcol2im kernel (for convolution)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
 void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
                        const int output_h, const int output_w,
                        const int kernel_h, const int kernel_w,
@@ -113,7 +117,11 @@ void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
 }
 
 // Normal version of the Xcol2im kernel (for cross-correlation)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
 void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels,
                          const int output_h, const int output_w,
                          const int kernel_h, const int kernel_w,
@@ -74,7 +74,11 @@ INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channel
 // =================================================================================================
 
 // Kernel flip version of the Xim2col kernel (for convolution)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
 void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
                        const int output_h, const int output_w,
                        const int kernel_h, const int kernel_w,
@@ -91,7 +95,11 @@ void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
 }
 
 // Normal version of the Xim2col kernel (for cross-correlation)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
 void Xim2colKernelNormal(const int input_h, const int input_w, const int channels,
                          const int output_h, const int output_w,
                          const int kernel_h, const int kernel_w,
@@ -23,7 +23,11 @@ R"(
 
 // ConvGEMM kernel
 #if defined(CONVGEMM_WITH_IM2COL)
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size,
                const __global realND* restrict kernelgm, const int kernel_offset,
                __global real* resultgm, const int result_offset, const int result_stride,
@@ -285,7 +289,11 @@ INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const i
 }
 
 #if !defined(CONVGEMM_WITH_IM2COL)
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size,
                    const __global realND* restrict kernelgm, const int kernel_offset,
                    __global real* resultgm, const int result_offset, const int result_stride,
@@ -306,7 +314,11 @@ void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch
                 output_h, output_w, alm, blm, kernel_flip);
 }
 
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
 void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size,
                      const __global realND* restrict kernelgm, const int kernel_offset,
                      __global real* resultgm, const int result_offset, const int result_stride,
@@ -37,13 +37,13 @@ std::shared_ptr<Program> CompileFromSource(
 
   // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
   // which it is known to work with all OpenCL platforms.
-  if (device.IsNVIDIA() || device.IsARM()) {
+  if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
    header_string += "#define USE_INLINE_KEYWORD 1\n";
  }
 
  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in reduced accuracy.
-  if (device.IsAMD() && device.IsGPU()) {
+  if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) {
    header_string += "#define USE_CL_MAD 1\n";
  }
 
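mad() trades IEEE-754 rounding guarantees for speed, and the kernels consume the USE_CL_MAD switch through a single multiply-add macro. A sketch of the assumed pattern (simplified; the real macro lives in the shared kernel headers):

    // Assumed consumption site: one macro routes every multiply-add, so the
    // host-side '#define USE_CL_MAD 1' selects the faster mad() instruction.
    #if USE_CL_MAD == 1
      #define MultiplyAdd(c, a, b) c = mad(a, b, c)
    #else
      #define MultiplyAdd(c, a, b) c += a * b
    #endif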
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
 
  // For specific devices, add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
-  if (device.IsARM() && device.IsGPU()) {
+  if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) {
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }
 
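GLOBAL_MEM_FENCE is consumed inside the GEMM kernel as an optional barrier; a sketch of the guarded form (assumed, inferred from the comment above rather than copied from the kernel source):

    // Assumed pattern inside the GEMM kernel: an optional global memory fence
    // that improves cache behaviour on ARM Mali GPUs and, with this change, Adreno.
    #if GLOBAL_MEM_FENCE == 1
      barrier(CLK_GLOBAL_MEM_FENCE);
    #endif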
@@ -77,6 +77,12 @@ std::shared_ptr<Program> CompileFromSource(
      header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
    }
  }
+
+  // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
+  // This option compiles without the workgroup size requirement and does not affect correctness.
+  if (device.IsQualcomm()) {
+    header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
+  }
 
  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
  #ifdef CUDA_API
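These defines only take effect because header_string is prepended to the kernel source before the program is built, which is also why the '#ifndef RELAX_WORKGROUP_SIZE' guard in common.opencl picks up the host's choice. A condensed sketch of that flow, with hypothetical names:

    #include <string>

    // Hypothetical condensation of CompileFromSource(): device-specific defines
    // are glued in front of the kernel source and compiled as one program, so
    // '#ifndef RELAX_WORKGROUP_SIZE' only fires when the host set no value.
    std::string AssembleSource(const std::string& kernel_source, const bool is_qualcomm) {
      std::string header_string;
      if (is_qualcomm) { header_string += "#define RELAX_WORKGROUP_SIZE 1\n"; }
      return header_string + kernel_source;
    }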
@@ -463,6 +463,9 @@ std::string GetDeviceArchitecture(const Device& device) {
  else if (device.HasExtension(kKhronosAttributesAMD)) {
    device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
  }
+  else if (device.IsQualcomm()) { // queries the Adreno GPU architecture version
+    device_architecture = device.AdrenoVersion();
+  }
  // Note: no else - 'device_architecture' might be the empty string
  #endif
 