Merge branch 'master' into convgemm_multi_kernel

2018-09-16 20:01:18 +02:00 · 2018-09-16 20:01:18 +02:00 · 83ba3d4b7b
parent b7d8339012 c163868e18
commit 83ba3d4b7b
60 changed files with 618 additions and 431 deletions
--- a/4
+++ b/4
@ -1,7 +1,11 @@

 Development (next version)
 - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
+- Added an option to compile the Netlib API with static OpenCL device and context (-DNETLIB_PERSISTENT_OPENCL=ON)
+- The tuners now check beforehand on invalid local thread sizes and skip those completely
+- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
+- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
  * SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -32,9 +32,25 @@ option(SAMPLES "Enable compilation of the examples" OFF)
 option(TUNERS "Enable compilation of the tuners" ON)
 option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
 option(TESTS "Enable compilation of the correctness tests" OFF)
-option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
 option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)

+# The optional Netlib API for CLBlast
+option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
+option(NETLIB_PERSISTENT_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF)
+if(NETLIB)
+  message("-- Building the Netlib API of CLBlast")
+  if(NETLIB_PERSISTENT_OPENCL)
+    message("   ^^ while using static variables for OpenCL device and context")
+    add_definitions(-DNETLIB_PERSISTENT_OPENCL)
+  endif()
+endif()
+
+# Workarounds for bugs
+option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF)
+if(AMD_SI_EMPTY_KERNEL_WORKAROUND)
+  add_definitions(-DAMD_SI_EMPTY_KERNEL_WORKAROUND)
+endif()
+
 # Select between an OpenCL API (default) or a CUDA API (beta)
 option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
 option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
--- a/README.md
+++ b/README.md
@ -101,6 +101,10 @@ Other known issues:

 * The AMD run-time compiler has a bug causing it to get stuck in an infinite loop. This is reported to happen occasionally when tuning the CLBlast GEMM routine.

+* AMD Southern Island GPUs might cause wrong results with the amdgpu-pro drivers. Do configure CMake with `AMD_SI_EMPTY_KERNEL_WORKAROUND` to resolve the issue, [see issue #301](https://github.com/CNugteren/CLBlast/issues/301).
+
+* Tests might fail on an Intel IvyBridge GPU with the latest Beignet. Please downgrade Beignet to 1.2.1, [see issue #231](https://github.com/CNugteren/CLBlast/issues/231).
+

 Contributing
 -------------
--- a/doc/api.md
+++ b/doc/api.md
@ -3512,7 +3512,7 @@ Arguments to FillCache:
 RetrieveParameters: Retrieves current tuning parameters (auxiliary function)
 -------------

-This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection.
+This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.

 C++ API:
 ```
@ -3535,7 +3535,7 @@ Arguments to RetrieveParameters (C++ version):
 OverrideParameters: Override tuning parameters (auxiliary function)
 -------------

-This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel.
+This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.

 C++ API:
 ```
--- a/doc/bindings.md
+++ b/doc/bindings.md
@ -30,3 +30,9 @@ Nim: nim-CLBlast (3rd party)
 -------------

 A 3rd party CLBlast wrapper for the nim language is available [here](https://github.com/numforge/nim-clblast).
+
+
+Julia: CLBlast.jl (3rd party)
+-------------
+
+A 3rd party CLBlast wrapper for [Julia](https://julialang.org/) is available [here](https://github.com/JuliaGPU/CLBlast.jl).
--- a/doc/tuning.md
+++ b/doc/tuning.md
@ -195,6 +195,26 @@ To inspect current behaviour, you can also retrieve the parameters for a specifi
                                             const Precision precision,
                                             std::unordered_map<std::string,size_t> &parameters)

+These two functions require/retrieve the parameters as given in [src/database/kernels](../src/database/kernels), i.e.:
+
+| Kernel name         | Parameters            |
+| --------------------|-----------------------|
+| Xaxpy               |  VW, WGS, WPT         |
+| Xdot                |  WGS1, WGS2           |
+| Xgemv               |  WGS1, WPT1, UNROLL1  |
+| XgemvFast           |  VW2, WGS2, WPT2      |
+| XgemvFastRot        |  VW3, WGS3, WPT3      |
+| Xger                |  WGS1, WGS2, WPT      |
+| Xtrsv               |  TRSV_BLOCK_SIZE      |
+| Xgemm               |  GEMMK, KREG, KWG, KWI, MDIMA, MDIMC, MWG, NDIMB, NDIMC, NWG, SA, SB, STRM, STRN, VWM, VWN |
+| XgemmDirect         |  KWID, MDIMAD, MDIMCD, NDIMBD, NDIMCD, PADA, PADB, VWMD, VWND, WGD |
+| Copy                |  COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT |
+| Pad                 |  PAD_DIMX, PAD_DIMY, PAD_WPTX, PAD_WPTY |
+| Transpose           |  TRA_DIM, TRA_PAD, TRA_SHUFFLE, TRA_WPT |
+| Padtranspose        |  PADTRA_PAD, PADTRA_TILE, PADTRA_WPT |
+| Invert              |  INTERNAL_BLOCK_SIZE  |
+| TrsvRoutine         |  TRSV_BLOCK_SIZE      |
+

 Tuning OpenCL compiler options
 -------------
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -49,7 +49,7 @@ FILES = [
    "/src/clblast_cuda.cpp",
    "/src/pyclblast/src/pyclblast.pyx"
 ]
-HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 32, 95, 21, 290]
+HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 40, 95, 21, 290]
 FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 232
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@ -145,8 +145,8 @@ def clblast_netlib_c_cc(routine):
            result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL

            # Initialize OpenCL
-            result += "  auto device = get_device();" + NL
-            result += "  auto context = clblast::Context(device);" + NL
+            result += "  OPTIONAL_STATIC auto device = get_device();" + NL
+            result += "  OPTIONAL_STATIC auto context = clblast::Context(device);" + NL
            result += "  auto queue = clblast::Queue(context, device);" + NL

            # Set alpha and beta
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -447,8 +447,14 @@ class Program {

  // Source-based constructor with memory management
  explicit Program(const Context &context, const std::string &source) {
-    const char *source_ptr = &source[0];
-    const auto length = source.length();
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n";
+      const char *source_ptr = &source_null_kernel[0];
+      const auto length = source_null_kernel.length();
+    #else
+      const char *source_ptr = &source[0];
+      const auto length = source.length();
+    #endif
    auto status = CL_SUCCESS;
    program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
    CLCudaAPIError::Check(status, "clCreateProgramWithSource");
@ -723,9 +729,10 @@ class Buffer {
  }

  // Copies the contents of this buffer into another device buffer
-  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination,
+                   EventPointer event = nullptr) const {
    CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
-                                   nullptr, nullptr));
+                                   nullptr, event));
  }
  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
    CopyToAsync(queue, size, destination);
@ -764,10 +771,21 @@ class Kernel {
      kernel_(new cl_kernel, [](cl_kernel* k) {
        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
        delete k;
-      }) {
+      })
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      , null_kernel_(new cl_kernel, [](cl_kernel* k) {
+        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
+        delete k;
+      })
+    #endif
+  {
    auto status = CL_SUCCESS;
    *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
    CLCudaAPIError::Check(status, "clCreateKernel");
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status);
+      CLCudaAPIError::Check(status, "clCreateKernel");
+    #endif
  }

  // Sets a kernel argument at the indicated position
@ -831,12 +849,21 @@ class Kernel {
                                      static_cast<cl_uint>(waitForEventsPlain.size()),
                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                      event));
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      const std::vector<size_t> nullRange = {1};
+      CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()),
+                                        nullptr, nullRange.data(), nullptr,
+                                        0, nullptr, nullptr));
+    #endif
  }

  // Accessor to the private data-member
  const cl_kernel& operator()() const { return *kernel_; }
 private:
  std::shared_ptr<cl_kernel> kernel_;
+  #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+    std::shared_ptr<cl_kernel> null_kernel_;
+  #endif

  // Internal implementation for the recursive SetArguments function.
  template <typename T>
--- a/src/database/kernels/copy/copy_3232.hpp
+++ b/src/database/kernels/copy/copy_3232.hpp
@ -88,6 +88,7 @@ const DatabaseEntry CopyComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 32, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@ -33,6 +33,7 @@ const DatabaseEntry GemmRoutineSingle = {
      kDeviceTypeGPU, "Intel", {
        { "default", {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
@ -62,7 +63,7 @@ const DatabaseEntry GemmRoutineSingle = {
    { // Default
      kDeviceTypeAll, "default", {
        { "default", {
-          { kDeviceNameDefault                                        , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
      }
    },
--- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@ -24,6 +24,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "default", {
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
@ -49,7 +50,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
    { // Default
      kDeviceTypeAll, "default", {
        { "default", {
-          { kDeviceNameDefault                                        , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
      }
    },
--- a/src/database/kernels/invert/invert_32.hpp
+++ b/src/database/kernels/invert/invert_32.hpp
@ -24,6 +24,7 @@ const DatabaseEntry InvertSingle = {
      kDeviceTypeGPU, "Intel", {
        { "default", {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
--- a/src/database/kernels/invert/invert_3232.hpp
+++ b/src/database/kernels/invert/invert_3232.hpp
@ -23,6 +23,7 @@ const DatabaseEntry InvertComplexSingle = {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "default", {
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@ -88,6 +88,7 @@ const DatabaseEntry PadComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@ -88,10 +88,11 @@ const DatabaseEntry PadtransposeComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { kDeviceNameDefault                                        , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
      }
    },
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@ -88,6 +88,7 @@ const DatabaseEntry TransposeComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
@ -24,6 +24,7 @@ const DatabaseEntry TrsvRoutineSingle = {
      kDeviceTypeGPU, "Intel", {
        { "default", {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
--- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
@ -23,6 +23,7 @@ const DatabaseEntry TrsvRoutineComplexSingle = {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "default", {
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@ -88,6 +88,7 @@ const DatabaseEntry XaxpyComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@ -84,7 +84,7 @@ const DatabaseEntry XdotSingle = {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@ -82,6 +82,7 @@ const DatabaseEntry XdotComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@ -90,7 +90,7 @@ const DatabaseEntry XgemmSingle = {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 4, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 0, 1, 32, 2, 32, 8, 64, 16, 16, 128, 0, 0, 0, 1, 1, 2 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 1, 1, 1, 1, 2, 4 } },
-          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 128, 1, 1, 0, 1, 1, 4 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
          { Name{"Iris                                              "}, Params{ 0, 1, 16, 8, 16, 8, 128, 32, 16, 64, 1, 1, 1, 1, 4, 1 } },
          { Name{"Iris Pro                                          "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 32, 128, 1, 1, 1, 0, 4, 4 } },
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@ -88,10 +88,11 @@ const DatabaseEntry XgemmComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 16, 8, 8, 8, 32, 16, 16, 64, 1, 0, 0, 0, 4, 4 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 32, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 1, 2, 1 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 128, 0, 0, 0, 0, 4, 1 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
          { Name{"Iris                                              "}, Params{ 0, 1, 32, 8, 32, 16, 64, 8, 16, 64, 1, 0, 1, 0, 1, 1 } },
          { Name{"Iris Pro                                          "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 1, 1, 1, 1, 1, 1 } },
-          { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 32, 128, 1, 1, 1, 0, 2, 2 } },
+          { kDeviceNameDefault                                        , Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
        } },
      }
    },
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@ -69,7 +69,7 @@ const DatabaseEntry XgemmDirectSingle = {
        { "default", {
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
-          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 8, 16, 8, 16, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 8, 16, 32, 16, 8, 1, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@ -63,9 +63,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
      kDeviceTypeGPU, "Intel", {
        { "default", {
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
-          { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
        } },
      }
    },
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@ -86,6 +86,7 @@ const DatabaseEntry XgemvComplexSingle = {
          { Name{"Intel(R) HD Graphics 530                          "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
@ -92,7 +92,7 @@ const DatabaseEntry XgemvFastSingle = {
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { kDeviceNameDefault                                        , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
        } },
      }
    },
--- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
@ -86,6 +86,7 @@ const DatabaseEntry XgemvFastComplexSingle = {
          { Name{"Intel(R) HD Graphics 530                          "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris                                              "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@ -70,7 +70,7 @@ const DatabaseEntry XgemvFastRotSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 2, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@ -68,6 +68,7 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
        { "default", {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@ -87,6 +87,7 @@ const DatabaseEntry XgerComplexSingle = {
          { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics 620                          "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { Name{"Intel(R) HD Graphics IvyBridge M GT2              "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { Name{"Iris Pro                                          "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
          { kDeviceNameDefault                                        , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
--- a/src/kernel_preprocessor.cpp
+++ b/src/kernel_preprocessor.cpp
@ -557,6 +557,8 @@ std::string PreprocessKernelSource(const std::string& kernel_source) {
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, true);

  // Gather the results
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@ -43,8 +43,6 @@
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(

-// =================================================================================================
-
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef GEMMK
@ -397,9 +395,7 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int
 }
 #endif

-// =================================================================================================
-
-// End of the C++11 raw string literal
 )"
+// End of the C++11 raw string literal

 // =================================================================================================
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@ -15,8 +15,6 @@
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(

-// =================================================================================================
-
 // The vectorised multiply-add function
 INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
  #if USE_VECTOR_MAD == 1
@ -171,9 +169,7 @@ INLINE_FUNC void StoreResults(__global realM* cgm, realM c_value, const int _mi,
  cgm[index] = result;
 }

-// =================================================================================================
-
-// End of the C++11 raw string literal
 )"
+// End of the C++11 raw string literal

 // =================================================================================================
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@ -15,14 +15,12 @@
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(

-// =================================================================================================
-
 // A common interface for subgroup functions

 #if USE_SUBGROUP_SHUFFLING == 1

 INLINE_FUNC int clblast_get_sub_group_local_id() {
-  
+
  // Intel extension 
  #if SUBGROUP_SHUFFLING_INTEL == 1
  return get_sub_group_local_id();
@ -36,7 +34,7 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
 }

 INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
-  
+
  // Intel extension 
  #if SUBGROUP_SHUFFLING_INTEL == 1
  return intel_sub_group_shuffle(reg, src);
@ -238,48 +236,47 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
            for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
              #pragma unroll
              for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
-                const int index =  _ni * (MWI/VWM) + _mi;
                #if USE_SUBGROUP_SHUFFLING == 1
                  const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
                #else
                  const realN aval = apm[_ni * (KREG/VWN) + _ki];
                #endif
                #if VWN == 1
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
                #elif VWN == 2
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
                #elif VWN == 4
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
                #elif VWN == 8
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
                #elif VWN == 16
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
-                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
+                  cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
                #endif
              }
            }
@ -311,9 +308,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
  }
 }

-// =================================================================================================
-
-// End of the C++11 raw string literal
 )"
+// End of the C++11 raw string literal

 // =================================================================================================
--- a/src/kernels/level3/xgemm_part4.opencl
+++ b/src/kernels/level3/xgemm_part4.opencl
@ -15,7 +15,6 @@
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(

-// =================================================================================================
 // The upper-triangular and lower-triangular kernels are only used in special cases
 #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)

@ -132,9 +131,8 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
 }

 #endif
-// =================================================================================================

-// End of the C++11 raw string literal
 )"
+// End of the C++11 raw string literal

 // =================================================================================================
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,

  // Determines the right kernel
  auto kernel_name = std::string{};
+  auto pad_kernel = false;
  if (do_transpose) {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db["TRA_WPT"]) &&
@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
    }
    else {
      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+      pad_kernel = (do_pad || do_conjugate);
+      kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
    }
  }
  else {
@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
    }
    else {
      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+      pad_kernel = do_pad;
+      kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
    }
  }

@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
    kernel.SetArgument(8, static_cast<int>(dest_offset));
    kernel.SetArgument(9, dest());
    kernel.SetArgument(10, GetRealArg(alpha));
-    if (do_pad) {
+    if (pad_kernel) {
      kernel.SetArgument(11, static_cast<int>(do_conjugate));
    }
    else {
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@ -33,7 +33,8 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            EventPointer event) {

  if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };

@ -69,9 +70,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
  // Launches the kernel
  const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
  const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
-  auto event = Event();
-  RunKernel(kernel, queue_, device_, global, local, event.pointer());
-  event.WaitForCompletion();
+  RunKernel(kernel, queue_, device_, global, local, event);
 }

 // =================================================================================================
@ -146,14 +145,16 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
    }

    // Runs the triangular substitution for the block size
+    auto sub_event = Event();
    Substitution(layout, triangle, a_transpose, diagonal, block_size,
                 a_buffer, a_offset + col + col*a_ld, a_ld,
                 b_buffer, b_offset + col*b_inc, b_inc,
-                 x_buffer, x_offset + col*x_inc, x_inc);
+                 x_buffer, x_offset + col*x_inc, x_inc, sub_event.pointer());
+    sub_event.WaitForCompletion();
  }

  // Retrieves the results
-  x_buffer.CopyTo(queue_, x_size, b_buffer);
+  x_buffer.CopyToAsync(queue_, x_size, b_buffer, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@ -32,6 +32,7 @@ class Xtrsv: public Xgemv<T> {
  using Xgemv<T>::device_;
  using Xgemv<T>::db_;
  using Xgemv<T>::program_;
+  using Xgemv<T>::event_;
  using Xgemv<T>::DoGemv;

  // Constructor
@ -50,7 +51,8 @@ class Xtrsv: public Xgemv<T> {
                    const size_t n,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
-                    const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
+                    const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc,
+                    EventPointer event);
 };

 // =================================================================================================
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@ -40,6 +40,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part3.opencl"
    #include "../../kernels/level3/xgemm_part4.opencl"
    }) {
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@ -32,6 +32,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part3.opencl"
    #include "../../kernels/level3/xgemm_part4.opencl"
    }) {
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@ -32,6 +32,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part3.opencl"
    #include "../../kernels/level3/xgemm_part4.opencl"
    }) {
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@ -246,7 +246,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
  }

  // Retrieves the results
-  x_buffer.CopyTo(queue_, b_size, b_buffer);
+  x_buffer.CopyToAsync(queue_, b_size, b_buffer, event_);
 }

 // =================================================================================================
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@ -31,6 +31,7 @@ class Xtrsm: public Xgemm<T> {
  using Xgemm<T>::device_;
  using Xgemm<T>::db_;
  using Xgemm<T>::program_;
+  using Xgemm<T>::event_;
  using Xgemm<T>::DoGemm;

  // Constructor
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@ -38,6 +38,7 @@ XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::strin
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/xgemm_part3.opencl"
    #include "../../kernels/level3/xgemm_part4.opencl"
    , // separated in multiple parts to prevent C1091 in MSVC 2013
--- a/src/routines/levelx/xgemmstridedbatched.cpp
+++ b/src/routines/levelx/xgemmstridedbatched.cpp
@ -37,6 +37,7 @@ XgemmStridedBatched<T>::XgemmStridedBatched(Queue &queue, EventPointer event, co
            , // separated in multiple parts to prevent C1091 in MSVC 2013
            #include "../../kernels/level3/xgemm_part1.opencl"
            #include "../../kernels/level3/xgemm_part2.opencl"
+            , // separated in multiple parts to prevent C1091 in MSVC 2013
            #include "../../kernels/level3/xgemm_part3.opencl"
            #include "../../kernels/level3/xgemm_part4.opencl"
            , // separated in multiple parts to prevent C1091 in MSVC 2013
--- a/src/tuning/configurations.cpp
+++ b/src/tuning/configurations.cpp
@ -23,28 +23,42 @@ namespace clblast {
 // Finds all configurations. It also applies the user-defined constraints within.
 std::vector<Configuration> SetConfigurations(const Device& device,
                                             const std::vector<Parameter> parameters,
+                                             const std::vector<size_t>& local_size_base,
+                                             const TransformVector& mul_local_config,
+                                             const TransformVector& div_local_config,
                                             const Constraints& constraints,
                                             const LocalMemSizeInfo& local_mem_size_info) {
  const auto local_mem_max = device.LocalMemSize();
+  const auto max_work_item_sizes = device.MaxWorkItemSizes();
+  const auto max_work_group_size = device.MaxWorkGroupSize();
  auto config = Configuration();
  auto configurations = std::vector<Configuration>();
-  PopulateConfigurations(parameters, 0, config, configurations,
-                         local_mem_max, constraints, local_mem_size_info);
+  PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+                         0, config, configurations,
+                         local_mem_max, constraints, local_mem_size_info,
+                         max_work_item_sizes, max_work_group_size);
  return configurations;
 }

 // Iterates recursively over all permutations of the user-defined parameters
 void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const std::vector<size_t> local_size_base,
+                            const TransformVector& mul_local_config,
+                            const TransformVector& div_local_config,
                            const size_t index, const Configuration &config,
                            std::vector<Configuration> &configuration,
                            const size_t local_mem_max,
                            const Constraints& constraints,
-                            const LocalMemSizeInfo& local_mem_size_info) {
+                            const LocalMemSizeInfo& local_mem_size_info,
+                            const std::vector<size_t>& max_work_item_sizes,
+                            const size_t max_work_group_size) {

  // End of the chain: all parameters are considered, store the resulting configuration if it is a
  // valid one according to the constraints
  if (index == parameters.size()) {
-    if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) {
+    if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info,
+                           local_size_base, mul_local_config, div_local_config,
+                           max_work_item_sizes, max_work_group_size)) {
      configuration.push_back(config);
    }
    return;
@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
  for (auto &value: parameter.second) {
    auto config_copy = config;
    config_copy[parameter.first] = value;
-    PopulateConfigurations(parameters, index+1, config_copy, configuration,
-                           local_mem_max, constraints, local_mem_size_info);
+    PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+                           index+1, config_copy, configuration,
+                           local_mem_max, constraints, local_mem_size_info,
+                           max_work_item_sizes, max_work_group_size);
  }
 }

@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
 bool ValidConfiguration(const Configuration &config,
                        const size_t local_mem_max,
                        const Constraints& constraints,
-                        const LocalMemSizeInfo& local_mem_size_info) {
+                        const LocalMemSizeInfo& local_mem_size_info,
+                        const std::vector<size_t> local_size_base,
+                        const TransformVector& mul_local_config,
+                        const TransformVector& div_local_config,
+                        const std::vector<size_t>& max_work_item_sizes,
+                        const size_t max_work_group_size) {

  // Iterates over all constraints
  for (auto &constraint: constraints) {
@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config,
    return false;
  }

+  // Checks the local thread size (both per dimension and in total)
+  const auto local = SetThreadConfiguration(config, local_size_base,
+                                            mul_local_config, div_local_config);
+  for (auto i=size_t{0}; i<local.size(); ++i) {
+    if (local[i] > max_work_item_sizes[i]) {
+      return false;
+    }
+  }
+  auto local_size = size_t{1};
+  for (auto &item: local) { local_size *= item; }
+  if (local_size > max_work_group_size) {
+    return false;
+  }
+
  // Everything was OK: this configuration is valid
  return true;
 }
--- a/src/tuning/configurations.hpp
+++ b/src/tuning/configurations.hpp
@ -50,6 +50,9 @@ struct LocalMemSizeInfo {
 // function to find all configurations. It also applies the user-defined constraints within.
 std::vector<Configuration> SetConfigurations(const Device& device,
                                             const std::vector<Parameter> parameters,
+                                             const std::vector<size_t>& local_size_base,
+                                             const TransformVector& mul_local_config,
+                                             const TransformVector& div_local_config,
                                             const Constraints& constraints,
                                             const LocalMemSizeInfo& local_mem_size_info);

@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device,
 // At the end of each chain (when all parameters are considered), the function stores the result
 // into the configuration list.
 void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const std::vector<size_t> local_size_base,
+                            const TransformVector& mul_local_config,
+                            const TransformVector& div_local_config,
                            const size_t index, const Configuration &config,
                            std::vector<Configuration> &configuration,
                            const size_t local_mem_max,
                            const Constraints& constraints,
-                            const LocalMemSizeInfo& local_mem_size_info);
+                            const LocalMemSizeInfo& local_mem_size_info,
+                            const std::vector<size_t>& max_work_item_sizes,
+                            const size_t max_work_group_size);

 // Loops over all user-defined constraints to check whether or not the configuration is valid.
 // Assumes initially all configurations are valid, then returns false if one of the constraints has
@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
 bool ValidConfiguration(const Configuration &config,
                        const size_t local_mem_max,
                        const Constraints& constraints,
-                        const LocalMemSizeInfo& local_mem_size_info);
+                        const LocalMemSizeInfo& local_mem_size_info,
+                        const std::vector<size_t> local_size_base,
+                        const TransformVector& mul_local_config,
+                        const TransformVector& div_local_config,
+                        const std::vector<size_t>& max_work_item_sizes,
+                        const size_t max_work_group_size);

 // Processes multipliers and dividers to obtain the final thread configuration
 std::vector<size_t> SetThreadConfiguration(const Configuration& config,
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@ -33,9 +33,13 @@ void StartVariation(int argc, char *argv[]) {

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
+  printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n");
  StartVariation<1>(argc, argv);
+  printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n");
  StartVariation<2>(argc, argv);
+  printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n");
  StartVariation<11>(argc, argv);
+  printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n");
  StartVariation<12>(argc, argv);
  return 0;
 }
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@ -50,6 +50,8 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
  settings.sources +=
 #include "../src/kernels/level3/xgemm_part1.opencl"
 #include "../src/kernels/level3/xgemm_part2.opencl"
+  ;
+  settings.sources +=
 #include "../src/kernels/level3/xgemm_part3.opencl"
 #include "../src/kernels/level3/xgemm_part4.opencl"
  ;
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V,
  }

  // Sets the tunable parameters and their possible values
-  auto configurations = SetConfigurations(device, settings.parameters,
+  auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+                                          settings.mul_local, settings.div_local,
                                          SetConstraints(V), ComputeLocalMemSize(V));
  printf("* Found %s%zu configuration(s)%s\n",
         kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
  }

  // Sets the tunable parameters and their possible values
-  auto configurations = SetConfigurations(device, settings.parameters,
+  auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+                                          settings.mul_local, settings.div_local,
                                          SetConstraints(V), ComputeLocalMemSize(V));

  // Select the search method (full search or a random fraction)
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@ -59,7 +59,8 @@ std::shared_ptr<Program> CompileFromSource(
  }

  // For Intel GPUs with subgroup support, use subgroup shuffling.
-  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) &&
+      (precision == Precision::kSingle || precision == Precision::kHalf)) {
    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
    header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
  }
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@ -221,7 +221,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
  ;
  if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }

-  // GEMM (in-direct)
+  // GEMM (in-direct) GEMMK==0
  const auto gemm_sources =
    "#define KWI 2\n"
    "#define MWG 16\n"
@ -234,6 +234,18 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
  ;
  if (TestKernel(device, context, "Xgemm", gemm_sources, precision)) { passed++; } else { errors++; }

+  // GEMM (in-direct) GEMMK==1
+  const auto gemm_sources_gemmk1 =
+    "#define MWG 16\n"
+    "#define NWG 16\n"
+    "#define GEMMK 1\n"
+    #include "../src/kernels/level3/xgemm_part1.opencl"
+    #include "../src/kernels/level3/xgemm_part2.opencl"
+    #include "../src/kernels/level3/xgemm_part3.opencl"
+    #include "../src/kernels/level3/xgemm_part4.opencl"
+  ;
+  if (TestKernel(device, context, "Xgemm", gemm_sources_gemmk1, precision)) { passed++; } else { errors++; }
+
  // GEMM (direct)
  const auto gemm_direct_sources =
    "#define KWID 2\n"
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
    }

    // Tests the error count (should be zero)
-    TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+    TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args);
  }
  TestEnd();
 }
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
      const auto b_two = (b_rotated) ? id1 : id2;
      const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
      const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
-      buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
+      auto a_value = buffers_host.a_mat[a_index];
+      if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
+      buffers_host.b_mat[b_index] = args.alpha * a_value;
    }
  }
  return StatusCode::kSuccess;
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value

 // =================================================================================================

+// Performs a complex conjugate if complex
+template <typename T> T ComplexConjugate(const T value) { return value; }
+template half ComplexConjugate(const half);
+template float ComplexConjugate(const float);
+template double ComplexConjugate(const double);
+template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; }
+template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; }
+
+// =================================================================================================
+
 template <typename T, typename U>
 void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
                  Queue &queue, const std::vector<std::string> &names) {
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@ -70,6 +70,10 @@ struct BuffersHost {

 // =================================================================================================

+template <typename T> T ComplexConjugate(const T value);
+
+// =================================================================================================
+
 // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
 // data-types such as the Layout and Transpose data-types.
 template <typename T>