Merge pull request #15 from CNugteren/development

Update to version 0.3.0
2024-07-04 21:36:57 +02:00 · 2015-07-24 08:30:41 +02:00 · 2015-07-24 08:30:41 +02:00 · db6846b791
parent 18251df848 efbdcd2d90
commit db6846b791
127 changed files with 6575 additions and 2664 deletions
--- a/12
+++ b/12
@ -1,4 +1,16 @@
 Version 0.3.0
 - Re-organized test/client infrastructure to avoid code duplication
 - Added an optional bypass for pre/post-processing kernels in level-3 routines
 - Significantly improved performance of level-3 routines on AMD GPUs
 - Added level-3 routines:
  * CHEMM/ZHEMM
  * SSYRK/DSYRK/CSYRK/ZSYRK
  * CHERK/ZHERK
  * SSYR2K/DSYR2K/CSYR2K/ZSYR2K
  * CHER2K/ZHER2K
  * STRMM/DTRMM/CTRMM/ZTRMM
 Version 0.2.0
 - Added support for complex conjugate transpose
 - Several host-code performance improvements
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,7 +13,7 @@
 cmake_minimum_required(VERSION 2.8.10)
 project("clblast" CXX)
 set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 2)
+set(clblast_VERSION_MINOR 3)
 set(clblast_VERSION_PATCH 0)
 # Options and their default values
@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
 set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
 set(SAMPLE_PROGRAMS sgemm)
-set(ROUTINES_XY xaxpy)
+set(LEVEL1_ROUTINES xaxpy)
-set(ROUTINES_AXY xgemv)
+set(LEVEL2_ROUTINES xgemv)
-set(ROUTINES_ABC xgemm xsymm)
+set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
-set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
+set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 # ==================================================================================================
 # Gathers all source-files
 set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
-foreach(ROUTINE ${ROUTINES})
+foreach(ROUTINE ${LEVEL1_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
 endforeach()
 foreach(ROUTINE ${LEVEL2_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
 endforeach()
 foreach(ROUTINE ${LEVEL3_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
 endforeach()
 # Creates and links the library
@ -168,33 +174,23 @@ if(TESTS)
  include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
  # Creates the common correctness-tests objects (requires CMake 2.8.8)
-  add_library(test_correctness_common OBJECT test/correctness/tester.cc)
+  add_library(test_correctness_common OBJECT
-  add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
+              test/correctness/tester.cc test/correctness/testblas.cc)
  add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
  add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
  # Compiles the correctness-tests
-  foreach(ROUTINE ${ROUTINES_XY})
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
-    add_executable(test_${ROUTINE}
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level1/${ROUTINE}.cc)
                   $<TARGET_OBJECTS:test_correctness_xy>
                   test/correctness/routines/${ROUTINE}.cc)
    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
-  foreach(ROUTINE ${ROUTINES_AXY})
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
-    add_executable(test_${ROUTINE}
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level2/${ROUTINE}.cc)
                   $<TARGET_OBJECTS:test_correctness_axy>
                   test/correctness/routines/${ROUTINE}.cc)
    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
-  foreach(ROUTINE ${ROUTINES_ABC})
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
-    add_executable(test_${ROUTINE}
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level3/${ROUTINE}.cc)
-                   $<TARGET_OBJECTS:test_correctness_abc>
+  endforeach()
-                   test/correctness/routines/${ROUTINE}.cc)
+  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
@ -203,10 +199,19 @@ if(TESTS)
  add_library(test_performance_common OBJECT test/performance/client.cc)
  # Compiles the performance-tests
-  set(TEST_PERF_COMM )
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
  foreach(ROUTINE ${ROUTINES})
    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/${ROUTINE}.cc)
+                   test/performance/routines/level1/${ROUTINE}.cc)
  endforeach()
  foreach(ROUTINE ${LEVEL2_ROUTINES})
    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
                   test/performance/routines/level2/${ROUTINE}.cc)
  endforeach()
  foreach(ROUTINE ${LEVEL3_ROUTINES})
    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
                   test/performance/routines/level3/${ROUTINE}.cc)
  endforeach()
  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS client_${ROUTINE} DESTINATION bin)
  endforeach()
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
 Why CLBlast and not clBLAS or cuBLAS?
@ -109,13 +109,13 @@ Performance remarks
 The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included, performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
-The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
+The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
 * The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
 * The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
 * The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
-The graphs also show the current weak point of CLBlast: its performance for smaller matrix sizes is not too good. Furthermore, although the GEMM kernels perform well on AMD GPUs, the supporting copy and transpose kernel do not.
+The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
 These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
+CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
 | xSWAP    |   |   |   |   |         |
 | xSCAL    |   |   |   |   | +CS +ZD |
 | xCOPY    |   |   |   |   |         |
-| xAXPY    |`x`|`x`|`x`|`x`|         |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ |         |
 | xDOT     |   |   | - | - | +DS     |
 | xDOTU    | - | - |   |   |         |
 | xDOTC    | - | - |   |   |         |
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMV    |`x`|`x`|`x`|`x`|         |
+| xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xGBMV    |   |   |   |   |         |
 | xHEMV    | - | - |   |   |         |
 | xHBMV    | - | - |   |   |         |
@ -175,14 +175,14 @@ CLBlast is in active development and currently does not support the full set of
 | Level-3  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMM    |`x`|`x`|`x`|`x`|         |
+| xGEMM    | ✔ | ✔ | ✔ | ✔ |         |
-| xSYMM    |`x`|`x`|`x`|`x`|         |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ |         |
-| xHEMM    | - | - |   |   |         |
+| xHEMM    | - | - | ✔ | ✔ |         |
-| xSYRK    |   |   |   |   |         |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ |         |
-| xHERK    | - | - |   |   |         |
+| xHERK    | - | - | ✔ | ✔ |         |
-| xSYR2K   |   |   |   |   |         |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ |         |
-| xHER2K   | - | - |   |   |         |
+| xHER2K   | - | - | ✔ | ✔ |         |
-| xTRMM    |   |   |   |   |         |
+| xTRMM    | ✔ | ✔ | ✔ | ✔ |         |
 | xTRSM    |   |   |   |   |         |
@ -214,8 +214,6 @@ To-do list before release of version 1.0
 - Improve host performance:
  * Allow initialization to pre-compile kernels and store to disk
 - Improve device performance:
  * Enable 'mad()' for AMD devices
  * Improve the performance of the copy and transpose kernels
  * Tune for a wider range of devices
  * Allow users to define custom tuned parameters
 - Improve the tuning
--- a/doc/performance/GeForce_GTX480/SAXPY.pdf
+++ b/doc/performance/GeForce_GTX480/SAXPY.pdf
--- a/doc/performance/GeForce_GTX480/SGEMM.pdf
+++ b/doc/performance/GeForce_GTX480/SGEMM.pdf
--- a/doc/performance/GeForce_GTX480/SGEMV.pdf
+++ b/doc/performance/GeForce_GTX480/SGEMV.pdf
--- a/doc/performance/GeForce_GTX480/SSYMM.pdf
+++ b/doc/performance/GeForce_GTX480/SSYMM.pdf
--- a/doc/performance/Iris/SAXPY.pdf
+++ b/doc/performance/Iris/SAXPY.pdf
--- a/doc/performance/Iris/SGEMM.pdf
+++ b/doc/performance/Iris/SGEMM.pdf
--- a/doc/performance/Iris/SGEMV.pdf
+++ b/doc/performance/Iris/SGEMV.pdf
--- a/doc/performance/Iris/SSYMM.pdf
+++ b/doc/performance/Iris/SSYMM.pdf
--- a/doc/performance/Iris/SSYRK.pdf
+++ b/doc/performance/Iris/SSYRK.pdf
--- a/doc/performance/Radeon_HD7950/SAXPY.pdf
+++ b/doc/performance/Radeon_HD7950/SAXPY.pdf
--- a/doc/performance/Radeon_HD7950/SGEMM.pdf
+++ b/doc/performance/Radeon_HD7950/SGEMM.pdf
--- a/doc/performance/Radeon_HD7950/SGEMV.pdf
+++ b/doc/performance/Radeon_HD7950/SGEMV.pdf
--- a/doc/performance/Radeon_HD7950/SSYMM.pdf
+++ b/doc/performance/Radeon_HD7950/SSYMM.pdf
--- a/doc/performance/Radeon_HD7950/SSYRK.pdf
+++ b/doc/performance/Radeon_HD7950/SSYRK.pdf
--- a/doc/performance/Tesla_K40m/SAXPY.pdf
+++ b/doc/performance/Tesla_K40m/SAXPY.pdf
--- a/doc/performance/Tesla_K40m/SGEMM.pdf
+++ b/doc/performance/Tesla_K40m/SGEMM.pdf
--- a/doc/performance/Tesla_K40m/SGEMV.pdf
+++ b/doc/performance/Tesla_K40m/SGEMV.pdf
--- a/doc/performance/Tesla_K40m/SSYMM.pdf
+++ b/doc/performance/Tesla_K40m/SSYMM.pdf
--- a/doc/performance/Tesla_K40m/SSYRK.pdf
+++ b/doc/performance/Tesla_K40m/SSYRK.pdf
--- a/include/clblast.h
+++ b/include/clblast.h
@ -75,6 +75,7 @@ enum class Layout { kRowMajor, kColMajor };
 // Transpose option for matrix arguments (`a_transpose`, `b_transpose`, `ab_transpose`);
 // kConjugate presumably selects the complex conjugate transpose -- confirm
 enum class Transpose { kNo, kYes, kConjugate };
 // Passed as the `side` argument of Symm, Hemm and Trmm
 enum class Side { kLeft, kRight };
 // Passed as the `triangle` argument of the symmetric, hermitian and triangular routines
 enum class Triangle { kUpper, kLower };
 // Passed as the `diagonal` argument of Trmm
 enum class Diagonal { kUnit, kNonUnit };
 // Precision scoped enum (values in bits)
 enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@ -95,7 +96,7 @@ StatusCode Axpy(const size_t n, const T alpha,
 // Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
 template <typename T>
-StatusCode Gemv(const Layout layout, const Transpose transpose_a,
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -107,9 +108,9 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
-// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
+// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
 template <typename T>
-StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                const size_t m, const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -118,7 +119,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event);
-// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
+// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
 template <typename T>
 StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
@ -129,6 +130,81 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event);
 // Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM
 //   layout, side, triangle: layout option, side option and triangle option for the routine
 //                           (presumably with the meaning of reference BLAS xHEMM -- confirm)
 //   m, n:                   problem dimensions
 //   alpha, beta:            scalars, both of the template type T
 //   a_buffer, b_buffer:     OpenCL buffers passed as const handles, each accompanied by an offset
 //                           (`*_offset`) and a leading dimension (`*_ld`)
 //   c_buffer:               OpenCL buffer passed as a non-const handle, with its own offset and
 //                           leading dimension; presumably receives the result -- confirm
 //   queue, event:           pointers to the OpenCL command queue and event
 // Returns a StatusCode.
 template <typename T>
 StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                 const size_t m, const size_t n,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event);
 // Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
 //   layout, triangle:  layout option and triangle option for the routine
 //   a_transpose:       transpose option for matrix A
 //   n, k:              problem dimensions
 //   alpha, beta:       scalars, both of the template type T
 //   a_buffer:          OpenCL buffer passed as a const handle, with offset and leading dimension
 //   c_buffer:          OpenCL buffer passed as a non-const handle, with offset and leading
 //                      dimension; presumably the symmetric matrix being updated -- confirm
 //   queue, event:      pointers to the OpenCL command queue and event
 // Returns a StatusCode.
 template <typename T>
 StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event);
 // Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK
 // Takes the same argument list as Syrk: one const input buffer A (with offset and leading
 // dimension) and one non-const buffer C, plus the OpenCL queue and event pointers.
 // NOTE(review): alpha and beta share the single template type T here, whereas Her2k uses a
 // separate template type for beta. Presumably T is meant to be the real-valued scalar type,
 // as in reference BLAS xHERK -- confirm against the explicit instantiations.
 // Returns a StatusCode.
 template <typename T>
 StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event);
 // Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
 //   layout, triangle:    layout option and triangle option for the routine
 //   ab_transpose:        a single transpose option; judging by the name it applies to both A
 //                        and B -- confirm
 //   n, k:                problem dimensions
 //   alpha, beta:         scalars, both of the template type T
 //   a_buffer, b_buffer:  OpenCL buffers passed as const handles, each with an offset and a
 //                        leading dimension
 //   c_buffer:            OpenCL buffer passed as a non-const handle, with offset and leading
 //                        dimension; presumably the symmetric matrix being updated -- confirm
 //   queue, event:        pointers to the OpenCL command queue and event
 // Returns a StatusCode.
 template <typename T>
 StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                  const size_t n, const size_t k,
                  const T alpha,
                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                  const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                  const T beta,
                  cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                  cl_command_queue* queue, cl_event* event);
 // Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K
 // Takes the same argument list as Syr2k, except that the two scalars have independent types:
 // alpha is of template type T and beta is of template type U. Presumably T is the complex
 // type and U the matching real type, as in reference BLAS xHER2K -- confirm.
 // Returns a StatusCode.
 template <typename T, typename U>
 StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                  const size_t n, const size_t k,
                  const T alpha,
                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                  const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                  const U beta,
                  cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                  cl_command_queue* queue, cl_event* event);
 // Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
 //   layout, side, triangle:  layout option, side option and triangle option for the routine
 //   a_transpose, diagonal:   transpose option for A and the unit/non-unit diagonal option
 //   m, n:                    problem dimensions
 //   alpha:                   scalar of the template type T (this routine takes no beta)
 //   a_buffer:                OpenCL buffer passed as a const handle, with offset and leading
 //                            dimension
 //   b_buffer:                OpenCL buffer passed as a non-const handle; there is no separate C
 //                            buffer, so B is presumably overwritten with the result as in
 //                            reference BLAS xTRMM -- confirm
 //   queue, event:            pointers to the OpenCL command queue and event
 // Returns a StatusCode.
 template <typename T>
 StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                 const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 cl_command_queue* queue, cl_event* event);
 // Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM
 // NOTE: this declaration is intentionally disabled by the surrounding block comment, so Trsm
 // is not part of the public API. Its argument list mirrors Trmm. Presumably it is a
 // placeholder until the routine is implemented -- confirm before enabling.
 /*
 template <typename T>
 StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                 const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 cl_command_queue* queue, cl_event* event);
 */
 // =================================================================================================
 } // namespace clblast
--- a/include/internal/database/copy.h
+++ b/include/internal/database/copy.h
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::CopySingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
      }
    },
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
      }
    },
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
      }
    },
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
      }
    },
--- a/include/internal/database/pad.h
+++ b/include/internal/database/pad.h
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
      }
    },
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
      }
    },
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
--- a/include/internal/database/padtranspose.h
+++ b/include/internal/database/padtranspose.h
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadTraSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
      }
    },
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadTraDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
      }
    },
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
      }
    },
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadTraComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
      }
    },
--- a/include/internal/database/transpose.h
+++ b/include/internal/database/transpose.h
@ -18,24 +18,24 @@ const Database::DatabaseEntry Database::TraSingle = {
  "Transpose", Precision::kSingle, {
    { // NVIDIA GPUs
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
      }
    },
    { // Intel GPUs
      CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
+        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // Default
      CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
  }
@ -47,14 +47,14 @@ const Database::DatabaseEntry Database::TraDouble = {
  "Transpose", Precision::kDouble, {
    { // NVIDIA GPUs
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
      }
    },
    { // Intel GPUs
@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::TraDouble = {
    },
    { // Default
      CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
  }
@ -75,24 +75,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
  "Transpose", Precision::kComplexSingle, {
    { // NVIDIA GPUs
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
      }
    },
    { // Intel GPUs
      CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // Default
      CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
  }
@ -104,14 +104,14 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
  "Transpose", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
      }
    },
    { // Intel GPUs
@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
    },
    { // Default
      CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
  }
--- a/include/internal/database/xaxpy.h
+++ b/include/internal/database/xaxpy.h
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",2} } },
      }
    },
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS",256}, {"WPT",1}, {"VW",1} } },
      }
    },
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
      }
    },
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
      }
    },
--- a/include/internal/database/xgemm.h
+++ b/include/internal/database/xgemm.h
@ -25,8 +25,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
-        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
      }
    },
    { // Intel GPUs
@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
      }
    },
@ -84,13 +84,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
      }
    },
    { // Intel GPUs
      CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+        { "Iris",             { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
      }
    },
    { // Default
@ -114,7 +114,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
      }
    },
--- a/include/internal/database/xgemv.h
+++ b/include/internal/database/xgemv.h
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
      }
    },
    { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@ -34,20 +34,14 @@ class Routine {
    Program program;
    std::string device_name;
    Precision precision;
-    std::vector<std::string> routines;
+    std::string routine_name_;
    // Finds out whether the properties match
-    bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
+    bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
-                      const std::vector<std::string> &ref_routines) {
+                      const std::string &ref_routine) {
-      auto ref_size = ref_routines.size();
+      return (device_name == ref_device &&
-      if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
+              precision == ref_precision &&
-        auto found_match = true;
+              routine_name_ == ref_routine);
        for (auto i=size_t{0}; i<ref_size; ++i) {
          if (routines[i] != ref_routines[i]) { found_match = false; }
        }
        return found_match;
      }
      return false;
    }
  };
@ -58,11 +52,11 @@ class Routine {
  static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
  // Base class constructor
-  explicit Routine(CommandQueue &queue, Event &event,
+  explicit Routine(CommandQueue &queue, Event &event, const std::string &name,
                   const std::vector<std::string> &routines, const Precision precision);
  // Set-up phase of the kernel
-  StatusCode SetUp(const std::string &routine_source);
+  StatusCode SetUp();
 protected:
@ -84,15 +78,18 @@ class Routine {
  StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
                         const size_t inc, const size_t data_size);
-  // Copies/transposes a matrix and padds/unpads it
+  // Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
  // to symmetric and triangular matrices through optional arguments.
  StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
                                    const size_t src_ld, const size_t src_offset,
                                    const Buffer &src,
                                    const size_t dest_one, const size_t dest_two,
                                    const size_t dest_ld, const size_t dest_offset,
                                    const Buffer &dest,
                                    const Program &program, const bool do_pad,
                                    const bool do_transpose, const bool do_conjugate,
-                                    const bool pad, const Program &program);
+                                    const bool upper = false, const bool lower = false,
                                    const bool diagonal_imag_zero = false);
  // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
  // The first assumes that the program is available in the cache and will throw an exception
@ -104,6 +101,10 @@ class Routine {
  // a derived class.
  const Precision precision_;
  // The routine's name and its kernel-source in string form
  const std::string routine_name_;
  std::string source_string_;
  // The OpenCL objects, accessible only from derived classes
  CommandQueue queue_;
  Event event_;
@ -118,7 +119,6 @@ class Routine {
  // Connection to the database for all the device-specific parameters
  const Database db_;
  const std::vector<std::string> routines_;
 };
 // =================================================================================================
--- a/include/internal/routines/level1/xaxpy.h
+++ b/include/internal/routines/level1/xaxpy.h
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
--- a/include/internal/routines/level3/xgemm.h
+++ b/include/internal/routines/level3/xgemm.h
--- a/include/internal/routines/level3/xhemm.h
+++ b/include/internal/routines/level3/xhemm.h
@ -0,0 +1,58 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhemm routine. It is based on the generalized matrix multiplication
 // routine (Xgemm). The implementation is very similar to the Xsymm routine.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHEMM_H_
 #define CLBLAST_ROUTINES_XHEMM_H_
 #include "internal/routines/level3/xgemm.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for HEMM. It derives from Xgemm<T> and imports DoGemm from it (see the
 // using-declaration below), in line with the file header stating that this routine is based on the
 // generalized matrix multiplication routine. T is the precision/data-type template argument.
 template <typename T>
 class Xhemm: public Xgemm<T> {
 public:
  // Uses several variables from the Routine class
  using Routine::db_;
  using Routine::context_;
  // Uses several helper functions from the Routine class
  using Routine::RunKernel;
  using Routine::ErrorIn;
  using Routine::TestMatrixA;
  using Routine::GetProgramFromCache;
  // Uses the regular Xgemm routine
  using Xgemm<T>::DoGemm;
  // Constructor: takes the command queue and event by non-const reference
  Xhemm(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/side/triangle options, the
  // sizes m and n, the scalars alpha and beta (both of type T), and for each of the A, B and C
  // matrices a buffer plus an offset and a leading dimension ('ld'). Returns a StatusCode.
  // NOTE(review): the parameter list mirrors the BLAS xHEMM interface; the exact semantics and the
  // validation performed are defined in the implementation file, which is not visible here.
  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
                    const size_t m, const size_t n,
                    const T alpha,
                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                    const T beta,
                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHEMM_H_
 #endif
--- a/include/internal/routines/level3/xher2k.h
+++ b/include/internal/routines/level3/xher2k.h
@ -0,0 +1,48 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher2k routine. The precision is implemented using the template argument
 // 'T', which is also the type of the alpha argument, whereas the beta argument is of type 'U'. The
 // implementation is very similar to the Xsyr2k routine.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHER2K_H_
 #define CLBLAST_ROUTINES_XHER2K_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for HER2K, derived directly from Routine. It has two template arguments: T is
 // used for alpha, U is used for beta (see the DoHer2k signature below).
 template <typename T, typename U>
 class Xher2k: public Routine {
 public:
  // Constructor: takes the command queue and event by non-const reference
  Xher2k(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/triangle/transpose
  // options, the sizes n and k, the scalar alpha (type T), the A and B matrices, the scalar beta
  // (type U), and the C matrix. Each matrix is given as a buffer plus an offset and a leading
  // dimension ('ld'). Returns a StatusCode.
  // NOTE(review): the parameter list mirrors the BLAS xHER2K interface, where beta is real-valued;
  // confirm the exact semantics against the implementation file.
  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                     const size_t n, const size_t k,
                     const T alpha,
                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                     const U beta,
                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
 private:
  // Static variable to get the precision
  // NOTE(review): shares its name with the non-static 'precision_' member declared in the Routine
  // base class, so it hides that member inside this class; its value is defined elsewhere.
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHER2K_H_
 #endif
--- a/include/internal/routines/level3/xherk.h
+++ b/include/internal/routines/level3/xherk.h
@ -0,0 +1,47 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xherk routine. The precision is implemented using the template argument
 // 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
 // Xsyrk routine.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHERK_H_
 #define CLBLAST_ROUTINES_XHERK_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for HERK, derived directly from Routine. It has two template arguments: T is the
 // precision argument described in the file header, U is the type of both alpha and beta.
 template <typename T, typename U>
 class Xherk: public Routine {
 public:
  // Constructor: takes the command queue and event by non-const reference
  Xherk(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/triangle/transpose
  // options, the sizes n and k, the scalars alpha and beta (both of type U), and the A and C
  // matrices, each given as a buffer plus an offset and a leading dimension ('ld'). Returns a
  // StatusCode.
  // NOTE(review): the parameter list mirrors the BLAS xHERK interface; confirm the exact semantics
  // against the implementation file.
  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                    const size_t n, const size_t k,
                    const U alpha,
                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                    const U beta,
                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
 private:
  // Static variable to get the precision
  // NOTE(review): shares its name with the non-static 'precision_' member declared in the Routine
  // base class, so it hides that member inside this class; its value is defined elsewhere.
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHERK_H_
 #endif
--- a/include/internal/routines/level3/xsymm.h
+++ b/include/internal/routines/level3/xsymm.h
@ -17,7 +17,7 @@
 #ifndef CLBLAST_ROUTINES_XSYMM_H_
 #define CLBLAST_ROUTINES_XSYMM_H_
-#include "internal/routines/xgemm.h"
+#include "internal/routines/level3/xgemm.h"
 namespace clblast {
 // =================================================================================================
--- a/include/internal/routines/level3/xsyr2k.h
+++ b/include/internal/routines/level3/xsyr2k.h
@ -0,0 +1,48 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr2k routine. The precision is implemented using a template argument.
 // The implementation is very similar to Xsyrk (see header for details), except for the fact that
 // the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSYR2K_H_
 #define CLBLAST_ROUTINES_XSYR2K_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for SYR2K, derived directly from Routine. T is the precision/data-type template
 // argument and is also the type of alpha and beta.
 template <typename T>
 class Xsyr2k: public Routine {
 public:
  // Constructor: takes the command queue and event by non-const reference
  Xsyr2k(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/triangle/transpose
  // options, the sizes n and k, the scalars alpha and beta (both of type T), and the A, B and C
  // matrices, each given as a buffer plus an offset and a leading dimension ('ld'). Returns a
  // StatusCode.
  // NOTE(review): the parameter list mirrors the BLAS xSYR2K interface; confirm the exact
  // semantics against the implementation file.
  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                     const size_t n, const size_t k,
                     const T alpha,
                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                     const T beta,
                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
 private:
  // Static variable to get the precision
  // NOTE(review): shares its name with the non-static 'precision_' member declared in the Routine
  // base class, so it hides that member inside this class; its value is defined elsewhere.
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSYR2K_H_
 #endif
--- a/include/internal/routines/level3/xsyrk.h
+++ b/include/internal/routines/level3/xsyrk.h
@ -0,0 +1,49 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyrk routine. The precision is implemented using a template argument.
 // The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
 // 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
 // 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
 //    performance reasons, as the actual masking is done later (see the first point).
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSYRK_H_
 #define CLBLAST_ROUTINES_XSYRK_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for SYRK, derived directly from Routine. T is the precision/data-type template
 // argument and is also the type of alpha and beta.
 template <typename T>
 class Xsyrk: public Routine {
 public:
  // Constructor: takes the command queue and event by non-const reference
  Xsyrk(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/triangle/transpose
  // options, the sizes n and k, the scalars alpha and beta (both of type T), and the A and C
  // matrices, each given as a buffer plus an offset and a leading dimension ('ld'). Returns a
  // StatusCode.
  // NOTE(review): the parameter list mirrors the BLAS xSYRK interface; confirm the exact semantics
  // against the implementation file.
  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                    const size_t n, const size_t k,
                    const T alpha,
                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                    const T beta,
                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
 private:
  // Static variable to get the precision
  // NOTE(review): shares its name with the non-static 'precision_' member declared in the Routine
  // base class, so it hides that member inside this class; its value is defined elsewhere.
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSYRK_H_
 #endif
--- a/include/internal/routines/level3/xtrmm.h
+++ b/include/internal/routines/level3/xtrmm.h
@ -0,0 +1,58 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xtrmm routine. The implementation is based on first transforming the
 // upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
 // routine. Therefore, this class inherits from the Xgemm class.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XTRMM_H_
 #define CLBLAST_ROUTINES_XTRMM_H_
 #include "internal/routines/level3/xgemm.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Routine class for TRMM. It derives from Xgemm<T> and imports DoGemm from it (see the
 // using-declaration below); per the file header, the triangular matrix is first transformed into
 // a regular matrix and then the GEMM routine is called. T is the precision/data-type argument.
 template <typename T>
 class Xtrmm: public Xgemm<T> {
 public:
  // Uses several variables from the Routine class
  using Routine::db_;
  using Routine::context_;
  // Uses several helper functions from the Routine class
  using Routine::RunKernel;
  using Routine::ErrorIn;
  using Routine::TestMatrixA;
  using Routine::GetProgramFromCache;
  // Uses the regular Xgemm routine
  using Xgemm<T>::DoGemm;
  // Constructor: takes the command queue and event by non-const reference
  Xtrmm(CommandQueue &queue, Event &event);
  // Templated-precision implementation of the routine. Takes the layout/side/triangle/transpose/
  // diagonal options, the sizes m and n, the scalar alpha (type T), and the A and B matrices, each
  // given as a buffer plus an offset and a leading dimension ('ld'). Returns a StatusCode.
  // Unlike the other level-3 routines declared alongside it, there is no beta and no C matrix.
  // NOTE(review): presumably the result is written to the B buffer as in BLAS xTRMM, even though
  // it is passed as 'const Buffer &' — confirm against the implementation file.
  StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
                    const Transpose a_transpose, const Diagonal diagonal,
                    const size_t m, const size_t n,
                    const T alpha,
                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XTRMM_H_
 #endif
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@ -46,6 +46,7 @@ constexpr auto kArgATransp = "transA";
 constexpr auto kArgBTransp = "transB";
 constexpr auto kArgSide = "side";
 constexpr auto kArgTriangle = "triangle";
 constexpr auto kArgDiagonal = "diagonal";
 constexpr auto kArgXInc = "incx";
 constexpr auto kArgYInc = "incy";
 constexpr auto kArgXOffset = "offx";
@ -93,6 +94,7 @@ struct Arguments {
  Transpose b_transpose = Transpose::kNo;
  Side side = Side::kLeft;
  Triangle triangle = Triangle::kUpper;
  Diagonal diagonal = Diagonal::kUnit;
  size_t x_inc = 1;
  size_t y_inc = 1;
  size_t x_offset = 0;
@ -105,6 +107,11 @@ struct Arguments {
  size_t c_offset = 0;
  T alpha = T{1.0};
  T beta = T{1.0};
  size_t x_size = 1;
  size_t y_size = 1;
  size_t a_size = 1;
  size_t b_size = 1;
  size_t c_size = 1;
  // Tuner-specific arguments
  double fraction = 1.0;
  // Client-specific arguments
@ -123,6 +130,15 @@ struct Arguments {
  bool no_abbrv = false;
 };
 // Structure containing all possible buffers for test clients
 // Plain aggregate of five Buffer objects, one per possible routine operand.
 // NOTE(review): the field names suggest two vectors (x, y) and three matrices (a, b, c), matching
 // the x_/y_/a_/b_/c_ fields of the Arguments struct; which ones are used presumably depends on
 // the routine under test — confirm against the test clients.
 struct Buffers {
  Buffer x_vec;  // presumably the 'x' vector operand
  Buffer y_vec;  // presumably the 'y' vector operand
  Buffer a_mat;  // presumably the 'A' matrix operand
  Buffer b_mat;  // presumably the 'B' matrix operand
  Buffer c_mat;  // presumably the 'C' matrix operand
 };
 // =================================================================================================
 // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
--- a/src/clblast.cc
+++ b/src/clblast.cc
@ -18,14 +18,20 @@
 #include "clblast.h"
 // BLAS level-1 includes
-#include "internal/routines/xaxpy.h"
+#include "internal/routines/level1/xaxpy.h"
 // BLAS level-2 includes
-#include "internal/routines/xgemv.h"
+#include "internal/routines/level2/xgemv.h"
 // BLAS level-3 includes
-#include "internal/routines/xgemm.h"
+#include "internal/routines/level3/xgemm.h"
-#include "internal/routines/xsymm.h"
+#include "internal/routines/level3/xsymm.h"
 #include "internal/routines/level3/xhemm.h"
 #include "internal/routines/level3/xsyrk.h"
 #include "internal/routines/level3/xherk.h"
 #include "internal/routines/level3/xsyr2k.h"
 #include "internal/routines/level3/xher2k.h"
 #include "internal/routines/level3/xtrmm.h"
 namespace clblast {
 // =================================================================================================
@ -41,10 +47,8 @@ StatusCode Axpy(const size_t n, const T alpha,
  auto event_cpp = Event(*event);
  auto routine = Xaxpy<T>(queue_cpp, event_cpp);
-  // Loads the kernel source-code as an include (C++11 raw string literal)
+  // Compiles the routine's device kernels
-  std::string kernel_source =
+  auto status = routine.SetUp();
  #include "kernels/xaxpy.opencl"
  auto status = routine.SetUp(kernel_source);
  if (status != StatusCode::kSuccess) { return status; }
  // Runs the routine
@ -74,7 +78,7 @@ template StatusCode Axpy<double2>(const size_t, const double2,
 // GEMV
 template <typename T>
-StatusCode Gemv(const Layout layout, const Transpose transpose_a,
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
@ -85,14 +89,12 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
  auto event_cpp = Event(*event);
  auto routine = Xgemv<T>(queue_cpp, event_cpp);
-  // Loads the kernel source-code as an include (C++11 raw string literal)
+  // Compiles the routine's device kernels
-  std::string kernel_source =
+  auto status = routine.SetUp();
  #include "kernels/xgemv.opencl"
  auto status = routine.SetUp(kernel_source);
  if (status != StatusCode::kSuccess) { return status; }
  // Runs the routine
-  return routine.DoGemv(layout, transpose_a, m, n, alpha,
+  return routine.DoGemv(layout, a_transpose, m, n, alpha,
                        Buffer(a_buffer), a_offset, a_ld,
                        Buffer(x_buffer), x_offset, x_inc, beta,
                        Buffer(y_buffer), y_offset, y_inc);
@ -127,7 +129,7 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
 // GEMM
 template <typename T>
-StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                const size_t m, const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
@ -137,23 +139,12 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
  auto event_cpp = Event(*event);
  auto routine = Xgemm<T>(queue_cpp, event_cpp);
-  // Loads the kernel source-code as an include (C++11 raw string literal)
+  // Compiles the routine's device kernels
-  std::string common_source1 =
+  auto status = routine.SetUp();
  #include "kernels/copy.opencl"
  std::string common_source2 =
  #include "kernels/pad.opencl"
  std::string common_source3 =
  #include "kernels/transpose.opencl"
  std::string common_source4 =
  #include "kernels/padtranspose.opencl"
  std::string kernel_source =
  #include "kernels/xgemm.opencl"
  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
                              kernel_source);
  if (status != StatusCode::kSuccess) { return status; }
  // Runs the routine
-  return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
+  return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
                        Buffer(a_buffer), a_offset, a_ld,
                        Buffer(b_buffer), b_offset, b_ld, beta,
                        Buffer(c_buffer), c_offset, c_ld);
@ -197,19 +188,8 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
  auto event_cpp = Event(*event);
  auto routine = Xsymm<T>(queue_cpp, event_cpp);
-  // Loads the kernel source-code as an include (C++11 raw string literal)
+  // Compiles the routine's device kernels
-  std::string common_source1 =
+  auto status = routine.SetUp();
  #include "kernels/copy.opencl"
  std::string common_source2 =
  #include "kernels/pad.opencl"
  std::string common_source3 =
  #include "kernels/transpose.opencl"
  std::string common_source4 =
  #include "kernels/padtranspose.opencl"
  std::string kernel_source =
  #include "kernels/xgemm.opencl"
  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
                            kernel_source);
  if (status != StatusCode::kSuccess) { return status; }
  // Runs the routine
@ -244,4 +224,302 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
                                  cl_command_queue*, cl_event*);
 // =================================================================================================
 // HEMM
 template <typename T>
 StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the HEMM routine on top of them
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xhemm<T>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoHemm(layout, side, triangle, m, n, alpha,
                       Buffer(a_buffer), a_offset, a_ld,
                       Buffer(b_buffer), b_offset, b_ld, beta,
                       Buffer(c_buffer), c_offset, c_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Hemm for the float2 and double2 data-types
 template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
                                 const size_t, const size_t, const float2,
                                 const cl_mem, const size_t, const size_t,
                                 const cl_mem, const size_t, const size_t, const float2,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Hemm<double2>(const Layout, const Side, const Triangle,
                                  const size_t, const size_t, const double2,
                                  const cl_mem, const size_t, const size_t,
                                  const cl_mem, const size_t, const size_t, const double2,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 // =================================================================================================
 // SYRK
 template <typename T>
 StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the SYRK routine on top of them
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xsyrk<T>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
                       Buffer(a_buffer), a_offset, a_ld, beta,
                       Buffer(c_buffer), c_offset, c_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Syrk for the float, double, float2 and double2 data-types
 template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
                                const size_t, const size_t, const float,
                                const cl_mem, const size_t, const size_t, const float,
                                cl_mem, const size_t, const size_t,
                                cl_command_queue*, cl_event*);
 template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
                                 const size_t, const size_t, const double,
                                 const cl_mem, const size_t, const size_t, const double,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
                                 const size_t, const size_t, const float2,
                                 const cl_mem, const size_t, const size_t, const float2,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
                                  const size_t, const size_t, const double2,
                                  const cl_mem, const size_t, const size_t, const double2,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 // =================================================================================================
 // HERK
 template <typename T>
 StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the HERK routine on top of them. The
  // routine is templated on std::complex<T>, whereas alpha and beta are passed as T.
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xherk<std::complex<T>,T>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoHerk(layout, triangle, a_transpose, n, k, alpha,
                       Buffer(a_buffer), a_offset, a_ld, beta,
                       Buffer(c_buffer), c_offset, c_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Herk for the float and double data-types (the type of
 // alpha and beta)
 template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
                                const size_t, const size_t, const float,
                                const cl_mem, const size_t, const size_t, const float,
                                cl_mem, const size_t, const size_t,
                                cl_command_queue*, cl_event*);
 template StatusCode Herk<double>(const Layout, const Triangle, const Transpose,
                                 const size_t, const size_t, const double,
                                 const cl_mem, const size_t, const size_t, const double,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 // =================================================================================================
 // SYR2K
 template <typename T>
 StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the SYR2K routine on top of them
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xsyr2k<T>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
                        Buffer(a_buffer), a_offset, a_ld,
                        Buffer(b_buffer), b_offset, b_ld, beta,
                        Buffer(c_buffer), c_offset, c_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Syr2k for the float, double, float2 and double2 data-types
 template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
                                 const size_t, const size_t, const float,
                                 const cl_mem, const size_t, const size_t,
                                 const cl_mem, const size_t, const size_t, const float,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
                                  const size_t, const size_t, const double,
                                  const cl_mem, const size_t, const size_t,
                                  const cl_mem, const size_t, const size_t, const double,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
                                  const size_t, const size_t, const float2,
                                  const cl_mem, const size_t, const size_t,
                                  const cl_mem, const size_t, const size_t, const float2,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
                                   const size_t, const size_t, const double2,
                                   const cl_mem, const size_t, const size_t,
                                   const cl_mem, const size_t, const size_t, const double2,
                                   cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
 // =================================================================================================
 // HER2K
 template <typename T, typename U>
 StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the HER2K routine on top of them. Note
  // that alpha (type T) and beta (type U) are of different types for this routine.
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xher2k<T,U>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
                        Buffer(a_buffer), a_offset, a_ld,
                        Buffer(b_buffer), b_offset, b_ld, beta,
                        Buffer(c_buffer), c_offset, c_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Her2k for the <float2,float> and <double2,double>
 // combinations of the alpha (first) and beta (second) data-types
 template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
                                        const size_t, const size_t, const float2,
                                        const cl_mem, const size_t, const size_t,
                                        const cl_mem, const size_t, const size_t, const float,
                                        cl_mem, const size_t, const size_t,
                                        cl_command_queue*, cl_event*);
 template StatusCode Her2k<double2,double>(const Layout, const Triangle, const Transpose,
                                          const size_t, const size_t, const double2,
                                          const cl_mem, const size_t, const size_t,
                                          const cl_mem, const size_t, const size_t, const double,
                                          cl_mem, const size_t, const size_t,
                                          cl_command_queue*, cl_event*);
 // =================================================================================================
 // TRMM
 template <typename T>
 StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wraps the raw OpenCL queue and event and constructs the TRMM routine on top of them
  auto wrapped_queue = CommandQueue(*queue);
  auto wrapped_event = Event(*event);
  auto impl = Xtrmm<T>(wrapped_queue, wrapped_event);
  // Compiles the routine's device kernels; the routine itself only runs if this succeeds,
  // otherwise the set-up status is handed back to the caller
  const auto setup_status = impl.SetUp();
  if (setup_status == StatusCode::kSuccess) {
    return impl.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                       Buffer(a_buffer), a_offset, a_ld,
                       Buffer(b_buffer), b_offset, b_ld);
  }
  return setup_status;
 }
 // Explicit template instantiations of Trmm for the float, double, float2 and double2 data-types
 template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
                                const Transpose, const Diagonal,
                                const size_t, const size_t, const float,
                                const cl_mem, const size_t, const size_t,
                                cl_mem, const size_t, const size_t,
                                cl_command_queue*, cl_event*);
 template StatusCode Trmm<double>(const Layout, const Side, const Triangle,
                                 const Transpose, const Diagonal,
                                 const size_t, const size_t, const double,
                                 const cl_mem, const size_t, const size_t,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Trmm<float2>(const Layout, const Side, const Triangle,
                                 const Transpose, const Diagonal,
                                 const size_t, const size_t, const float2,
                                 const cl_mem, const size_t, const size_t,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Trmm<double2>(const Layout, const Side, const Triangle,
                                  const Transpose, const Diagonal,
                                  const size_t, const size_t, const double2,
                                  const cl_mem, const size_t, const size_t,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 // =================================================================================================
 // TRSM
 /*
 template <typename T>
 StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  auto queue_cpp = CommandQueue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xtrsm<T>(queue_cpp, event_cpp);
  // Compiles the routine's device kernels
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  // Runs the routine
  return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                        Buffer(a_buffer), a_offset, a_ld,
                        Buffer(b_buffer), b_offset, b_ld);
 }
 template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
                                const Transpose, const Diagonal,
                                const size_t, const size_t, const float,
                                const cl_mem, const size_t, const size_t,
                                cl_mem, const size_t, const size_t,
                                cl_command_queue*, cl_event*);
 template StatusCode Trsm<double>(const Layout, const Side, const Triangle,
                                 const Transpose, const Diagonal,
                                 const size_t, const size_t, const double,
                                 const cl_mem, const size_t, const size_t,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Trsm<float2>(const Layout, const Side, const Triangle,
                                 const Transpose, const Diagonal,
                                 const size_t, const size_t, const float2,
                                 const cl_mem, const size_t, const size_t,
                                 cl_mem, const size_t, const size_t,
                                 cl_command_queue*, cl_event*);
 template StatusCode Trsm<double2>(const Layout, const Side, const Triangle,
                                  const Transpose, const Diagonal,
                                  const size_t, const size_t, const double2,
                                  const cl_mem, const size_t, const size_t,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);
 */
 // =================================================================================================
 } // namespace clblast
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@ -39,6 +39,7 @@ R"(
  typedef float8 real8;
  typedef float16 real16;
  #define ZERO 0.0f
  #define ONE 1.0f
 // Double-precision 
 #elif PRECISION == 64
@ -48,6 +49,7 @@ R"(
  typedef double8 real8;
  typedef double16 real16;
  #define ZERO 0.0
  #define ONE 1.0
 // Complex single-precision
 #elif PRECISION == 3232
@ -61,6 +63,7 @@ R"(
                           real s8; real s9; real sA; real sB;
                           real sC; real sD; real sE; real sF;} real16;
  #define ZERO 0.0f
  #define ONE 1.0f
 // Complex double-precision
 #elif PRECISION == 6464
@ -74,12 +77,16 @@ R"(
                            real s8; real s9; real sA; real sB;
                            real sC; real sD; real sE; real sF;} real16;
  #define ZERO 0.0
  #define ONE 1.0
 #endif
 // =================================================================================================
-// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
+// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
-#define USE_CL_MAD 0
+// devices, this is enabled (see src/routine.cc).
 #ifndef USE_CL_MAD
  #define USE_CL_MAD 0
 #endif
 // Sets a variable to zero
 #if PRECISION == 3232 || PRECISION == 6464
@ -88,6 +95,20 @@ R"(
  #define SetToZero(a) a = ZERO
 #endif
 // Sets a variable to zero (only the imaginary part). For the complex precisions (3232 and 6464)
 // this assigns ZERO to the '.y' component of the argument; for all other precisions the macro
 // expands to nothing, such that 'ImagToZero(value);' is an empty statement.
 #if PRECISION == 3232 || PRECISION == 6464
  #define ImagToZero(a) a.y = ZERO
 #else
  #define ImagToZero(a) 
 #endif
 // Sets a variable to one. For the complex precisions (3232 and 6464) this takes two assignments
 // ('.x' becomes ONE, '.y' becomes ZERO). These are wrapped in a do/while(0) such that the macro
 // always behaves as one single statement, also when it is used as the body of an 'if' or a loop
 // without braces. The call site supplies the trailing semicolon, as before.
 #if PRECISION == 3232 || PRECISION == 6464
  #define SetToOne(a) do { a.x = ONE; a.y = ZERO; } while (0)
 #else
  #define SetToOne(a) a = ONE
 #endif
 // Multiply two complex variables (used in the define below)
 #if PRECISION == 3232 || PRECISION == 6464
  #define MulReal(a, b) a.x*b.x - a.y*b.y
@ -122,6 +143,6 @@ R"(
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/copy.opencl
+++ b/src/kernels/copy.opencl
@ -68,6 +68,6 @@ __kernel void CopyMatrix(const int ld,
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@ -86,7 +86,9 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
                          __global const real* restrict src,
                          const int dest_one, const int dest_two,
                          const int dest_ld, const int dest_offset,
-                          __global real* dest) {
+                          __global real* dest,
                          const int upper, const int lower,
                          const int diagonal_imag_zero) {
  // Loops over the work per thread in both dimensions
  #pragma unroll
@ -95,11 +97,20 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
    #pragma unroll
    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
+
      // Masking in case of triangular matrices: updates only the upper or lower part
      bool condition = true;
      if (upper == 1) { condition = (id_two >= id_one); }
      else if (lower == 1) { condition = (id_two <= id_one); }
      if (condition) {
        // Copies the value into the destination matrix. This is always within bounds of the source
        // matrix, as we know that the destination matrix is smaller than the source.
-        dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        if (id_two < dest_two && id_one < dest_one) {
          real value = src[id_two*src_ld + id_one + src_offset];
          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
          dest[id_two*dest_ld + id_one + dest_offset] = value;
        }
      }
    }
  }
@ -127,15 +138,15 @@ __kernel void SymmLowerToSquared(const int src_dim,
      if (id_two < dest_dim && id_one < dest_dim) {
        // Loads data from the lower-symmetric matrix
-        real value;
+        real result;
-        SetToZero(value);
+        SetToZero(result);
        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { value = src[id_one*src_ld + id_two + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
        }
-        // Stores the value in the destination matrix
+        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
      }
    }
  }
@ -160,15 +171,171 @@ __kernel void SymmUpperToSquared(const int src_dim,
      if (id_two < dest_dim && id_one < dest_dim) {
        // Loads data from the upper-symmetric matrix
-        real value;
+        real result;
-        SetToZero(value);
+        SetToZero(result);
        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { value = src[id_one*src_ld + id_two + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
        }
-        // Stores the value in the destination matrix
+        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
      }
    }
  }
 }
 // =================================================================================================
 #if PRECISION == 3232 || PRECISION == 6464
 // Expands a hermitian matrix of which only the lower-triangle holds data into a full squared
 // matrix. Elements of the other triangle are read from their mirrored position and passed through
 // COMPLEX_CONJUGATE, the imaginary part ('.y') of the diagonal elements is set to zero, and
 // everything outside of the source dimension is zero. This uses the padding kernel's parameters.
 __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
 __kernel void HermLowerToSquared(const int src_dim,
                                 const int src_ld, const int src_offset,
                                 __global const real* restrict src,
                                 const int dest_dim,
                                 const int dest_ld, const int dest_offset,
                                 __global real* dest) {
  // Each thread processes PAD_WPTX by PAD_WPTY elements
  #pragma unroll
  for (int wx=0; wx<PAD_WPTX; ++wx) {
    const int one = (get_group_id(0)*PAD_WPTX + wx) * PAD_DIMX + get_local_id(0);
    #pragma unroll
    for (int wy=0; wy<PAD_WPTY; ++wy) {
      const int two = (get_group_id(1)*PAD_WPTY + wy) * PAD_DIMY + get_local_id(1);
      // Only elements within the destination matrix are written
      if (one < dest_dim && two < dest_dim) {
        // Starts from zero: this is the final value for elements beyond the source matrix
        real element;
        SetToZero(element);
        if (one < src_dim && two < src_dim) {
          if (two > one) {
            // Other triangle: takes the mirrored element and conjugates it
            element = src[one*src_ld + two + src_offset];
            COMPLEX_CONJUGATE(element);
          }
          else {
            // Stored triangle (including the diagonal): takes the element as-is
            element = src[two*src_ld + one + src_offset];
            if (two == one) { element.y = ZERO; }
          }
        }
        // Writes the element to the destination matrix
        dest[two*dest_ld + one + dest_offset] = element;
      }
    }
  }
 }
 // Expands a hermitian matrix of which only the upper-triangle holds data into a full squared
 // matrix. Elements of the other triangle are read from their mirrored position and passed through
 // COMPLEX_CONJUGATE, the imaginary part ('.y') of the diagonal elements is set to zero, and
 // everything outside of the source dimension is zero. This uses the padding kernel's parameters.
 __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
 __kernel void HermUpperToSquared(const int src_dim,
                                 const int src_ld, const int src_offset,
                                 __global const real* restrict src,
                                 const int dest_dim,
                                 const int dest_ld, const int dest_offset,
                                 __global real* dest) {
  // Each thread processes PAD_WPTX by PAD_WPTY elements
  #pragma unroll
  for (int wx=0; wx<PAD_WPTX; ++wx) {
    const int one = (get_group_id(0)*PAD_WPTX + wx) * PAD_DIMX + get_local_id(0);
    #pragma unroll
    for (int wy=0; wy<PAD_WPTY; ++wy) {
      const int two = (get_group_id(1)*PAD_WPTY + wy) * PAD_DIMY + get_local_id(1);
      // Only elements within the destination matrix are written
      if (one < dest_dim && two < dest_dim) {
        // Starts from zero: this is the final value for elements beyond the source matrix
        real element;
        SetToZero(element);
        if (one < src_dim && two < src_dim) {
          if (one > two) {
            // Other triangle: takes the mirrored element and conjugates it
            element = src[one*src_ld + two + src_offset];
            COMPLEX_CONJUGATE(element);
          }
          else {
            // Stored triangle (including the diagonal): takes the element as-is
            element = src[two*src_ld + one + src_offset];
            if (one == two) { element.y = ZERO; }
          }
        }
        // Writes the element to the destination matrix
        dest[two*dest_ld + one + dest_offset] = element;
      }
    }
  }
 }
 #endif
 // =================================================================================================
 // Expands a triangular matrix of which the lower-triangle holds the data into a full squared
 // matrix. Elements outside of that triangle (and outside of the source dimension) become zero. If
 // 'unit_diagonal' is non-zero, the diagonal elements are set to one instead of the stored values.
 // This uses the padding kernel's parameters.
 __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
 __kernel void TrmmLowerToSquared(const int src_dim,
                                 const int src_ld, const int src_offset,
                                 __global const real* restrict src,
                                 const int dest_dim,
                                 const int dest_ld, const int dest_offset,
                                 __global real* dest,
                                 const int unit_diagonal) {
  // Each thread processes PAD_WPTX by PAD_WPTY elements
  #pragma unroll
  for (int wx=0; wx<PAD_WPTX; ++wx) {
    const int one = (get_group_id(0)*PAD_WPTX + wx) * PAD_DIMX + get_local_id(0);
    #pragma unroll
    for (int wy=0; wy<PAD_WPTY; ++wy) {
      const int two = (get_group_id(1)*PAD_WPTY + wy) * PAD_DIMY + get_local_id(1);
      // Only elements within the destination matrix are written
      if (one < dest_dim && two < dest_dim) {
        // Starts from zero: this is the final value for everything outside of the stored triangle
        real element;
        SetToZero(element);
        if (one < src_dim && two < src_dim) {
          const bool on_diagonal = (two == one);
          if (two <= one) { element = src[two*src_ld + one + src_offset]; }
          if (on_diagonal && unit_diagonal) { SetToOne(element); }
        }
        // Writes the element to the destination matrix
        dest[two*dest_ld + one + dest_offset] = element;
      }
    }
  }
 }
 // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
 // stored as the upper-triangle of the input matrix (the counterpart of TrmmLowerToSquared).
 // Elements outside of that triangle and outside of the source dimension are written as zero. If
 // 'unit_diagonal' is non-zero, the diagonal elements are set to one instead of the stored values.
 // This uses the padding kernel's parameters.
 __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
 __kernel void TrmmUpperToSquared(const int src_dim,
                                 const int src_ld, const int src_offset,
                                 __global const real* restrict src,
                                 const int dest_dim,
                                 const int dest_ld, const int dest_offset,
                                 __global real* dest,
                                 const int unit_diagonal) {
  // Loops over the work per thread in both dimensions
  #pragma unroll
  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
    #pragma unroll
    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
      // Only elements within the destination matrix are written
      if (id_two < dest_dim && id_one < dest_dim) {
        // Loads data from the upper-triangular matrix
        real result;
        SetToZero(result);
        if (id_two < src_dim && id_one < src_dim) {
          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
          // The unit-diagonal case overrides the value loaded above
          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
          // Else: result is zero
        }
        // Stores the result in the destination matrix
        dest[id_two*dest_ld + id_one + dest_offset] = result;
      }
    }
  }
@ -177,6 +344,6 @@ __kernel void SymmUpperToSquared(const int src_dim,
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@ -100,7 +100,9 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                   __global const real* restrict src,
                                   const int dest_one, const int dest_two,
                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest) {
+                                   __global real* dest,
                                   const int upper, const int lower,
                                   const int diagonal_imag_zero) {
  // Local memory to store a tile of the matrix (for coalescing)
  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +139,18 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
-      // Stores the transposed value in the destination matrix
+      // Masking in case of triangular matrices: updates only the upper or lower part
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+      bool condition = true;
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
      if (condition) {
        // Stores the transposed value in the destination matrix
        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
        }
      }
    }
  }
@ -149,6 +159,6 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/transpose.opencl
+++ b/src/kernels/transpose.opencl
@ -20,13 +20,16 @@ R"(
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef TRA_DIM
-  #define TRA_DIM 8    // Number of local threads in the two dimensions (x,y)
+  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
 #endif
 #ifndef TRA_WPT
-  #define TRA_WPT 1    // Work per thread in one dimension and vector-width in the other
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
 #endif
 #ifndef TRA_PAD
-  #define TRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
 #endif
 #ifndef TRA_SHUFFLE
  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
 #endif
 // =================================================================================================
@ -53,116 +56,94 @@ __kernel void TransposeMatrix(const int ld,
                              __global const realT* restrict src,
                              __global realT* dest) {
-  // Local memory to store a tile of the matrix (for coalescing)
+  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
-  __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
+  // way over workgroups, breaking memory-bank dependencies.
  const int gid0 = get_group_id(0);
  #if TRA_SHUFFLE == 1
    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
  #else
    const int gid1 = get_group_id(1);
  #endif
-  // Loop over the work per thread
+  // Local memory to store a tile of the matrix (for coalescing)
  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
  // Loops over the work per thread
  #pragma unroll
  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
    // Computes the identifiers for the source matrix. Note that the local and global dimensions
    // do not correspond to each other!
-    const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
+    const int id_one = gid1 * TRA_DIM + get_local_id(0);
-    const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
    // Loads data into the local memory
    realT value = src[id_two*(ld/TRA_WPT) + id_one];
-    #if TRA_WPT == 1
+    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
    #elif TRA_WPT == 2
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
    #elif TRA_WPT == 4
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
    #elif TRA_WPT == 8
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
      tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
      tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
      tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
      tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
    #elif TRA_WPT == 16
      tile[get_local_id(1)*TRA_WPT +  0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
      tile[get_local_id(1)*TRA_WPT +  1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
      tile[get_local_id(1)*TRA_WPT +  2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
      tile[get_local_id(1)*TRA_WPT +  3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
      tile[get_local_id(1)*TRA_WPT +  4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
      tile[get_local_id(1)*TRA_WPT +  5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
      tile[get_local_id(1)*TRA_WPT +  6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
      tile[get_local_id(1)*TRA_WPT +  7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
      tile[get_local_id(1)*TRA_WPT +  8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
      tile[get_local_id(1)*TRA_WPT +  9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
      tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
      tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
      tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
      tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
      tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
      tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
    #endif
  }
  // Synchronizes all threads in a workgroup
  barrier(CLK_LOCAL_MEM_FENCE);
-  // Loop over the work per thread
+  // Loads transposed data from the local memory
  realT v[TRA_WPT];
  #pragma unroll
  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
  }
  // Performs the register-level transpose of the vectorized data
  realT results[TRA_WPT];
  #if TRA_WPT == 1
    results[0] = v[0];
  #elif TRA_WPT == 2
    results[0] = (realT) (v[0].x, v[1].x);
    results[1] = (realT) (v[0].y, v[1].y);
  #elif TRA_WPT == 4
    results[0] = (realT) (v[0].x, v[1].x, v[2].x, v[3].x);
    results[1] = (realT) (v[0].y, v[1].y, v[2].y, v[3].y);
    results[2] = (realT) (v[0].z, v[1].z, v[2].z, v[3].z);
    results[3] = (realT) (v[0].w, v[1].w, v[2].w, v[3].w);
  #elif TRA_WPT == 8
    results[0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0);
    results[1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1);
    results[2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2);
    results[3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3);
    results[4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4);
    results[5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5);
    results[6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6);
    results[7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7);
  #elif TRA_WPT == 16
    results[ 0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0);
    results[ 1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1);
    results[ 2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2);
    results[ 3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3);
    results[ 4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4);
    results[ 5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5);
    results[ 6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6);
    results[ 7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7);
    results[ 8] = (realT) (v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8);
    results[ 9] = (realT) (v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9);
    results[10] = (realT) (v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA);
    results[11] = (realT) (v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB);
    results[12] = (realT) (v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC);
    results[13] = (realT) (v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD);
    results[14] = (realT) (v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE);
    results[15] = (realT) (v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF);
  #endif
  // Stores the results into the destination matrix
  #pragma unroll
  for (int w_two=0; w_two<TRA_WPT; ++w_two) {
-
+    const int id_one = gid0*TRA_DIM + get_local_id(0);
-    // Computes the identifiers for the destination matrix
+    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
-    const int id_one = get_global_id(0);
+    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
    const int id_two = get_global_id(1)*TRA_WPT + w_two;
    // Stores the transposed value in the destination matrix
    realT value;
    #if TRA_WPT == 1
      value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
    #elif TRA_WPT == 2
      value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
    #elif TRA_WPT == 4
      value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
      value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
      value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
    #elif TRA_WPT == 8
      value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
      value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
      value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
      value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
      value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
      value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
      value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
    #elif TRA_WPT == 16
      value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  0];
      value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  1];
      value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  2];
      value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  3];
      value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  4];
      value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  5];
      value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  6];
      value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  7];
      value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  8];
      value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  9];
      value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
      value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
      value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
      value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
      value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
      value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
    #endif
    dest[id_two*(ld/TRA_WPT) + id_one] = value;
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/xaxpy.opencl
+++ b/src/kernels/xaxpy.opencl
@ -123,6 +123,6 @@ __kernel void XaxpyFast(const int n, const real alpha,
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/kernels/xgemm.opencl
+++ b/src/kernels/xgemm.opencl
@ -127,6 +127,55 @@ R"(
 // =================================================================================================
 // Initializes the accumulation registers to zero
 // Clears every entry of the accumulator array `cpm`, which is indexed as cpm[ni][mi] with
 // ni in [0, NWI) and mi in [0, MWI/VWM). Each realM entry is cleared one component at a time;
 // the value of VWM selects which component names are used (the entry itself, .x/.y, .x-.w,
 // .s0-.s7 or .s0-.sF). SetToZero is not defined in this part of the file.
 // NOTE(review): there is no #else branch, so for a VWM other than 1, 2, 4, 8 or 16 the loop
 // body is empty and cpm is left untouched.
 inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #pragma unroll
    for (int ni=0; ni<NWI; ++ni) {
      // Exactly one of the branches below is compiled in, depending on VWM
      #if VWM == 1
        SetToZero(cpm[ni][mi]);
      #elif VWM == 2
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
      #elif VWM == 4
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
        SetToZero(cpm[ni][mi].z);
        SetToZero(cpm[ni][mi].w);
      #elif VWM == 8
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
      #elif VWM == 16
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
        SetToZero(cpm[ni][mi].s8);
        SetToZero(cpm[ni][mi].s9);
        SetToZero(cpm[ni][mi].sA);
        SetToZero(cpm[ni][mi].sB);
        SetToZero(cpm[ni][mi].sC);
        SetToZero(cpm[ni][mi].sD);
        SetToZero(cpm[ni][mi].sE);
        SetToZero(cpm[ni][mi].sF);
      #endif
    }
  }
 }
 // =================================================================================================
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
 // =================================================================================================
 // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
 // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
 inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
                         const real alpha, const real beta) {
  // Visits every accumulator cpm[ni][mi] and merges it into the matching element of cgm
  #pragma unroll
  for (int ni=0; ni<NWI; ++ni) {
    #pragma unroll
    for (int mi=0; mi<MWI/VWM; ++mi) {
      // Position along M inside the workgroup tile, counted in realM vectors. With STRM == 0 the
      // MWI/VWM vectors of one work-item are adjacent; with STRM == 1 they are MDIMC vectors apart.
      // NOTE(review): mg is only declared when STRM is 0 or 1.
      #if STRM == 0
        int mg = mi + get_local_id(0)*(MWI/VWM);
      #elif STRM == 1
        int mg = get_local_id(0) + mi*MDIMC;
      #endif
      // Position along N inside the workgroup tile. With STRN == 0 the NWI positions of one
      // work-item are adjacent; with STRN == 1 they come in runs of VWN, the runs VWN*NDIMC apart.
      // NOTE(review): ng is only declared when STRN is 0 or 1.
      #if STRN == 0
        int ng = ni + get_local_id(1)*NWI;
      #elif STRN == 1
        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
      #endif
      // Adds the offset of this workgroup's tile: MWG/VWM vectors along M, NWG positions along N
      int idm = mg + get_group_id(0)*(MWG/VWM);
      int idn = ng + get_group_id(1)*NWG;
      // cgm is addressed in realM vectors, kSizeM/VWM (integer division) of them per N-position
      int index = idn*(kSizeM/VWM) + idm;
      // Takes a copy of the current value first, because cgm[index] is also the AXPBY destination
      realM cval = cgm[index];
      // AXPBY is not defined in this part of the file; going by the comment above this function it
      // stores alpha*cpm + beta*cval. It is applied per vector component, selected by VWM.
      #if VWM == 1
        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
      #elif VWM == 2
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
      #elif VWM == 4
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
      #elif VWM == 8
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
      #elif VWM == 16
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
      #endif
    }
  }
 }
 // =================================================================================================
 // The vectorised multiply-add function
 inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
  #if USE_VECTOR_MAD == 1
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
 // =================================================================================================
-// Main entry of the kernel. This function contains the basic skeleton, the functionality is
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
-// provided by the inlined functions above
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta) {
-                    const real alpha, const real beta,
+  #pragma unroll
-                    const __global realM* restrict agm,
+  for (int ni=0; ni<NWI; ++ni) {
-                    const __global realN* restrict bgm,
+    #pragma unroll
-                    __global realM* cgm) {
+    for (int mi=0; mi<MWI/VWM; ++mi) {
      #if STRM == 0
        int mg = mi + get_local_id(0)*(MWI/VWM);
      #elif STRM == 1
        int mg = get_local_id(0) + mi*MDIMC;
      #endif
      #if STRN == 0
        int ng = ni + get_local_id(1)*NWI;
      #elif STRN == 1
        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
      #endif
      int idm = mg + get_group_id(0)*(MWG/VWM);
      int idn = ng + get_group_id(1)*NWG;
-  // Combined thread identifier
+      // The final multiplication with alpha and the addition with beta*C
      int index = idn*(kSizeM/VWM) + idm;
      realM cval = cgm[index];
      #if VWM == 1
        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
      #elif VWM == 2
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
      #elif VWM == 4
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
      #elif VWM == 8
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
      #elif VWM == 16
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
      #endif
    }
  }
 }
 // =================================================================================================
 // Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
 inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                      const __global realM* restrict agm, const __global realN* restrict bgm,
                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
                      #if SA == 1 && SB == 1
                        , __local realM* alm, __local realN* blm
                      #elif SA == 1
                        , __local realM* alm
                      #elif SB == 1
                        , __local realN* blm
                      #endif
                      ) {
  // Allocates workitem-private memory (registers)
  realM apm[MWI/VWM];
  realN bpm[NWI/VWN];
  // Combined thread identifier (volatile to disable caching)
  #if SA == 1 || SB == 1
    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
  #endif
  // Allocates workgroup-private memory (local memory)
  #if SA == 1
    __local realM alm[KWG * MWG/VWM];
  #endif
  #if SB == 1
    __local realN blm[KWG * NWG/VWN];
  #endif
  // Allocates workitem-private memory (registers)
  realM apm[MWI/VWM];
  realN bpm[NWI/VWN];
  realM cpm[NWI][MWI/VWM];
  // Initializes the accumulation registers
-  #pragma unroll
+  InitAccRegisters(cpm);
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #pragma unroll
    for (int ni=0; ni<NWI; ++ni) {
      #if VWM == 1
        SetToZero(cpm[ni][mi]);
      #elif VWM == 2
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
      #elif VWM == 4
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
        SetToZero(cpm[ni][mi].z);
        SetToZero(cpm[ni][mi].w);
      #elif VWM == 8
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
      #elif VWM == 16
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
        SetToZero(cpm[ni][mi].s8);
        SetToZero(cpm[ni][mi].s9);
        SetToZero(cpm[ni][mi].sA);
        SetToZero(cpm[ni][mi].sB);
        SetToZero(cpm[ni][mi].sC);
        SetToZero(cpm[ni][mi].sD);
        SetToZero(cpm[ni][mi].sE);
        SetToZero(cpm[ni][mi].sF);
      #endif
    }
  }
  // Loops over all workgroup tiles
  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
    #if SB == 1
      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
    #endif
    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif
@ -552,20 +554,130 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
        MultiplyAccumulate(cpm, apm, bpm);
      }
    }
    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif
  }
  // Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }
 // =================================================================================================
 // The upper-triangular and lower-triangular kernels are only used in special cases
 #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-// End of the C++11 raw string literal
+// Main entry point of the kernel. This is the upper-triangular version.
-)";
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
                         const real alpha, const real beta,
                         const __global realM* restrict agm,
                         const __global realN* restrict bgm,
                         __global realM* cgm) {
  // kSizeN is used as both the M and the N size below; kSizeK, agm, bgm and cgm are forwarded to
  // XgemmBody, and alpha/beta are only used by the final StoreResults call.
  //
  // Skips this workgroup only if its tile holds no element with an N-index >= its M-index. The
  // tile spans M-indices [g0*MWG, (g0+1)*MWG) and N-indices [g1*NWG, (g1+1)*NWG), so that is the
  // case exactly when the last N-index, (g1+1)*NWG - 1, is still below the first M-index, g0*MWG.
  // Comparing the first N-index instead would also drop tiles that straddle the diagonal whenever
  // NWG differs from MWG. The test uses group ids only, so all work-items of a workgroup leave
  // together.
  if ((get_group_id(1) + 1)*NWG <= get_group_id(0)*MWG) {
    return;
  }
  // Allocates workgroup-private memory (local memory)
  #if SA == 1
    __local realM alm[KWG * MWG/VWM];
  #endif
  #if SB == 1
    __local realN blm[KWG * NWG/VWN];
  #endif
  // Computes the matrix-multiplication and stores the result in register memory. The argument list
  // of XgemmBody depends on SA/SB, hence one call per combination.
  realM cpm[NWI][MWI/VWM];
  #if SA == 1 && SB == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
  #elif SA == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
  #elif SB == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
  #else
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
  #endif
  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeN, alpha, beta);
 }
 // Main entry point of the kernel. This is the lower-triangular version.
 // kSizeN is used as both the M and the N size; kSizeK, agm, bgm and cgm are forwarded to
 // XgemmBody, and alpha/beta are only used by the final StoreResults call.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmLower(const int kSizeN, const int kSizeK,
                         const real alpha, const real beta,
                         const __global realM* restrict agm,
                         const __global realN* restrict bgm,
                         __global realM* cgm) {
  // Skips this workgroup only if its tile holds no element with an N-index <= its M-index. The
  // tile spans M-indices [g0*MWG, (g0+1)*MWG) and N-indices [g1*NWG, (g1+1)*NWG), so that is the
  // case exactly when the first N-index, g1*NWG, is already beyond the last M-index,
  // (g0+1)*MWG - 1. Comparing against the first M-index instead would also drop tiles that
  // straddle the diagonal whenever MWG differs from NWG. The test uses group ids only, so all
  // work-items of a workgroup leave together.
  if (get_group_id(1)*NWG >= (get_group_id(0) + 1)*MWG) {
    return;
  }
  // Allocates workgroup-private memory (local memory)
  #if SA == 1
    __local realM alm[KWG * MWG/VWM];
  #endif
  #if SB == 1
    __local realN blm[KWG * NWG/VWN];
  #endif
  // Computes the matrix-multiplication and stores the result in register memory. The argument list
  // of XgemmBody depends on SA/SB, hence one call per combination.
  realM cpm[NWI][MWI/VWM];
  #if SA == 1 && SB == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
  #elif SA == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
  #elif SB == 1
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
  #else
    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
  #endif
  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeN, alpha, beta);
 }
 // =================================================================================================
 // If not using a triangular version, include the regular kernel
 #else
 // Main entry point of the kernel. This is the regular full version.
 // Takes separate M, N and K sizes and forwards them unchanged to XgemmBody; no workgroup is
 // skipped here. alpha and beta are only used by the final StoreResults call.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real alpha, const real beta,
                    const __global realM* restrict agm,
                    const __global realN* restrict bgm,
                    __global realM* cgm) {
  // Allocates workgroup-private memory (local memory); each array exists only if SA/SB is 1
  #if SA == 1
    __local realM alm[KWG * MWG/VWM];
  #endif
  #if SB == 1
    __local realN blm[KWG * NWG/VWN];
  #endif
  // Computes the matrix-multiplication and stores the result in register memory. The argument list
  // of XgemmBody depends on SA/SB, hence one call per combination.
  realM cpm[NWI][MWI/VWM];
  #if SA == 1 && SB == 1
    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
  #elif SA == 1
    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
  #elif SB == 1
    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
  #else
    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
  #endif
  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }
 #endif
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/xgemv.opencl
@ -368,6 +368,6 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
 // =================================================================================================
 // End of the C++11 raw string literal
-)";
+)"
 // =================================================================================================
--- a/src/routine.cc
+++ b/src/routine.cc
@ -22,9 +22,10 @@ namespace clblast {
 std::vector<Routine::ProgramCache> Routine::program_cache_;
 // Constructor: not much here, because no status codes can be returned
-Routine::Routine(CommandQueue &queue, Event &event,
+Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
                 const std::vector<std::string> &routines, const Precision precision):
    precision_(precision),
    routine_name_(name),
    queue_(queue),
    event_(event),
    context_(queue_.GetContext()),
@ -33,14 +34,13 @@ Routine::Routine(CommandQueue &queue, Event &event,
    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
    max_work_item_sizes_(device_.MaxWorkItemSizes()),
    max_work_group_size_(device_.MaxWorkGroupSize()),
-    db_(queue_, routines, precision_),
+    db_(queue_, routines, precision_) {
    routines_(routines) {
 }
 // =================================================================================================
 // Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp(const std::string &routine_source) {
+StatusCode Routine::SetUp() {
  // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
  // be built and added to the cache.
@ -63,12 +63,24 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
    // Loads the common header (typedefs and defines and such)
    std::string common_header =
-    #include "kernels/common.opencl"
+      #include "kernels/common.opencl"
    ;
    // Collects the parameters for this device in the form of defines, and adds the precision
    auto defines = db_.GetDefines();
    defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-    auto source_string = defines + common_header + routine_source;
+
    // Adds the name of the routine as a define
    defines += "#define ROUTINE_"+routine_name_+"\n";
    // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
    // performance, but might result in a reduced accuracy.
    if (device_.Vendor() == "AMD") {
      defines += "#define USE_CL_MAD 1\n";
    }
    // Combines everything together into a single source string
    auto source_string = defines + common_header + source_string_;
    // Compiles the kernel
    try {
@ -85,7 +97,7 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
      if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
      // Store the compiled program in the cache
-      program_cache_.push_back({program, device_name_, precision_, routines_});
+      program_cache_.push_back({program, device_name_, precision_, routine_name_});
    } catch (...) { return StatusCode::kBuildProgramFailure; }
  }
@ -202,19 +214,22 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
 // =================================================================================================
-// Copies a matrix and pads it with zeros
+// Copies or transposes a matrix and pads/unpads it with zeros
 StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
                                           const size_t src_ld, const size_t src_offset,
                                           const Buffer &src,
                                           const size_t dest_one, const size_t dest_two,
                                           const size_t dest_ld, const size_t dest_offset,
                                           const Buffer &dest,
                                           const Program &program, const bool do_pad,
                                           const bool do_transpose, const bool do_conjugate,
-                                           const bool pad, const Program &program) {
+                                           const bool upper, const bool lower,
                                           const bool diagonal_imag_zero) {
  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
  // Determines the right kernel
  auto kernel_name = std::string{};
@ -227,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
    }
    else {
      use_fast_kernel = false;
-      kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
+      kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
    }
  }
  else {
@ -239,7 +254,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
    }
    else {
      use_fast_kernel = false;
-      kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
+      kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
    }
  }
@ -264,9 +279,14 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
      kernel.SetArgument(7, static_cast<int>(dest_ld));
      kernel.SetArgument(8, static_cast<int>(dest_offset));
      kernel.SetArgument(9, dest());
-      if (pad) {
+      if (do_pad) {
        kernel.SetArgument(10, static_cast<int>(do_conjugate));
      }
      else {
        kernel.SetArgument(10, static_cast<int>(upper));
        kernel.SetArgument(11, static_cast<int>(lower));
        kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
      }
    }
    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
@ -310,7 +330,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
 // otherwise.
 const Program& Routine::GetProgramFromCache() const {
  for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
+    if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
      return cached_program.program;
    }
  }
@ -320,7 +340,7 @@ const Program& Routine::GetProgramFromCache() const {
 // Queries the cache to see whether or not the compiled kernel is already there
 bool Routine::ProgramIsInCache() const {
  for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
+    if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
  }
  return false;
 }
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@ -11,7 +11,7 @@
 //
 // =================================================================================================
-#include "internal/routines/xaxpy.h"
+#include "internal/routines/level1/xaxpy.h"
 #include <string>
 #include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xaxpy"}, precision_) {
+    Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
  source_string_ =
    #include "../../kernels/xaxpy.opencl"
  ;
 }
 // =================================================================================================
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@ -11,7 +11,7 @@
 //
 // =================================================================================================
-#include "internal/routines/xgemv.h"
+#include "internal/routines/level2/xgemv.h"
 #include <string>
 #include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xgemv"}, precision_) {
+    Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
  source_string_ =
    #include "../../kernels/xgemv.opencl"
  ;
 }
 // =================================================================================================
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@ -11,7 +11,7 @@
 //
 // =================================================================================================
-#include "internal/routines/xgemm.h"
+#include "internal/routines/level3/xgemm.h"
 #include <string>
 #include <vector>
@ -30,7 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+    Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/copy.opencl"
    #include "../../kernels/pad.opencl"
    #include "../../kernels/transpose.opencl"
    #include "../../kernels/padtranspose.opencl"
    #include "../../kernels/xgemm.opencl"
  ;
 }
 // =================================================================================================
@ -95,31 +102,48 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
  auto n_ceiled = Ceil(n, db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);
-  // Allocates space on the device for padded and/or transposed input and output matrices.
+  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
  try {
    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
    // Loads the program from the database
    auto& program = GetProgramFromCache();
-    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
+    // Determines whether or not temporary matrices are needed
-    // them up until they reach a certain multiple of size (kernel parameter dependent).
+    auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                     a_do_transpose == false && a_conjugate == false;
-                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
+    auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
-                                    a_do_transpose, a_conjugate, true, program);
+                     b_do_transpose == false && b_conjugate == false;
-    if (ErrorIn(status)) { return status; }
+    auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
-    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+                     c_do_transpose == false;
                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
                                    b_do_transpose, b_conjugate, true, program);
    if (ErrorIn(status)) { return status; }
-    // Only necessary for matrix C if it used both as input and output
+    // Creates the temporary matrices
-    if (beta != static_cast<T>(0)) {
+    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
    auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
    // case nothing has to be done, these kernels can be skipped.
    if (!a_no_temp) {
      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                      m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
                                      program, true, a_do_transpose, a_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    // As above, but now for matrix B
    if (!b_no_temp) {
      status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                      program, true, b_do_transpose, b_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    // As above, but now for matrix C. This is only necessary if C is used both as input and output.
    if (!c_no_temp && beta != static_cast<T>(0)) {
      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
-                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
-                                      c_do_transpose, false, true, program);
+                                      program, true, c_do_transpose, false);
      if (ErrorIn(status)) { return status; }
    }
@ -133,9 +157,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
      kernel.SetArgument(2, static_cast<int>(k_ceiled));
      kernel.SetArgument(3, alpha);
      kernel.SetArgument(4, beta);
-      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(5, a_temp());
-      kernel.SetArgument(6, temp_b());
+      kernel.SetArgument(6, b_temp());
-      kernel.SetArgument(7, temp_c());
+      kernel.SetArgument(7, c_temp());
      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
@ -148,11 +172,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }
-      // Runs the post-processing kernel
+      // Runs the post-processing kernel if needed
-      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+      if (!c_no_temp) {
-                                      c_one, c_two, c_ld, c_offset, c_buffer,
+        status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
-                                      c_do_transpose, false, false, program);
+                                        c_one, c_two, c_ld, c_offset, c_buffer,
-      if (ErrorIn(status)) { return status; }
+                                        program, false, c_do_transpose, false);
        if (ErrorIn(status)) { return status; }
      }
      // Successfully finished the computation
      return StatusCode::kSuccess;
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@ -0,0 +1,130 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhemm class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xhemm.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Constructor: has no work of its own, the command-queue and event are handed to the Xgemm base
 template <typename T>
 Xhemm<T>::Xhemm(CommandQueue &queue, Event &event)
     : Xgemm<T>(queue, event) {}
 // =================================================================================================
 // The main routine. Expands the hermitian matrix A into a temporary general (squared) k-by-k
 // matrix using the HermUpperToSquared/HermLowerToSquared kernel and subsequently runs the regular
 // Xgemm routine on it.
 //
 // Arguments:
 //   layout, side, triangle: storage order, the side on which the hermitian matrix is multiplied,
 //                           and the triangle of A that is selected
 //   m, n:                   the dimensions passed on to DoGemm; both need to be non-zero
 //   alpha, beta:            scalars, passed on to DoGemm unmodified
 //   a_buffer/offset/ld:     the hermitian matrix, tested as a k-by-k matrix
 //   b_buffer/offset/ld:     the second input matrix, validated by DoGemm
 //   c_buffer/offset/ld:     the input/output matrix, validated by DoGemm
 //
 // Returns kInvalidDimension when m or n is zero, the status of a failing matrix-test or kernel
 // launch, kInvalidKernel or kTempBufferAllocFailure when an exception is caught, and the status
 // of the Xgemm routine otherwise.
 template <typename T>
 StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                            const T beta,
                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
  auto k = (side == Side::kLeft) ? m : n;
  // Checks for validity of the squared A matrix
  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
  // Temporary buffer for a copy of the hermitian matrix
  try {
    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
    // routine afterwards
    try {
      auto& program = GetProgramFromCache();
      auto kernel = Kernel(program, kernel_name);
      // Sets the arguments for the hermitian-to-squared kernel: first the source (size, leading
      // dimension, offset, buffer), then the destination, which is a tightly packed k-by-k matrix
      // without offset
      kernel.SetArgument(0, static_cast<int>(k));
      kernel.SetArgument(1, static_cast<int>(a_ld));
      kernel.SetArgument(2, static_cast<int>(a_offset));
      kernel.SetArgument(3, a_buffer());
      kernel.SetArgument(4, static_cast<int>(k));
      kernel.SetArgument(5, static_cast<int>(k));
      kernel.SetArgument(6, static_cast<int>(0));
      kernel.SetArgument(7, temp_herm());
      // Uses the common padding kernel's thread configuration. This is allowed, since the
      // hermitian-to-squared kernel uses the same parameters.
      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }
      // Runs the regular Xgemm code with either "C := AB+C" or ...
      if (side == Side::kLeft) {
        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
                        m, n, k,
                        alpha,
                        temp_herm, 0, k,
                        b_buffer, b_offset, b_ld,
                        beta,
                        c_buffer, c_offset, c_ld);
      }
      // ... with "C := BA+C". Note that A and B are now reversed.
      else {
        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
                        m, n, k,
                        alpha,
                        b_buffer, b_offset, b_ld,
                        temp_herm, 0, k,
                        beta,
                        c_buffer, c_offset, c_ld);
        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
        switch(status) {
          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
          // All other status codes (including success) are not specific to A or B: keep them as-is
          default: break;
        }
      }
      // Return the status of the Xgemm routine
      return status;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class
 template class Xhemm<float2>;
 template class Xhemm<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@ -0,0 +1,207 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher2k class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xher2k.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Explicit specializations of the static precision_ member: selects the Precision enumerator that
 // belongs to each supported pair of template arguments (complex type T, real type U)
 template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards to base class constructor, passing "HER2K" as the name of the routine
 // together with a list of kernel names and the precision. Afterwards, the OpenCL source is
 // assembled: each included file supplies a string literal, and these adjacent literals are
 // concatenated into source_string_ in the order in which they are listed below.
 template <typename T, typename U>
 Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
    Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/copy.opencl"
    #include "../../kernels/pad.opencl"
    #include "../../kernels/transpose.opencl"
    #include "../../kernels/padtranspose.opencl"
    #include "../../kernels/xgemm.opencl"
  ;
 }
 // =================================================================================================
 // The main routine. Validates the A, B and C matrices, creates padded (and, where needed,
 // transposed and/or conjugated) versions of A and B plus a padded copy of C, launches the
 // XgemmUpper or XgemmLower kernel twice on the same C copy (the second time with the roles of A
 // and B swapped, a conjugated alpha and beta set to one), and finally runs the post-processing
 // kernel to write the result back into the original C buffer.
 //
 // Arguments:
 //   layout, triangle, ab_transpose: storage order, the triangle of C that is selected, and
 //                                   whether A and B are given in (conjugate) transposed form
 //   n, k:                           the matrix dimensions; both need to be non-zero
 //   alpha:                          complex scalar (type T)
 //   a_buffer/offset/ld:             the first input matrix
 //   b_buffer/offset/ld:             the second input matrix, of the same dimensions as A
 //   beta:                           real scalar (type U), extended to a complex value with a
 //                                   zero imaginary part
 //   c_buffer/offset/ld:             the n-by-n input/output matrix
 //
 // Returns kInvalidDimension when n or k is zero, the status of a failing matrix-test, pre/post-
 // processing step or kernel launch, kInvalidKernel or kTempBufferAllocFailure when an exception
 // is caught, and kSuccess otherwise.
 template <typename T, typename U>
 StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                                const size_t n, const size_t k,
                                const T alpha,
                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                                const U beta,
                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
  // Makes sure all dimensions are larger than zero
  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
  // to matrix A (argument: conjugate transpose)
  auto ab_conjugate = (ab_transpose != Transpose::kNo);
  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
                    (layout == Layout::kRowMajor && !ab_conjugate);
  auto c_rotated = (layout == Layout::kRowMajor);
  // Computes the first and second dimensions of the A and B matrices taking the layout into account
  auto ab_one = (ab_rotated) ? k : n;
  auto ab_two = (ab_rotated) ? n : k;
  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  // Calculates the ceiled versions of n and k
  auto n_ceiled = Ceil(n, db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);
  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
  try {
    // Loads the program from the database
    auto& program = GetProgramFromCache();
    // Determines whether or not temporary matrices are needed. Two versions of both A and B are
    // used: version 1 is pre-processed with 'ab_conjugate' and version 2 with '!ab_conjugate'. A
    // version can use the original buffer directly only if it is already of the ceiled size,
    // has no offset, is not rotated and does not need to be conjugated.
    auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                      ab_rotated == false && ab_conjugate == false;
    auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                      ab_rotated == false && ab_conjugate == true;
    auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
                      ab_rotated == false && ab_conjugate == false;
    auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
                      ab_rotated == false && ab_conjugate == true;
    // Creates the temporary matrices. The copy of matrix C is always created.
    auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
    // to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
    // case nothing has to be done, these kernels can be skipped.
    if (!a1_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
                                      program, true, ab_rotated, ab_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    if (!a2_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
                                      program, true, ab_rotated, !ab_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    if (!b1_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
                                      program, true, ab_rotated, ab_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    if (!b2_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
                                      program, true, ab_rotated, !ab_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
    // modify the other triangle.
    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                    program, true, c_rotated, false);
    if (ErrorIn(status)) { return status; }
    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
    try {
      auto kernel = Kernel(program, kernel_name);
      // Sets the kernel arguments for the first run: version 1 of A and version 2 of B. The real
      // beta is extended to a complex value with a zero imaginary part.
      auto complex_beta = T{beta, static_cast<U>(0.0)};
      kernel.SetArgument(0, static_cast<int>(n_ceiled));
      kernel.SetArgument(1, static_cast<int>(k_ceiled));
      kernel.SetArgument(2, alpha);
      kernel.SetArgument(3, complex_beta);
      kernel.SetArgument(4, a1_temp());
      kernel.SetArgument(5, b2_temp());
      kernel.SetArgument(6, c_temp());
      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
      };
      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
      // Launches the kernel
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }
      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha. Arguments
      // 0, 1 and 6 (the sizes and the C copy) are left as set for the first run.
      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
      kernel.SetArgument(2, conjugate_alpha);
      kernel.SetArgument(3, complex_one);
      kernel.SetArgument(4, b1_temp());
      kernel.SetArgument(5, a2_temp());
      // Runs the kernel again
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }
      // Runs the post-processing kernel. The upper/lower flags follow the requested triangle and
      // the final 'true' is passed as the 'diagonal_imag_zero' argument.
      auto upper = (triangle == Triangle::kUpper);
      auto lower = (triangle == Triangle::kLower);
      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                      n, n, c_ld, c_offset, c_buffer,
                                      program, false, c_rotated, false, upper, lower, true);
      if (ErrorIn(status)) { return status; }
      // Successfully finished the computation
      return StatusCode::kSuccess;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class
 template class Xher2k<float2,float>;
 template class Xher2k<double2,double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@ -0,0 +1,175 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xherk class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xherk.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Specific implementations to get the memory-type based on a template argument: explicit
 // specializations of the static 'precision_' member, one per supported <T,U> combination. The
 // constructor in this file passes this value on to the Routine base class.
 template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards the queue and event to the Routine base class constructor, together with
 // the routine name "HERK", a list of names ("Copy", "Pad", "Transpose", "PadTranspose", "Xgemm")
 // and the precision of this instantiation. The body stores the OpenCL source in 'source_string_'.
 // NOTE(review): the name list presumably selects the tuning-parameter sets loaded into 'db_' --
 // confirm against the Routine class.
 template <typename T, typename U>
 Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
    Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
  // The kernel files are textually included in the middle of this assignment. NOTE(review): this
  // presumably relies on every .opencl file expanding to a string literal, so that the adjacent
  // literals are concatenated by the compiler -- confirm against the kernel files. The includes
  // must stay on their own lines and in this order.
  source_string_ =
    #include "../../kernels/copy.opencl"
    #include "../../kernels/pad.opencl"
    #include "../../kernels/transpose.opencl"
    #include "../../kernels/padtranspose.opencl"
    #include "../../kernels/xgemm.opencl"
  ;
 }
 // =================================================================================================
 // The main routine: Hermitian rank-K update of the n-by-n matrix C using a single input matrix A
 // and real-valued scalars 'alpha' and 'beta'. The steps are:
 //   1) validate the dimensions and the A and C buffers,
 //   2) create two padded copies of A with complementary conjugation (unless A can be used as-is),
 //   3) create a padded copy of C,
 //   4) run the XgemmUpper or XgemmLower kernel on the padded copies,
 //   5) copy the requested triangle of the result back into the C buffer.
 // Returns:
 //   kInvalidDimension        if n or k is zero
 //   the status of a failed matrix test, pre/post-processing step or kernel launch
 //   kInvalidKernel           if an exception is thrown while creating/running the GEMM kernel
 //   kTempBufferAllocFailure  if an exception is thrown while creating the temporary buffers or
 //                            running the pre-processing steps
 //   kSuccess                 otherwise
 template <typename T, typename U>
 StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                              const size_t n, const size_t k,
                              const U alpha,
                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                              const U beta,
                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }

  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
  // to matrix A (argument: conjugate transpose). Exactly one of the two flags is set.
  auto a_conjugate = (a_transpose != Transpose::kNo);
  auto b_conjugate = (a_transpose == Transpose::kNo);

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
                   (layout == Layout::kRowMajor && !a_conjugate);
  auto c_rotated = (layout == Layout::kRowMajor);

  // Computes the first and second dimensions of the A matrix taking the layout into account
  auto a_one = (a_rotated) ? k : n;
  auto a_two = (a_rotated) ? n : k;

  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Calculates the ceiled versions of n and k. The padded C matrix is square, and the thread
  // configuration below divides its first dimension by MWG and its second by NWG. The padded n
  // therefore has to be a multiple of both: rounding to NWG only would make the division by MWG
  // truncate, yielding a global size that is not a multiple of the local size.
  // NOTE(review): the nested rounding gives a common multiple when one of MWG/NWG divides the
  // other (e.g. power-of-two tuning parameters) -- confirm this holds for the database.
  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);

  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
  try {

    // Loads the program from the database
    auto& program = GetProgramFromCache();

    // Determines whether or not temporary matrices are needed. Since exactly one of the conjugate
    // flags is set, at most one of the two copies can be skipped.
    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                     a_rotated == false && a_conjugate == false;
    auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                     a_rotated == false && b_conjugate == false;

    // Creates the temporary matrices
    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));

    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
    // case nothing has to be done, these kernels can be skipped. Two copies are created: both
    // read from the A buffer, they only differ in the conjugation flag.
    if (!a_no_temp) {
      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                      program, true, a_rotated, a_conjugate);
      if (ErrorIn(status)) { return status; }
    }
    if (!b_no_temp) {
      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                      program, true, a_rotated, b_conjugate);
      if (ErrorIn(status)) { return status; }
    }

    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
    // modify the other triangle.
    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                    program, true, c_rotated, false);
    if (ErrorIn(status)) { return status; }

    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
    try {
      auto kernel = Kernel(program, kernel_name);

      // Sets the kernel arguments. The kernel takes scalars of the matrix type T, so the real-
      // valued alpha and beta are extended with a zero imaginary part.
      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
      auto complex_beta = T{beta, static_cast<U>(0.0)};
      kernel.SetArgument(0, static_cast<int>(n_ceiled));
      kernel.SetArgument(1, static_cast<int>(k_ceiled));
      kernel.SetArgument(2, complex_alpha);
      kernel.SetArgument(3, complex_beta);
      kernel.SetArgument(4, a_temp());
      kernel.SetArgument(5, b_temp());
      kernel.SetArgument(6, c_temp());

      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
      };
      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

      // Launches the kernel
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Runs the post-processing kernel: copies the result from the padded temporary back into the
      // C buffer. NOTE(review): the trailing 'upper, lower, true' arguments presumably restrict the
      // copy to the requested triangle and force a real-valued diagonal -- confirm against
      // PadCopyTransposeMatrix.
      auto upper = (triangle == Triangle::kUpper);
      auto lower = (triangle == Triangle::kLower);
      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                      n, n, c_ld, c_offset, c_buffer,
                                      program, false, c_rotated, false, upper, lower, true);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation
      return StatusCode::kSuccess;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the two supported type combinations
 // (complex matrix type T paired with its real scalar type U). These match the two 'precision_'
 // specializations defined at the top of this file.
 template class Xherk<float2,float>;
 template class Xherk<double2,double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@ -11,7 +11,7 @@
 //
 // =================================================================================================
-#include "internal/routines/xsymm.h"
+#include "internal/routines/level3/xsymm.h"
 #include <string>
 #include <vector>
@ -42,14 +42,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
-  size_t k = (side == Side::kLeft) ? m : n;
+  auto k = (side == Side::kLeft) ? m : n;
  // Checks for validity of the squared A matrix
  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
@ -75,7 +75,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
      kernel.SetArgument(7, temp_symm());
      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetry-to-squared kernel uses the same parameters.
+      // symmetric-to-squared kernel uses the same parameters.
      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@ -0,0 +1,186 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr2k class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xsyr2k.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Specific implementations to get the memory-type based on a template argument: explicit
 // specializations of the static 'precision_' member, one per supported element type T. The
 // constructor in this file passes this value on to the Routine base class.
 template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
 template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
 template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards the queue and event to the Routine base class constructor, together with
 // the routine name "SYR2K", a list of names ("Copy", "Pad", "Transpose", "PadTranspose", "Xgemm")
 // and the precision of this instantiation. The body stores the OpenCL source in 'source_string_'.
 // NOTE(review): the name list presumably selects the tuning-parameter sets loaded into 'db_' --
 // confirm against the Routine class.
 template <typename T>
 Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
    Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
  // The kernel files are textually included in the middle of this assignment. NOTE(review): this
  // presumably relies on every .opencl file expanding to a string literal, so that the adjacent
  // literals are concatenated by the compiler -- confirm against the kernel files. The includes
  // must stay on their own lines and in this order.
  source_string_ =
    #include "../../kernels/copy.opencl"
    #include "../../kernels/pad.opencl"
    #include "../../kernels/transpose.opencl"
    #include "../../kernels/padtranspose.opencl"
    #include "../../kernels/xgemm.opencl"
  ;
 }
 // =================================================================================================
 // The main routine: symmetric rank-2K update of the n-by-n matrix C using the input matrices A
 // and B. The steps are:
 //   1) validate the dimensions and the A, B and C buffers,
 //   2) create padded copies of A and B (unless they can be used as-is) and of C,
 //   3) run the XgemmUpper or XgemmLower kernel twice: first with (A, B) and the user's 'beta',
 //      then with the roles of A and B swapped and 'beta' set to one so the second product is
 //      accumulated on top of the first,
 //   4) copy the requested triangle of the result back into the C buffer.
 // Returns:
 //   kInvalidDimension        if n or k is zero
 //   the status of a failed matrix test, pre/post-processing step or kernel launch
 //   kInvalidKernel           if an exception is thrown while creating/running the GEMM kernel
 //   kTempBufferAllocFailure  if an exception is thrown while creating the temporary buffers or
 //                            running the pre-processing steps
 //   kSuccess                 otherwise
 template <typename T>
 StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                              const size_t n, const size_t k,
                              const T alpha,
                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                              const T beta,
                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
  auto c_rotated = (layout == Layout::kRowMajor);

  // Computes the first and second dimensions of the A and B matrices taking the layout into account
  auto ab_one = (ab_rotated) ? k : n;
  auto ab_two = (ab_rotated) ? n : k;

  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Calculates the ceiled versions of n and k. The padded C matrix is square, and the thread
  // configuration below divides its first dimension by MWG and its second by NWG. The padded n
  // therefore has to be a multiple of both: rounding to NWG only would make the division by MWG
  // truncate, yielding a global size that is not a multiple of the local size.
  // NOTE(review): the nested rounding gives a common multiple when one of MWG/NWG divides the
  // other (e.g. power-of-two tuning parameters) -- confirm this holds for the database.
  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);

  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
  try {

    // Loads the program from the database
    auto& program = GetProgramFromCache();

    // Determines whether or not temporary matrices are needed
    auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                     ab_rotated == false;
    auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
                     ab_rotated == false;

    // Creates the temporary matrices
    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));

    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
    // to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
    // case nothing has to be done, these kernels can be skipped.
    if (!a_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                      program, true, ab_rotated, false);
      if (ErrorIn(status)) { return status; }
    }
    if (!b_no_temp) {
      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                      program, true, ab_rotated, false);
      if (ErrorIn(status)) { return status; }
    }

    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
    // modify the other triangle.
    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                    program, true, c_rotated, false);
    if (ErrorIn(status)) { return status; }

    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
    try {
      auto kernel = Kernel(program, kernel_name);

      // Sets the kernel arguments
      kernel.SetArgument(0, static_cast<int>(n_ceiled));
      kernel.SetArgument(1, static_cast<int>(k_ceiled));
      kernel.SetArgument(2, alpha);
      kernel.SetArgument(3, beta);
      kernel.SetArgument(4, a_temp());
      kernel.SetArgument(5, b_temp());
      kernel.SetArgument(6, c_temp());

      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
      };
      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

      // Launches the kernel
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Swaps the arguments for matrices A and B, and sets 'beta' to 1. The other arguments
      // (sizes, alpha, C) keep the values set for the first launch.
      auto one = static_cast<T>(1);
      kernel.SetArgument(3, one);
      kernel.SetArgument(4, b_temp());
      kernel.SetArgument(5, a_temp());

      // Runs the kernel again
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Runs the post-processing kernel: copies the result from the padded temporary back into the
      // C buffer. NOTE(review): the trailing 'upper, lower, false' arguments presumably restrict
      // the copy to the requested triangle -- confirm against PadCopyTransposeMatrix.
      auto upper = (triangle == Triangle::kUpper);
      auto lower = (triangle == Triangle::kLower);
      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                      n, n, c_ld, c_offset, c_buffer,
                                      program, false, c_rotated, false, upper, lower, false);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation
      return StatusCode::kSuccess;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the four supported element types
 // (real and complex, single and double precision). These match the four 'precision_'
 // specializations defined at the top of this file.
 template class Xsyr2k<float>;
 template class Xsyr2k<double>;
 template class Xsyr2k<float2>;
 template class Xsyr2k<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@ -0,0 +1,163 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyrk class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xsyrk.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Specific implementations to get the memory-type based on a template argument: explicit
 // specializations of the static 'precision_' member, one per supported element type T. The
 // constructor in this file passes this value on to the Routine base class.
 template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
 template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
 template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards the queue and event to the Routine base class constructor, together with
 // the routine name "SYRK", a list of names ("Copy", "Pad", "Transpose", "PadTranspose", "Xgemm")
 // and the precision of this instantiation. The body stores the OpenCL source in 'source_string_'.
 // NOTE(review): the name list presumably selects the tuning-parameter sets loaded into 'db_' --
 // confirm against the Routine class.
 template <typename T>
 Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
    Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
  // The kernel files are textually included in the middle of this assignment. NOTE(review): this
  // presumably relies on every .opencl file expanding to a string literal, so that the adjacent
  // literals are concatenated by the compiler -- confirm against the kernel files. The includes
  // must stay on their own lines and in this order.
  source_string_ =
    #include "../../kernels/copy.opencl"
    #include "../../kernels/pad.opencl"
    #include "../../kernels/transpose.opencl"
    #include "../../kernels/padtranspose.opencl"
    #include "../../kernels/xgemm.opencl"
  ;
 }
 // =================================================================================================
 // The main routine: symmetric rank-K update of the n-by-n matrix C using a single input matrix A.
 // The steps are:
 //   1) validate the dimensions and the A and C buffers,
 //   2) create a padded copy of A (unless it can be used as-is) and of C,
 //   3) run the XgemmUpper or XgemmLower kernel with the copy of A bound to both input arguments,
 //   4) copy the requested triangle of the result back into the C buffer.
 // Returns:
 //   kInvalidDimension        if n or k is zero
 //   the status of a failed matrix test, pre/post-processing step or kernel launch
 //   kInvalidKernel           if an exception is thrown while creating/running the GEMM kernel
 //   kTempBufferAllocFailure  if an exception is thrown while creating the temporary buffers or
 //                            running the pre-processing steps
 //   kSuccess                 otherwise
 template <typename T>
 StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                            const size_t n, const size_t k,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const T beta,
                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
  auto c_rotated = (layout == Layout::kRowMajor);

  // Computes the first and second dimensions of the A matrix taking the layout into account
  auto a_one = (a_rotated) ? k : n;
  auto a_two = (a_rotated) ? n : k;

  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Calculates the ceiled versions of n and k. The padded C matrix is square, and the thread
  // configuration below divides its first dimension by MWG and its second by NWG. The padded n
  // therefore has to be a multiple of both: rounding to NWG only would make the division by MWG
  // truncate, yielding a global size that is not a multiple of the local size.
  // NOTE(review): the nested rounding gives a common multiple when one of MWG/NWG divides the
  // other (e.g. power-of-two tuning parameters) -- confirm this holds for the database.
  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);

  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
  try {

    // Loads the program from the database
    auto& program = GetProgramFromCache();

    // Determines whether or not temporary matrices are needed
    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
                     a_rotated == false;

    // Creates the temporary matrices
    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));

    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
    // case nothing has to be done, these kernels can be skipped.
    if (!a_no_temp) {
      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                      program, true, a_rotated, false);
      if (ErrorIn(status)) { return status; }
    }

    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
    // modify the other triangle.
    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                    program, true, c_rotated, false);
    if (ErrorIn(status)) { return status; }

    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
    try {
      auto kernel = Kernel(program, kernel_name);

      // Sets the kernel arguments. The same (padded) A matrix is bound to both input arguments
      // (4 and 5) of the kernel.
      kernel.SetArgument(0, static_cast<int>(n_ceiled));
      kernel.SetArgument(1, static_cast<int>(k_ceiled));
      kernel.SetArgument(2, alpha);
      kernel.SetArgument(3, beta);
      kernel.SetArgument(4, a_temp());
      kernel.SetArgument(5, a_temp());
      kernel.SetArgument(6, c_temp());

      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
      };
      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

      // Launches the kernel
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Runs the post-processing kernel: copies the result from the padded temporary back into the
      // C buffer. NOTE(review): the trailing 'upper, lower, false' arguments presumably restrict
      // the copy to the requested triangle -- confirm against PadCopyTransposeMatrix.
      auto upper = (triangle == Triangle::kUpper);
      auto lower = (triangle == Triangle::kLower);
      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                      n, n, c_ld, c_offset, c_buffer,
                                      program, false, c_rotated, false, upper, lower, false);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation
      return StatusCode::kSuccess;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the four supported element types
 // (real and complex, single and double precision). These match the four 'precision_'
 // specializations defined at the top of this file.
 template class Xsyrk<float>;
 template class Xsyrk<double>;
 template class Xsyrk<float2>;
 template class Xsyrk<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@ -0,0 +1,135 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xtrmm class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level3/xtrmm.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Constructor: has no state of its own to set up, so the queue and event are simply handed over
 // to the Xgemm base class constructor
 template <typename T>
 Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event): Xgemm<T>(queue, event) {}
 // =================================================================================================
 // The main routine: triangular matrix-matrix multiplication, overwriting B with either
 // alpha*A*B (side == left) or alpha*B*A (side == right). The steps are:
 //   1) validate the dimensions and the A buffer,
 //   2) run the TrmmUpperToSquared or TrmmLowerToSquared kernel to expand A into a temporary k-by-k
 //      general matrix,
 //   3) run the regular Xgemm routine on the temporary matrix and B, with 'beta' set to zero and
 //      B as the output.
 // Returns:
 //   kInvalidDimension        if m or n is zero
 //   the status of a failed matrix test or kernel launch
 //   the status of the Xgemm routine (with A/B error codes swapped back when side == right)
 //   kInvalidKernel           if an exception is thrown after the temporary buffer was created
 //   kTempBufferAllocFailure  if an exception is thrown while creating the temporary buffer
 template <typename T>
 StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
                            const Transpose a_transpose, const Diagonal diagonal,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }

  // Computes the k dimension. This is based on whether or not the triangular matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
  auto k = (side == Side::kLeft) ? m : n;

  // Checks for validity of the triangular A matrix
  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";

  // Determines whether or not the triangular matrix is unit-diagonal
  auto unit_diagonal = (diagonal == Diagonal::kUnit);

  // Temporary buffer for a copy of the triangular matrix
  try {
    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));

    // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
    // routine afterwards
    try {
      auto& program = GetProgramFromCache();
      auto kernel = Kernel(program, kernel_name);

      // Sets the arguments for the triangular-to-squared kernel. NOTE(review): presumably these
      // are (source size, source ld, source offset, source buffer, destination size, destination
      // ld, destination offset, destination buffer, unit-diagonal flag) -- confirm against the
      // kernel source.
      kernel.SetArgument(0, static_cast<int>(k));
      kernel.SetArgument(1, static_cast<int>(a_ld));
      kernel.SetArgument(2, static_cast<int>(a_offset));
      kernel.SetArgument(3, a_buffer());
      kernel.SetArgument(4, static_cast<int>(k));
      kernel.SetArgument(5, static_cast<int>(k));
      kernel.SetArgument(6, static_cast<int>(0));
      kernel.SetArgument(7, temp_triangular());
      kernel.SetArgument(8, static_cast<int>(unit_diagonal));

      // Uses the common padding kernel's thread configuration. This is allowed, since the
      // triangular-to-squared kernel uses the same parameters.
      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // NOTE(review): in both calls below the B buffer is passed as an input and as the output of
      // DoGemm. This is only safe if DoGemm never reads B after it has started writing the
      // result -- confirm against the Xgemm routine.

      // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
      if (side == Side::kLeft) {
        status = DoGemm(layout, a_transpose, Transpose::kNo,
                        m, n, k,
                        alpha,
                        temp_triangular, 0, k,
                        b_buffer, b_offset, b_ld,
                        static_cast<T>(0.0),
                        b_buffer, b_offset, b_ld);
      }

      // ... with "B := alpha*B*A". Note that A and B are now reversed.
      else {
        status = DoGemm(layout, Transpose::kNo, a_transpose,
                        m, n, k,
                        alpha,
                        b_buffer, b_offset, b_ld,
                        temp_triangular, 0, k,
                        static_cast<T>(0.0),
                        b_buffer, b_offset, b_ld);

        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
        switch(status) {
          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
          // All other status codes (including success) do not refer to A or B: pass them through
          default: break;
        }
      }

      // Return the status of the Xgemm routine
      return status;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the four supported element types
 // (real and complex, single and double precision), so that the member definitions in this
 // translation unit are emitted for them.
 template class Xtrmm<float>;
 template class Xtrmm<double>;
 template class Xtrmm<float2>;
 template class Xtrmm<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/tuning/copy.cc
+++ b/src/tuning/copy.cc
@ -30,11 +30,10 @@ void CopyTune(const Arguments<T> &args,
  // This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
  // of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
  // chosen as a representative.
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/copy.opencl"
-  #include "../src/kernels/copy.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
--- a/src/tuning/pad.cc
+++ b/src/tuning/pad.cc
@ -30,11 +30,10 @@ void PadTune(const Arguments<T> &args,
  // This points to the PadMatrix kernel as found in the CLBlast library. This is just one
  // example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
  // to be chosen as a representative.
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/pad.opencl"
-  #include "../src/kernels/pad.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
--- a/src/tuning/padtranspose.cc
+++ b/src/tuning/padtranspose.cc
@ -30,11 +30,10 @@ void PadTransposeTune(const Arguments<T> &args,
  // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
  // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
  // to be chosen as a representative.
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/padtranspose.opencl"
-  #include "../src/kernels/padtranspose.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
--- a/src/tuning/transpose.cc
+++ b/src/tuning/transpose.cc
@ -30,11 +30,10 @@ void TransposeTune(const Arguments<T> &args,
  // This points to the TransposeMatrix kernel as found in the CLBlast library. This is just one
  // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
  // to be chosen as a representative.
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/transpose.opencl"
-  #include "../src/kernels/transpose.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
@ -42,6 +41,7 @@ void TransposeTune(const Arguments<T> &args,
  tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
  tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
  tuner.AddParameter(id, "TRA_PAD", {0, 1});
  tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
  // Tests for a specific precision
  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
--- a/src/tuning/xaxpy.cc
+++ b/src/tuning/xaxpy.cc
@ -34,11 +34,10 @@ void XaxpyTune(const Arguments<T> &args,
  }
  // This points to the XaxpyFast kernel as found in the CLBlast library
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/xaxpy.opencl"
-  #include "../src/kernels/xaxpy.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
  tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/xgemm.cc
@ -30,11 +30,10 @@ void XgemmTune(const Arguments<T> &args,
               cltune::Tuner &tuner) {
  // This points to the Xgemm kernel as found in the CLBlast library and its golden reference
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/xgemm.opencl"
-  #include "../src/kernels/xgemm.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@ -36,11 +36,10 @@ void XgemvTune(const Arguments<T> &args, const size_t variation,
  auto a_rotated = (variation == 3) ? 1 : 0;
  // This points to the Xgemv kernel as found in the CLBlast library
-  std::string common_source =
+  std::string sources =
-  #include "../src/kernels/common.opencl"
+    #include "../src/kernels/common.opencl"
-  std::string kernel_source =
+    #include "../src/kernels/xgemv.opencl"
-  #include "../src/kernels/xgemv.opencl"
+  ;
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
  tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
--- a/src/utilities.cc
+++ b/src/utilities.cc
@ -79,6 +79,13 @@ std::string ToString(Triangle value) {
  }
 }
 template <>
 std::string ToString(Diagonal value) {
  // Formats the enumerator as "<integer value> (<name>)"
  switch(value) {
    case Diagonal::kUnit: return ToString(static_cast<int>(value))+" (unit)";
    case Diagonal::kNonUnit: return ToString(static_cast<int>(value))+" (non-unit)";
  }
  // Only reached when 'value' matches neither enumerator above (e.g. after a bad cast). Returns a
  // well-defined string instead of flowing off the end of a non-void function.
  return ToString(static_cast<int>(value))+" (unknown)";
 }
 template <>
 std::string ToString(Precision value) {
  switch(value) {
    case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
@ -143,6 +150,7 @@ template Layout GetArgument<Layout>(const int, char **, std::string&, const std:
 template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
 template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
 template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
 template Diagonal GetArgument<Diagonal>(const int, char **, std::string&, const std::string&, const Diagonal);
 template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
 // =================================================================================================
--- a/test/correctness/routines/level1/xaxpy.cc
+++ b/test/correctness/routines/level1/xaxpy.cc
@ -0,0 +1,81 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xaxpy routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level1/xaxpy.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
                       TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
                       TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
                       TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Creates the arguments vector for the regular tests
  auto regular_test_vector = std::vector<Arguments<T>>{};
  for (auto &n: tester.kVectorDims) { args.n = n;
    for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
      for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
        for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
          for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
            for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
              args.x_size = TestXaxpy<T>::GetSizeX(args);
              args.y_size = TestXaxpy<T>::GetSizeY(args);
              if (args.x_size<1 || args.y_size<1) { continue; }
              regular_test_vector.push_back(args);
            }
          }
        }
      }
    }
  }
  // Creates the arguments vector for the invalid-buffer tests
  auto invalid_test_vector = std::vector<Arguments<T>>{};
  args.n = tester.kBufferSize;
  args.x_inc = args.y_inc = 1;
  args.x_offset = args.y_offset = 0;
  for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
    for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
      invalid_test_vector.push_back(args);
    }
  }
  // Runs the tests
  const auto case_name = "default";
  tester.TestRegular(regular_test_vector, case_name);
  tester.TestInvalid(invalid_test_vector, case_name);
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // One test run per precision: only the first run passes silent=false
  const bool silent_first = false;
  const bool silent_rest = true;
  clblast::RunTest<float>(argc, argv, silent_first, "SAXPY");
  clblast::RunTest<double>(argc, argv, silent_rest, "DAXPY");
  clblast::RunTest<clblast::float2>(argc, argv, silent_rest, "CAXPY");
  clblast::RunTest<clblast::double2>(argc, argv, silent_rest, "ZAXPY");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level2/xgemv.cc
+++ b/test/correctness/routines/level2/xgemv.cc
@ -0,0 +1,99 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xgemv routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level2/xgemv.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
                       TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
                       TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
                       TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
      // Creates the arguments vector for the regular tests
      auto regular_test_vector = std::vector<Arguments<T>>{};
      for (auto &m: tester.kMatrixVectorDims) { args.m = m;
        for (auto &n: tester.kMatrixVectorDims) { args.n = n;
          for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
            for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
              for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
                for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
                  for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
                    for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
                      for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                        for (auto &beta: tester.kBetaValues) { args.beta = beta;
                          args.a_size = TestXgemv<T>::GetSizeA(args);
                          args.x_size = TestXgemv<T>::GetSizeX(args);
                          args.y_size = TestXgemv<T>::GetSizeY(args);
                          if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
                          regular_test_vector.push_back(args);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      // Creates the arguments vector for the invalid-buffer tests
      auto invalid_test_vector = std::vector<Arguments<T>>{};
      args.m = args.n = tester.kBufferSize;
      args.a_ld = tester.kBufferSize;
      args.x_inc = args.y_inc = 1;
      args.a_offset = args.x_offset = args.y_offset = 0;
      for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
        for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
          for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
            invalid_test_vector.push_back(args);
          }
        }
      }
      // Runs the tests
      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
      tester.TestRegular(regular_test_vector, case_name);
      tester.TestInvalid(invalid_test_vector, case_name);
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // One test run per precision: only the first run passes silent=false
  const bool silent_first = false;
  const bool silent_rest = true;
  clblast::RunTest<float>(argc, argv, silent_first, "SGEMV");
  clblast::RunTest<double>(argc, argv, silent_rest, "DGEMV");
  clblast::RunTest<clblast::float2>(argc, argv, silent_rest, "CGEMV");
  clblast::RunTest<clblast::double2>(argc, argv, silent_rest, "ZGEMV");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xgemm.cc
+++ b/test/correctness/routines/level3/xgemm.cc
@ -0,0 +1,102 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xgemm routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xgemm.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
                       TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
                       TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
                       TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
      for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<T>>{};
        for (auto &m: tester.kMatrixDims) { args.m = m;
          for (auto &n: tester.kMatrixDims) { args.n = n;
            for (auto &k: tester.kMatrixDims) { args.k = k;
              for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
                for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                  for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                    for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                      for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                        for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                          for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                            for (auto &beta: tester.kBetaValues) { args.beta = beta;
                              args.a_size = TestXgemm<T>::GetSizeA(args);
                              args.b_size = TestXgemm<T>::GetSizeB(args);
                              args.c_size = TestXgemm<T>::GetSizeC(args);
                              if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
                              regular_test_vector.push_back(args);
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<T>>{};
        args.m = args.n = args.k = tester.kBufferSize;
        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.b_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
              invalid_test_vector.push_back(args);
            }
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // One test run per precision: only the first run passes silent=false
  const bool silent_first = false;
  const bool silent_rest = true;
  clblast::RunTest<float>(argc, argv, silent_first, "SGEMM");
  clblast::RunTest<double>(argc, argv, silent_rest, "DGEMM");
  clblast::RunTest<clblast::float2>(argc, argv, silent_rest, "CGEMM");
  clblast::RunTest<clblast::double2>(argc, argv, silent_rest, "ZGEMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xhemm.cc
+++ b/test/correctness/routines/level3/xhemm.cc
@ -0,0 +1,98 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xhemm routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xhemm.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
                       TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
                       TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
                       TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &side: tester.kSides) { args.side = side;
      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<T>>{};
        for (auto &m: tester.kMatrixDims) { args.m = m;
          for (auto &n: tester.kMatrixDims) { args.n = n;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
                            args.a_size = TestXhemm<T>::GetSizeA(args);
                            args.b_size = TestXhemm<T>::GetSizeB(args);
                            args.c_size = TestXhemm<T>::GetSizeC(args);
                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
                            regular_test_vector.push_back(args);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<T>>{};
        args.m = args.n = tester.kBufferSize;
        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.b_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
              invalid_test_vector.push_back(args);
            }
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // Runs the Xhemm tests for both complex precisions. The first run passes silent=false and the
  // second passes silent=true, so that the first precision is not silenced.
  clblast::RunTest<clblast::float2>(argc, argv, false, "CHEMM");
  clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xher2k.cc
+++ b/test/correctness/routines/level3/xher2k.cc
@ -0,0 +1,100 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xher2k routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xher2k.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T, typename U>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
                       TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
                       TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
                       TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<U>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
      for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
        args.a_transpose = ab_transpose;                                  // valid BLAS option
        args.b_transpose = ab_transpose;
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<U>>{};
        for (auto &n: tester.kMatrixDims) { args.n = n;
          for (auto &k: tester.kMatrixDims) { args.k = k;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
                            args.a_size = TestXher2k<T,U>::GetSizeA(args);
                            args.b_size = TestXher2k<T,U>::GetSizeB(args);
                            args.c_size = TestXher2k<T,U>::GetSizeC(args);
                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
                            regular_test_vector.push_back(args);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<U>>{};
        args.n = args.k = tester.kBufferSize;
        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.b_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
              invalid_test_vector.push_back(args);
            }
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // One test run per complex precision: only the first run passes silent=false
  const bool silent_first = false;
  const bool silent_rest = true;
  clblast::RunTest<clblast::float2,float>(argc, argv, silent_first, "CHER2K");
  clblast::RunTest<clblast::double2,double>(argc, argv, silent_rest, "ZHER2K");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xherk.cc
+++ b/test/correctness/routines/level3/xherk.cc
@ -0,0 +1,92 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xherk routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xherk.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T, typename U>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
                       TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
                       TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
                       TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<U>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
      for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
        args.a_transpose = a_transpose;                                  // valid BLAS option
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<U>>{};
        for (auto &n: tester.kMatrixDims) { args.n = n;
          for (auto &k: tester.kMatrixDims) { args.k = k;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
                        args.a_size = TestXherk<T,U>::GetSizeA(args);
                        args.c_size = TestXherk<T,U>::GetSizeC(args);
                        if (args.a_size<1 || args.c_size<1) { continue; }
                        regular_test_vector.push_back(args);
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<U>>{};
        args.n = args.k = tester.kBufferSize;
        args.a_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
            invalid_test_vector.push_back(args);
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // One test run per complex precision: only the first run passes silent=false
  const bool silent_first = false;
  const bool silent_rest = true;
  clblast::RunTest<clblast::float2,float>(argc, argv, silent_first, "CHERK");
  clblast::RunTest<clblast::double2,double>(argc, argv, silent_rest, "ZHERK");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xsymm.cc
+++ b/test/correctness/routines/level3/xsymm.cc
@ -0,0 +1,100 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xsymm routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xsymm.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
                       TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
                       TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
                       TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &side: tester.kSides) { args.side = side;
      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<T>>{};
        for (auto &m: tester.kMatrixDims) { args.m = m;
          for (auto &n: tester.kMatrixDims) { args.n = n;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
                            args.a_size = TestXsymm<T>::GetSizeA(args);
                            args.b_size = TestXsymm<T>::GetSizeB(args);
                            args.c_size = TestXsymm<T>::GetSizeC(args);
                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
                            regular_test_vector.push_back(args);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<T>>{};
        args.m = args.n = tester.kBufferSize;
        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.b_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
              invalid_test_vector.push_back(args);
            }
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::RunTest<float>(argc, argv, kNotSilent, "SSYMM");
  clblast::RunTest<double>(argc, argv, kSilent, "DSYMM");
  clblast::RunTest<clblast::float2>(argc, argv, kSilent, "CSYMM");
  clblast::RunTest<clblast::double2>(argc, argv, kSilent, "ZSYMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xsyr2k.cc
+++ b/test/correctness/routines/level3/xsyr2k.cc
@ -0,0 +1,102 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xsyr2k routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xsyr2k.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
                       TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
                       TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
                       TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
        args.a_transpose = ab_transpose;                            // is not supported by clBLAS
        args.b_transpose = ab_transpose;
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<T>>{};
        for (auto &n: tester.kMatrixDims) { args.n = n;
          for (auto &k: tester.kMatrixDims) { args.k = k;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
                            args.a_size = TestXsyr2k<T>::GetSizeA(args);
                            args.b_size = TestXsyr2k<T>::GetSizeB(args);
                            args.c_size = TestXsyr2k<T>::GetSizeC(args);
                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
                            regular_test_vector.push_back(args);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<T>>{};
        args.n = args.k = tester.kBufferSize;
        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.b_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
              invalid_test_vector.push_back(args);
            }
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::RunTest<float>(argc, argv, kNotSilent, "SSYR2K");
  clblast::RunTest<double>(argc, argv, kSilent, "DSYR2K");
  clblast::RunTest<clblast::float2>(argc, argv, kSilent, "CSYR2K");
  clblast::RunTest<clblast::double2>(argc, argv, kSilent, "ZSYR2K");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xsyrk.cc
+++ b/test/correctness/routines/level3/xsyrk.cc
@ -0,0 +1,94 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xsyrk routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xsyrk.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
                       TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
                       TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
                       TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
        args.a_transpose = a_transpose;                            // is not supported by clBLAS
        // Creates the arguments vector for the regular tests
        auto regular_test_vector = std::vector<Arguments<T>>{};
        for (auto &n: tester.kMatrixDims) { args.n = n;
          for (auto &k: tester.kMatrixDims) { args.k = k;
            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
                        args.a_size = TestXsyrk<T>::GetSizeA(args);
                        args.c_size = TestXsyrk<T>::GetSizeC(args);
                        if (args.a_size<1 || args.c_size<1) { continue; }
                        regular_test_vector.push_back(args);
                      }
                    }
                  }
                }
              }
            }
          }
        }
        // Creates the arguments vector for the invalid-buffer tests
        auto invalid_test_vector = std::vector<Arguments<T>>{};
        args.n = args.k = tester.kBufferSize;
        args.a_ld = args.c_ld = tester.kBufferSize;
        args.a_offset = args.c_offset = 0;
        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
            invalid_test_vector.push_back(args);
          }
        }
        // Runs the tests
        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
        tester.TestRegular(regular_test_vector, case_name);
        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::RunTest<float>(argc, argv, kNotSilent, "SSYRK");
  clblast::RunTest<double>(argc, argv, kSilent, "DSYRK");
  clblast::RunTest<clblast::float2>(argc, argv, kSilent, "CSYRK");
  clblast::RunTest<clblast::double2>(argc, argv, kSilent, "ZSYRK");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/level3/xtrmm.cc
+++ b/test/correctness/routines/level3/xtrmm.cc
@ -0,0 +1,96 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xtrmm routine.
 //
 // =================================================================================================
 #include "correctness/testblas.h"
 #include "routines/level3/xtrmm.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester
 template <typename T>
 void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates a tester
  TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
                       TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
                       TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
                       TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
  // This variable holds the arguments relevant for this routine
  auto args = Arguments<T>{};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) { args.layout = layout;
    for (auto &side: tester.kSides) { args.side = side;
      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
        for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
          for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
            // Creates the arguments vector for the regular tests
            auto regular_test_vector = std::vector<Arguments<T>>{};
            for (auto &m: tester.kMatrixDims) { args.m = m;
              for (auto &n: tester.kMatrixDims) { args.n = n;
                for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
                  for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
                    for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
                      for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
                          args.a_size = TestXtrmm<T>::GetSizeA(args);
                          args.b_size = TestXtrmm<T>::GetSizeB(args);
                          if (args.a_size<1 || args.b_size<1) { continue; }
                          regular_test_vector.push_back(args);
                        }
                      }
                    }
                  }
                }
              }
            }
            // Creates the arguments vector for the invalid-buffer tests
            auto invalid_test_vector = std::vector<Arguments<T>>{};
            args.m = args.n = tester.kBufferSize;
            args.a_ld = args.b_ld = tester.kBufferSize;
            args.a_offset = args.b_offset = 0;
            for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
              for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
                invalid_test_vector.push_back(args);
              }
            }
            // Runs the tests
            const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
                                   ToString(a_transpose)+" "+ToString(diagonal);
            tester.TestRegular(regular_test_vector, case_name);
            tester.TestInvalid(invalid_test_vector, case_name);
          }
        }
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::RunTest<float>(argc, argv, kNotSilent, "STRMM");
  clblast::RunTest<double>(argc, argv, kSilent, "DTRMM");
  clblast::RunTest<clblast::float2>(argc, argv, kSilent, "CTRMM");
  clblast::RunTest<clblast::double2>(argc, argv, kSilent, "ZTRMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/xaxpy.cc
+++ b/test/correctness/routines/xaxpy.cc
@ -1,75 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
 //
 // =================================================================================================
 #include "wrapper_clblas.h"
 #include "correctness/testxy.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
 template <typename T>
 void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates the CLBlast lambda
  auto clblast_lambda = [](const Arguments<T> &args,
                           const Buffer &x_vec, const Buffer &y_vec,
                           CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    return Axpy(args.n, args.alpha,
                x_vec(), args.x_offset, args.x_inc,
                y_vec(), args.y_offset, args.y_inc,
                &queue_plain, &event);
  };
  // Creates the clBLAS lambda (for comparison)
  auto clblas_lambda = [](const Arguments<T> &args,
                          const Buffer &x_vec, const Buffer &y_vec,
                          CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = clblasXaxpy(args.n, args.alpha,
                              x_vec(), args.x_offset, args.x_inc,
                              y_vec(), args.y_offset, args.y_inc,
                              1, &queue_plain, 0, nullptr, &event);
    return static_cast<StatusCode>(status);
  };
  // Initializes the arguments relevant for this routine
  auto args = Arguments<T>{};
  const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
                                                kArgXOffset, kArgYOffset, kArgAlpha};
  // Creates a tester
  TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
  // Runs the tests
  const auto case_name = "default";
  tester.TestRegular(args, case_name);
  tester.TestInvalidBufferSizes(args, case_name);
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::XaxpyTest<float>(argc, argv, kNotSilent, "SAXPY");
  clblast::XaxpyTest<double>(argc, argv, kSilent, "DAXPY");
  clblast::XaxpyTest<clblast::float2>(argc, argv, kSilent, "CAXPY");
  clblast::XaxpyTest<clblast::double2>(argc, argv, kSilent, "ZAXPY");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/xgemm.cc
+++ b/test/correctness/routines/xgemm.cc
@ -1,98 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xgemm routine. It is based on the TestABC class.
 //
 // =================================================================================================
 #include "wrapper_clblas.h"
 #include "correctness/testabc.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
 template <typename T>
 void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates the CLBlast lambda
  auto clblast_lambda = [](const Arguments<T> &args,
                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
                           CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    return Gemm(args.layout, args.a_transpose, args.b_transpose,
                args.m, args.n, args.k,
                args.alpha,
                a_mat(), args.a_offset, args.a_ld,
                b_mat(), args.b_offset, args.b_ld,
                args.beta,
                c_mat(), args.c_offset, args.c_ld,
                &queue_plain, &event);
  };
  // Creates the clBLAS lambda (for comparison)
  auto clblas_lambda = [](const Arguments<T> &args,
                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
                          CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
                              static_cast<clblasTranspose>(args.a_transpose),
                              static_cast<clblasTranspose>(args.b_transpose),
                              args.m, args.n, args.k,
                              args.alpha,
                              a_mat(), args.a_offset, args.a_ld,
                              b_mat(), args.b_offset, args.b_ld,
                              args.beta,
                              c_mat(), args.c_offset, args.c_ld,
                              1, &queue_plain, 0, nullptr, &event);
    return static_cast<StatusCode>(status);
  };
  // Initializes the arguments relevant for this routine
  auto args = Arguments<T>{};
  const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
                                                kArgATransp, kArgBTransp,
                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
                                                kArgAOffset, kArgBOffset, kArgCOffset};
  // Creates a tester
  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) {
    args.layout = layout;
    for (auto &a_transpose: tester.kTransposes) {
      args.a_transpose = a_transpose;
      for (auto &b_transpose: tester.kTransposes) {
        args.b_transpose = b_transpose;
        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
        // Runs the tests
        tester.TestRegular(args, case_name);
        tester.TestInvalidBufferSizes(args, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::XgemmTest<float>(argc, argv, kNotSilent, "SGEMM");
  clblast::XgemmTest<double>(argc, argv, kSilent, "DGEMM");
  clblast::XgemmTest<clblast::float2>(argc, argv, kSilent, "CGEMM");
  clblast::XgemmTest<clblast::double2>(argc, argv, kSilent, "ZGEMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/xgemv.cc
+++ b/test/correctness/routines/xgemv.cc
@ -1,88 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
 //
 // =================================================================================================
 #include "wrapper_clblas.h"
 #include "correctness/testaxy.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
 template <typename T>
 void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates the CLBlast lambda
  auto clblast_lambda = [](const Arguments<T> &args,
                           const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
                           CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
                a_mat(), args.a_offset, args.a_ld,
                x_vec(), args.x_offset, args.x_inc, args.beta,
                y_vec(), args.y_offset, args.y_inc,
                &queue_plain, &event);
  };
  // Creates the clBLAS lambda (for comparison)
  auto clblas_lambda = [](const Arguments<T> &args,
                          const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
                          CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
                              static_cast<clblasTranspose>(args.a_transpose),
                              args.m, args.n, args.alpha,
                              a_mat(), args.a_offset, args.a_ld,
                              x_vec(), args.x_offset, args.x_inc, args.beta,
                              y_vec(), args.y_offset, args.y_inc,
                              1, &queue_plain, 0, nullptr, &event);
    return static_cast<StatusCode>(status);
  };
  // Initializes the arguments relevant for this routine
  auto args = Arguments<T>{};
  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
                                                kArgALeadDim, kArgXInc, kArgYInc,
                                                kArgAOffset, kArgXOffset, kArgYOffset};
  // Creates a tester
  TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) {
    args.layout = layout;
    for (auto &a_transpose: tester.kTransposes) {
      args.a_transpose = a_transpose;
      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
      // Runs the tests
      tester.TestRegular(args, case_name);
      tester.TestInvalidBufferSizes(args, case_name);
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::XgemvTest<float>(argc, argv, kNotSilent, "SGEMV");
  clblast::XgemvTest<double>(argc, argv, kSilent, "DGEMV");
  clblast::XgemvTest<clblast::float2>(argc, argv, kSilent, "CGEMV");
  clblast::XgemvTest<clblast::double2>(argc, argv, kSilent, "ZGEMV");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/routines/xsymm.cc
+++ b/test/correctness/routines/xsymm.cc
@ -1,98 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the tests for the Xsymm routine. It is based on the TestABC class.
 //
 // =================================================================================================
 #include "wrapper_clblas.h"
 #include "correctness/testabc.h"
 namespace clblast {
 // =================================================================================================
 // The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
 template <typename T>
 void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
  // Creates the CLBlast lambda
  auto clblast_lambda = [](const Arguments<T> &args,
                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
                           CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    return Symm(args.layout, args.side, args.triangle,
                args.m, args.n,
                args.alpha,
                a_mat(), args.a_offset, args.a_ld,
                b_mat(), args.b_offset, args.b_ld,
                args.beta,
                c_mat(), args.c_offset, args.c_ld,
                &queue_plain, &event);
  };
  // Creates the clBLAS lambda (for comparison)
  auto clblas_lambda = [](const Arguments<T> &args,
                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
                          CommandQueue &queue) -> StatusCode {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
                              static_cast<clblasSide>(args.side),
                              static_cast<clblasUplo>(args.triangle),
                              args.m, args.n,
                              args.alpha,
                              a_mat(), args.a_offset, args.a_ld,
                              b_mat(), args.b_offset, args.b_ld,
                              args.beta,
                              c_mat(), args.c_offset, args.c_ld,
                              1, &queue_plain, 0, nullptr, &event);
    return static_cast<StatusCode>(status);
  };
  // Initializes the arguments relevant for this routine
  auto args = Arguments<T>{};
  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
                                                kArgSide, kArgTriangle,
                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
                                                kArgAOffset, kArgBOffset, kArgCOffset};
  // Creates a tester
  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
  // Loops over the test-cases from a data-layout point of view
  for (auto &layout: tester.kLayouts) {
    args.layout = layout;
    for (auto &side: {Side::kLeft, Side::kRight}) {
      args.side = side;
      for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
        args.triangle = triangle;
        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
        // Runs the tests
        tester.TestRegular(args, case_name);
        tester.TestInvalidBufferSizes(args, case_name);
      }
    }
  }
 }
 // =================================================================================================
 } // namespace clblast
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // The third argument is the 'silent' flag: off for the first precision, on for the others
  constexpr auto kNotSilent = false;
  constexpr auto kSilent = true;
  clblast::XsymmTest<float>(argc, argv, kNotSilent, "SSYMM");
  clblast::XsymmTest<double>(argc, argv, kSilent, "DSYMM");
  clblast::XsymmTest<clblast::float2>(argc, argv, kSilent, "CSYMM");
  clblast::XsymmTest<clblast::double2>(argc, argv, kSilent, "ZSYMM");
  return 0;
 }
 // =================================================================================================
--- a/test/correctness/testabc.cc
+++ b/test/correctness/testabc.cc
@ -1,217 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the TestABC class (see the header for information about the class).
 //
 // =================================================================================================
 #include <algorithm>
 #include "correctness/testabc.h"
 namespace clblast {
 // =================================================================================================
 // Constructor: forwards the command-line data to the base-class tester, stores the two routines
 // to compare, and generates the host-side input data for the A, B and C matrices.
 template <typename T>
 TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
                    const std::string &name, const std::vector<std::string> &options,
                    const Routine clblast_lambda, const Routine clblas_lambda):
    Tester<T>{argc, argv, silent, name, options},
    clblast_lambda_(clblast_lambda),
    clblas_lambda_(clblas_lambda) {
  // One upper bound on the element count serves all three matrices: the largest tested matrix
  // dimension is used both as dimension and as leading dimension, plus the largest offset. This
  // allows for a single set of input/output data for all test-cases.
  const auto largest_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
  const auto max_elements = largest_dim*largest_dim + largest_offset;
  // Sizes the host-side sources first, then fills them (in A, B, C order)
  a_source_.resize(max_elements);
  b_source_.resize(max_elements);
  c_source_.resize(max_elements);
  PopulateVector(a_source_);
  PopulateVector(b_source_);
  PopulateVector(c_source_);
 }
 // ===============================================================================================
 // Tests the routine for a wide variety of parameters. The options already stored in 'args' by
 // the caller (e.g. layout and transpose settings) are kept; this function overwrites m, n, k,
 // the leading dimensions, the offsets, alpha and beta with every combination of the test
 // settings. For each combination both the clBLAS reference and the CLBlast routine are run and
 // their outputs are compared element by element. Does nothing if PrecisionSupported() is false.
 template <typename T>
 void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("regular behaviour", name);
  // Computes whether or not the matrices are transposed. Note that we assume a default of
  // column-major and no-transpose. If one of them is different (but not both), then rotated
  // is considered true.
  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
  auto c_rotated = (args.layout == Layout::kRowMajor);
  // Iterates over the matrix dimensions
  for (auto &m: kMatrixDims) {
    args.m = m;
    for (auto &n: kMatrixDims) {
      args.n = n;
      for (auto &k: kMatrixDims) {
        args.k = k;
        // Computes the second dimensions of the matrices taking the rotation into account
        auto a_two = (a_rotated) ? m : k;
        auto b_two = (b_rotated) ? k : n;
        auto c_two = (c_rotated) ? m : n;
        // Iterates over the leading-dimension values and the offsets
        for (auto &a_ld: kMatrixDims) {
          args.a_ld = a_ld;
          for (auto &a_offset: kOffsets) {
            args.a_offset = a_offset;
            for (auto &b_ld: kMatrixDims) {
              args.b_ld = b_ld;
              for (auto &b_offset: kOffsets) {
                args.b_offset = b_offset;
                for (auto &c_ld: kMatrixDims) {
                  args.c_ld = c_ld;
                  for (auto &c_offset: kOffsets) {
                    args.c_offset = c_offset;
                    // Computes the buffer sizes
                    auto a_size = a_two * a_ld + a_offset;
                    auto b_size = b_two * b_ld + b_offset;
                    auto c_size = c_two * c_ld + c_offset;
                    if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
                    // Creates the OpenCL buffers. The output matrix exists twice: r_mat is
                    // passed to the clBLAS reference and s_mat to the CLBlast routine.
                    auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
                    auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
                    auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
                    auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
                    // Iterates over the values for alpha and beta
                    for (auto &alpha: kAlphaValues) {
                      args.alpha = alpha;
                      for (auto &beta: kBetaValues) {
                        args.beta = beta;
                        // Runs the reference clBLAS code. The inputs are uploaded again before
                        // every run, so each run starts from the same host-side source data.
                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
                        r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
                        auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
                        // Runs the CLBlast code
                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
                        s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
                        auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
                        // Tests for equality of the two status codes. If either run did not
                        // succeed, only the status codes are compared and the result check
                        // below is skipped for this combination.
                        if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
                          TestErrorCodes(status1, status2, args);
                          continue;
                        }
                        // Downloads the results
                        std::vector<T> r_result(c_size, static_cast<T>(0));
                        std::vector<T> s_result(c_size, static_cast<T>(0));
                        r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
                        s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
                        // Checks for differences in the output. Only the m-by-n elements
                        // addressed through c_ld and c_offset are compared; other positions in
                        // the buffer are not inspected.
                        auto errors = size_t{0};
                        for (auto idm=size_t{0}; idm<m; ++idm) {
                          for (auto idn=size_t{0}; idn<n; ++idn) {
                            auto index = (args.layout == Layout::kRowMajor) ?
                                          idm*args.c_ld + idn + args.c_offset:
                                          idn*args.c_ld + idm + args.c_offset;
                            if (!TestSimilarity(r_result[index], s_result[index])) {
                              errors++;
                            }
                          }
                        }
                        // Tests the error count (should be zero)
                        TestErrorCount(errors, m*n, args);
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Runs both routines on OpenCL buffers that are empty, one element short, or of
 // kBufferSize*kBufferSize elements, in every combination for A, B and C. Only the returned
 // status codes of the two routines are compared; results (if any) are not inspected.
 template <typename T>
 void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("invalid buffer sizes", name);
  // Example problem: all dimensions and leading dimensions are kBufferSize, without offsets
  args.m = kBufferSize;
  args.n = kBufferSize;
  args.k = kBufferSize;
  args.a_ld = kBufferSize;
  args.b_ld = kBufferSize;
  args.c_ld = kBufferSize;
  args.a_offset = 0;
  args.b_offset = 0;
  args.c_offset = 0;
  // The buffer sizes (in elements) to test with
  const auto matrix_elements = kBufferSize*kBufferSize;
  const std::vector<size_t> kBufferSizes = {0, matrix_elements-1, matrix_elements};
  for (auto &a_size: kBufferSizes) {
    for (auto &b_size: kBufferSizes) {
      for (auto &c_size: kBufferSizes) {
        // The raw OpenCL C API is used here instead of the C++ version on purpose: buffer
        // creation is allowed to fail and no error checking is performed on it.
        auto a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T),
                                    nullptr, nullptr);
        auto a_mat = Buffer(a_raw);
        auto b_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T),
                                    nullptr, nullptr);
        auto b_mat = Buffer(b_raw);
        auto r_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T),
                                    nullptr, nullptr);
        auto r_mat = Buffer(r_raw);
        auto s_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T),
                                    nullptr, nullptr);
        auto s_mat = Buffer(s_raw);
        // Reference (clBLAS) first, then CLBlast; both should report the same status code
        const auto reference_status = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
        const auto routine_status = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
        TestErrorCodes(reference_status, routine_status, args);
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Compiles the templated class: the member functions are defined in this source file, so the
 // class is explicitly instantiated here for each of the four supported data-types.
 template class TestABC<float>;
 template class TestABC<double>;
 template class TestABC<float2>;
 template class TestABC<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/testabc.h
+++ b/test/correctness/testabc.h
@ -1,86 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
 // all sorts of input combinations, and one deliberately testing with invalid values.
 //
 // =================================================================================================
 #ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
 #define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
 #include <vector>
 #include <string>
 #include "correctness/tester.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class. The class is parameterised on the
 // data-type T of the matrices and compares two user-supplied routines (CLBlast and clBLAS).
 template <typename T>
 class TestABC: public Tester<T> {
 public:
  // Uses several variables from the Tester class
  using Tester<T>::context_;
  using Tester<T>::queue_;
  using Tester<T>::kLayouts;
  using Tester<T>::kTransposes;
  // Uses several helper functions from the Tester class
  using Tester<T>::TestStart;
  using Tester<T>::TestEnd;
  using Tester<T>::TestSimilarity;
  using Tester<T>::TestErrorCount;
  using Tester<T>::TestErrorCodes;
  using Tester<T>::GetExampleScalars;
  using Tester<T>::GetOffsets;
  using Tester<T>::PrecisionSupported;
  // Test settings for the regular test. Append to this list in case more tests are required.
  // kMatrixDims supplies the values for m, n and k as well as for the three leading dimensions.
  const std::vector<size_t> kMatrixDims = { 7, 64 };
  const std::vector<size_t> kOffsets = GetOffsets();
  const std::vector<T> kAlphaValues = GetExampleScalars();
  const std::vector<T> kBetaValues = GetExampleScalars();
  // Test settings for the invalid test: used for all dimensions and leading dimensions, and to
  // derive the tested buffer sizes
  const size_t kBufferSize = 64;
  // Shorthand for a BLAS routine: takes the arguments, the A, B and C buffers (in that order)
  // and a command queue, and returns the routine's status code
  using Routine = std::function<StatusCode(const Arguments<T>&,
                                           const Buffer&, const Buffer&, const Buffer&,
                                           CommandQueue&)>;
  // Constructor, initializes the base class tester and input data
  TestABC(int argc, char *argv[], const bool silent,
          const std::string &name, const std::vector<std::string> &options,
          const Routine clblast_lambda, const Routine clblas_lambda);
  // The test functions. Both overwrite the dimension, leading-dimension and offset fields of
  // 'args'; 'name' is passed on to TestStart to label the test-case.
  void TestRegular(Arguments<T> &args, const std::string &name);
  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
 private:
  // Source data to test with (host-side, uploaded to the OpenCL buffers before each run)
  std::vector<T> a_source_;
  std::vector<T> b_source_;
  std::vector<T> c_source_;
  // The routines to test: the CLBlast routine and the clBLAS reference it is compared against
  Routine clblast_lambda_;
  Routine clblas_lambda_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_TEST_CORRECTNESS_TESTABC_H_
 #endif
--- a/test/correctness/testaxy.cc
+++ b/test/correctness/testaxy.cc
@ -1,213 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the TestAXY class (see the header for information about the class).
 //
 // =================================================================================================
 #include <algorithm>
 #include "correctness/testaxy.h"
 namespace clblast {
 // =================================================================================================
 // Constructor: forwards the command-line data to the base-class tester, stores the two routines
 // to compare, and generates the host-side input data for the A matrix and the X and Y vectors.
 template <typename T>
 TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
                    const std::string &name, const std::vector<std::string> &options,
                    const Routine clblast_lambda, const Routine clblas_lambda):
    Tester<T>{argc, argv, silent, name, options},
    clblast_lambda_(clblast_lambda),
    clblas_lambda_(clblas_lambda) {
  // Finds the largest value of each test setting. The largest dimension also serves as the
  // largest leading dimension. This allows for a single set of input/output data.
  const auto largest_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
  const auto largest_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
  // Element counts: the matrix is dimension times leading dimension, the vectors are dimension
  // times increment; both include room for the largest offset
  const auto matrix_elements = largest_dim*largest_dim + largest_offset;
  const auto vector_elements = largest_dim*largest_inc + largest_offset;
  // Sizes the host-side sources first, then fills them (in A, X, Y order)
  a_source_.resize(matrix_elements);
  x_source_.resize(vector_elements);
  y_source_.resize(vector_elements);
  PopulateVector(a_source_);
  PopulateVector(x_source_);
  PopulateVector(y_source_);
 }
 // ===============================================================================================
 // Tests the routine for a wide variety of parameters. The options already stored in 'args' by
 // the caller (e.g. layout and transpose settings) are kept; this function overwrites m, n, the
 // leading dimension, the increments, the offsets, alpha and beta with every combination of the
 // test settings. For each combination both the clBLAS reference and the CLBlast routine are run
 // and their output vectors are compared. Does nothing if PrecisionSupported() is false.
 template <typename T>
 void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("regular behaviour", name);
  // Iterates over the dimension for the matrix and vectors
  for (auto &m: kMatrixVectorDims) {
    args.m = m;
    for (auto &n: kMatrixVectorDims) {
      args.n = n;
      // Computes the second dimension of the matrix taking the rotation into account
      auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
      // Computes the vector sizes in case the matrix is transposed. Every option other than
      // 'no transpose' counts as transposed: the conjugate transpose swaps the roles of m and n
      // for the vector lengths in exactly the same way as the regular transpose does.
      auto a_transposed = (args.a_transpose != Transpose::kNo);
      auto m_real = (a_transposed) ? n : m;
      auto n_real = (a_transposed) ? m : n;
      // Iterates over the leading-dimension values and the offsets of the matrix
      for (auto &a_ld: kMatrixVectorDims) {
        args.a_ld = a_ld;
        for (auto &a_offset: kOffsets) {
          args.a_offset = a_offset;
          // Iterates over the increment-values and the offsets of the vectors
          for (auto &x_inc: kIncrements) {
            args.x_inc = x_inc;
            for (auto &x_offset: kOffsets) {
              args.x_offset = x_offset;
              for (auto &y_inc: kIncrements) {
                args.y_inc = y_inc;
                for (auto &y_offset: kOffsets) {
                  args.y_offset = y_offset;
                  // Computes the buffer sizes
                  auto a_size = a_two * a_ld + a_offset;
                  auto x_size = n_real * x_inc + x_offset;
                  auto y_size = m_real * y_inc + y_offset;
                  if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
                  // Creates the OpenCL buffers. The output vector exists twice: r_vec is passed
                  // to the clBLAS reference and s_vec to the CLBlast routine.
                  auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
                  auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
                  auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
                  auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
                  // Iterates over the values for alpha and beta
                  for (auto &alpha: kAlphaValues) {
                    args.alpha = alpha;
                    for (auto &beta: kBetaValues) {
                      args.beta = beta;
                      // Runs the reference clBLAS code. The inputs are uploaded again before
                      // every run, so each run starts from the same host-side source data.
                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
                      r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
                      auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
                      // Runs the CLBlast code
                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
                      s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
                      auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
                      // Tests for equality of the two status codes. If either run did not
                      // succeed, only the status codes are compared and the result check below
                      // is skipped for this combination.
                      if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
                        TestErrorCodes(status1, status2, args);
                        continue;
                      }
                      // Downloads the results
                      std::vector<T> r_result(y_size, static_cast<T>(0));
                      std::vector<T> s_result(y_size, static_cast<T>(0));
                      r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
                      s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
                      // Checks for differences in the output. Only the m_real elements addressed
                      // through y_inc and y_offset are compared.
                      auto errors = size_t{0};
                      for (auto idm=size_t{0}; idm<m_real; ++idm) {
                        auto index = idm*y_inc + y_offset;
                        if (!TestSimilarity(r_result[index], s_result[index])) {
                          errors++;
                        }
                      }
                      // Tests the error count (should be zero)
                      TestErrorCount(errors, m_real, args);
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Runs both routines on OpenCL buffers that are empty, one element short, or of the size derived
 // from kBufferSize, combined with the increments in kInvalidIncrements. Only the returned status
 // codes of the two routines are compared; results (if any) are not inspected.
 template <typename T>
 void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("invalid buffer sizes", name);
  // Example problem: both dimensions and the leading dimension are kBufferSize, without offsets
  args.m = kBufferSize;
  args.n = kBufferSize;
  args.a_ld = kBufferSize;
  args.a_offset = 0;
  args.x_offset = 0;
  args.y_offset = 0;
  // The buffer sizes (in elements) to test with, for the matrix and for the vectors
  const auto matrix_elements = kBufferSize*kBufferSize;
  const std::vector<size_t> kMatrixSizes = {0, matrix_elements-1, matrix_elements};
  const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};
  for (auto &a_size: kMatrixSizes) {
    for (auto &x_size: kVectorSizes) {
      for (auto &y_size: kVectorSizes) {
        // Combines every buffer-size triple with every pair of test increments
        for (auto &x_inc: kInvalidIncrements) {
          args.x_inc = x_inc;
          for (auto &y_inc: kInvalidIncrements) {
            args.y_inc = y_inc;
            // The raw OpenCL C API is used here instead of the C++ version on purpose: buffer
            // creation is allowed to fail and no error checking is performed on it.
            auto a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T),
                                        nullptr, nullptr);
            auto a_mat = Buffer(a_raw);
            auto x_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T),
                                        nullptr, nullptr);
            auto x_vec = Buffer(x_raw);
            auto r_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T),
                                        nullptr, nullptr);
            auto r_vec = Buffer(r_raw);
            auto s_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T),
                                        nullptr, nullptr);
            auto s_vec = Buffer(s_raw);
            // Reference (clBLAS) first, then CLBlast; both should report the same status code
            const auto reference_status = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
            const auto routine_status = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
            TestErrorCodes(reference_status, routine_status, args);
          }
        }
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Compiles the templated class: the member functions are defined in this source file, so the
 // class is explicitly instantiated here for each of the four supported data-types.
 template class TestAXY<float>;
 template class TestAXY<double>;
 template class TestAXY<float2>;
 template class TestAXY<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/testaxy.h
+++ b/test/correctness/testaxy.h
@ -1,88 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
 // all sorts of input combinations, and one deliberately testing with invalid values.
 //
 // =================================================================================================
 #ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
 #define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
 #include <vector>
 #include <string>
 #include "correctness/tester.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class. The class is parameterised on the
 // data-type T of the buffers and compares two user-supplied routines (CLBlast and clBLAS).
 template <typename T>
 class TestAXY: public Tester<T> {
 public:
  // Uses several variables from the Tester class
  using Tester<T>::context_;
  using Tester<T>::queue_;
  using Tester<T>::kLayouts;
  using Tester<T>::kTransposes;
  // Uses several helper functions from the Tester class
  using Tester<T>::TestStart;
  using Tester<T>::TestEnd;
  using Tester<T>::TestSimilarity;
  using Tester<T>::TestErrorCount;
  using Tester<T>::TestErrorCodes;
  using Tester<T>::GetExampleScalars;
  using Tester<T>::GetOffsets;
  using Tester<T>::PrecisionSupported;
  // Test settings for the regular test. Append to this list in case more tests are required.
  // kMatrixVectorDims supplies the values for m and n as well as for the leading dimension of A.
  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
  const std::vector<size_t> kOffsets = GetOffsets();
  const std::vector<size_t> kIncrements = { 1, 2 };
  const std::vector<T> kAlphaValues = GetExampleScalars();
  const std::vector<T> kBetaValues = GetExampleScalars();
  // Test settings for the invalid test: the increments to try (including zero) and the value
  // used for the dimensions, the leading dimension and to derive the tested buffer sizes
  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
  const size_t kBufferSize = 64;
  // Shorthand for a BLAS routine: takes the arguments, the A, X and Y buffers (in that order)
  // and a command queue, and returns the routine's status code
  using Routine = std::function<StatusCode(const Arguments<T>&,
                                           const Buffer&, const Buffer&, const Buffer&,
                                           CommandQueue&)>;
  // Constructor, initializes the base class tester and input data
  TestAXY(int argc, char *argv[], const bool silent,
          const std::string &name, const std::vector<std::string> &options,
          const Routine clblast_lambda, const Routine clblas_lambda);
  // The test functions. Both overwrite the dimension, leading-dimension, increment and offset
  // fields of 'args'; 'name' is passed on to TestStart to label the test-case.
  void TestRegular(Arguments<T> &args, const std::string &name);
  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
 private:
  // Source data to test with (host-side, uploaded to the OpenCL buffers before each run)
  std::vector<T> a_source_;
  std::vector<T> x_source_;
  std::vector<T> y_source_;
  // The routines to test: the CLBlast routine and the clBLAS reference it is compared against
  Routine clblast_lambda_;
  Routine clblas_lambda_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
 #endif
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@ -0,0 +1,189 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the TestBlas class (see the header for information about the class).
 //
 // =================================================================================================
 #include <algorithm>
 #include "correctness/testblas.h"
 namespace clblast {
 // =================================================================================================
 // The transpose-options to test with, specialised per combination of buffer data-type and
 // scalar data-type: matching real types test {no, yes}, matching complex types additionally test
 // the conjugate transpose, and complex buffers with real scalars test {no, conjugate} only.
 template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {
   Transpose::kNo, Transpose::kYes
 };
 template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {
   Transpose::kNo, Transpose::kYes
 };
 template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {
   Transpose::kNo, Transpose::kYes, Transpose::kConjugate
 };
 template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {
   Transpose::kNo, Transpose::kYes, Transpose::kConjugate
 };
 template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {
   Transpose::kNo, Transpose::kConjugate
 };
 template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {
   Transpose::kNo, Transpose::kConjugate
 };
 // =================================================================================================
 // Constructor: forwards the command-line data to the base-class tester, stores the callbacks
 // that run the routines and interpret their results, and generates the host-side input data
 // for the X and Y vectors and the A, B and C matrices.
 template <typename T, typename U>
 TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
                        const std::string &name, const std::vector<std::string> &options,
                        const Routine run_routine, const Routine run_reference,
                        const ResultGet get_result, const ResultIndex get_index,
                        const ResultIterator get_id1, const ResultIterator get_id2):
    Tester<T,U>{argc, argv, silent, name, options},
    run_routine_(run_routine),
    run_reference_(run_reference),
    get_result_(get_result),
    get_index_(get_index),
    get_id1_(get_id1),
    get_id2_(get_id2) {
  // Helper to find the largest entry of one of the test-setting lists
  const auto largest = [](const std::vector<size_t> &values) {
    return *std::max_element(values.begin(), values.end());
  };
  // Computes the maximum sizes. This allows for a single set of input/output data. Vectors and
  // matrices both have to cover the matrix-vector dimensions as well; the largest matrix
  // dimension also serves as the largest leading dimension.
  const auto vector_dim = std::max(largest(kVectorDims), largest(kMatrixVectorDims));
  const auto matrix_dim = std::max(largest(kMatrixDims), largest(kMatrixVectorDims));
  const auto largest_offset = largest(kOffsets);
  const auto vector_elements = vector_dim*largest(kIncrements) + largest_offset;
  const auto matrix_elements = matrix_dim*matrix_dim + largest_offset;
  // Sizes the host-side sources first, then fills them (in X, Y, A, B, C order)
  x_source_.resize(vector_elements);
  y_source_.resize(vector_elements);
  a_source_.resize(matrix_elements);
  b_source_.resize(matrix_elements);
  c_source_.resize(matrix_elements);
  PopulateVector(x_source_);
  PopulateVector(y_source_);
  PopulateVector(a_source_);
  PopulateVector(b_source_);
  PopulateVector(c_source_);
 }
 // ===============================================================================================
 // Runs the reference and the routine under test for every entry of 'test_vector', each on its
 // own freshly created and filled set of buffers, and compares the outputs element by element.
 // Does nothing if the device does not support the precision of T.
 template <typename T, typename U>
 void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
  if (!PrecisionSupported<T>(device_)) { return; }
  TestStart("regular behaviour", name);
  // Iterates over all the to-be-tested combinations of arguments
  for (auto &args: test_vector) {
    // The sizes in bytes of the five buffers for this combination of arguments
    const auto x_bytes = args.x_size*sizeof(T);
    const auto y_bytes = args.y_size*sizeof(T);
    const auto a_bytes = args.a_size*sizeof(T);
    const auto b_bytes = args.b_size*sizeof(T);
    const auto c_bytes = args.c_size*sizeof(T);
    // Reference (clBLAS) run: creates the buffers, uploads the source data, runs the code
    auto ref_x = Buffer(context_, CL_MEM_READ_WRITE, x_bytes);
    auto ref_y = Buffer(context_, CL_MEM_READ_WRITE, y_bytes);
    auto ref_a = Buffer(context_, CL_MEM_READ_WRITE, a_bytes);
    auto ref_b = Buffer(context_, CL_MEM_READ_WRITE, b_bytes);
    auto ref_c = Buffer(context_, CL_MEM_READ_WRITE, c_bytes);
    ref_x.WriteBuffer(queue_, x_bytes, x_source_);
    ref_y.WriteBuffer(queue_, y_bytes, y_source_);
    ref_a.WriteBuffer(queue_, a_bytes, a_source_);
    ref_b.WriteBuffer(queue_, b_bytes, b_source_);
    ref_c.WriteBuffer(queue_, c_bytes, c_source_);
    auto ref_buffers = Buffers{ref_x, ref_y, ref_a, ref_b, ref_c};
    auto ref_status = run_reference_(args, ref_buffers, queue_);
    // CLBlast run: same steps on a second, independent set of buffers
    auto out_x = Buffer(context_, CL_MEM_READ_WRITE, x_bytes);
    auto out_y = Buffer(context_, CL_MEM_READ_WRITE, y_bytes);
    auto out_a = Buffer(context_, CL_MEM_READ_WRITE, a_bytes);
    auto out_b = Buffer(context_, CL_MEM_READ_WRITE, b_bytes);
    auto out_c = Buffer(context_, CL_MEM_READ_WRITE, c_bytes);
    out_x.WriteBuffer(queue_, x_bytes, x_source_);
    out_y.WriteBuffer(queue_, y_bytes, y_source_);
    out_a.WriteBuffer(queue_, a_bytes, a_source_);
    out_b.WriteBuffer(queue_, b_bytes, b_source_);
    out_c.WriteBuffer(queue_, c_bytes, c_source_);
    auto out_buffers = Buffers{out_x, out_y, out_a, out_b, out_c};
    auto out_status = run_routine_(args, out_buffers, queue_);
    // If either run did not succeed, only the two status codes are compared and the result check
    // is skipped for this combination
    if (ref_status != StatusCode::kSuccess || out_status != StatusCode::kSuccess) {
      TestErrorCodes(ref_status, out_status, args);
      continue;
    }
    // Downloads the results
    auto ref_result = get_result_(args, ref_buffers, queue_);
    auto out_result = get_result_(args, out_buffers, queue_);
    // Counts the mismatches over the two-dimensional index range given by the callbacks
    auto mismatches = size_t{0};
    for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
      for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
        auto position = get_index_(args, id1, id2);
        if (TestSimilarity(ref_result[position], out_result[position])) { continue; }
        ++mismatches;
      }
    }
    // Tests the error count (should be zero)
    TestErrorCount(mismatches, get_id1_(args)*get_id2_(args), args);
  }
  TestEnd();
 }
 // =================================================================================================
 // Runs the reference and the routine under test for every entry of 'test_vector' on buffers of
 // the sizes stored in that entry, which may be invalid. Only the returned status codes of the
 // two routines are compared; results (if any) are not inspected.
 template <typename T, typename U>
 void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
  if (!PrecisionSupported<T>(device_)) { return; }
  TestStart("invalid buffer sizes", name);
  // Iterates over all the to-be-tested combinations of arguments
  for (auto &args: test_vector) {
    // The sizes in bytes of the five buffers for this combination of arguments
    const auto x_bytes = args.x_size*sizeof(T);
    const auto y_bytes = args.y_size*sizeof(T);
    const auto a_bytes = args.a_size*sizeof(T);
    const auto b_bytes = args.b_size*sizeof(T);
    const auto c_bytes = args.c_size*sizeof(T);
    // The raw OpenCL C API is used here instead of the C++ version on purpose: buffer creation
    // is allowed to fail and no error checking is performed on it. First set: for the reference.
    auto ref_x_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_bytes, nullptr, nullptr);
    auto ref_y_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_bytes, nullptr, nullptr);
    auto ref_a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_bytes, nullptr, nullptr);
    auto ref_b_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_bytes, nullptr, nullptr);
    auto ref_c_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_bytes, nullptr, nullptr);
    auto ref_x = Buffer(ref_x_raw);
    auto ref_y = Buffer(ref_y_raw);
    auto ref_a = Buffer(ref_a_raw);
    auto ref_b = Buffer(ref_b_raw);
    auto ref_c = Buffer(ref_c_raw);
    // Second set: for the CLBlast routine
    auto out_x_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_bytes, nullptr, nullptr);
    auto out_y_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_bytes, nullptr, nullptr);
    auto out_a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_bytes, nullptr, nullptr);
    auto out_b_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_bytes, nullptr, nullptr);
    auto out_c_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_bytes, nullptr, nullptr);
    auto out_x = Buffer(out_x_raw);
    auto out_y = Buffer(out_y_raw);
    auto out_a = Buffer(out_a_raw);
    auto out_b = Buffer(out_b_raw);
    auto out_c = Buffer(out_c_raw);
    // Reference first, then the routine under test; both should report the same status code
    auto ref_status = run_reference_(args, Buffers{ref_x, ref_y, ref_a, ref_b, ref_c}, queue_);
    auto out_status = run_routine_(args, Buffers{out_x, out_y, out_a, out_b, out_c}, queue_);
    TestErrorCodes(ref_status, out_status, args);
  }
  TestEnd();
 }
 // =================================================================================================
 // Compiles the templated class: the member functions are defined in this source file, so the
 // class is explicitly instantiated here. The first template argument is the data-type of the
 // buffers, the second the data-type of the scalars; the last two lines combine complex buffers
 // with real-valued scalars.
 template class TestBlas<float, float>;
 template class TestBlas<double, double>;
 template class TestBlas<float2, float2>;
 template class TestBlas<double2, double2>;
 template class TestBlas<float2, float>;
 template class TestBlas<double2, double>;
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@ -0,0 +1,106 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
 // input combinations, and one deliberately testing with invalid values.
 // Typename T: the data-type of the routine's memory buffers (==precision)
 // Typename U: the data-type of the alpha and beta arguments
 //
 // =================================================================================================
 #ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
 #define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
 #include <vector>
 #include <string>
 #include "correctness/tester.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Generic correctness tester for a CLBlast routine. T is the data-type of the routine's memory
 // buffers and U the data-type of its alpha and beta arguments. The routine-specific behaviour is
 // supplied through the std::function objects passed to the constructor.
 template <typename T, typename U>
 class TestBlas: public Tester<T,U> {
 public:
  // Uses several variables from the Tester class
  using Tester<T,U>::context_;
  using Tester<T,U>::queue_;
  using Tester<T,U>::full_test_;
  using Tester<T,U>::device_;
  // Uses several helper functions from the Tester class
  using Tester<T,U>::TestStart;
  using Tester<T,U>::TestEnd;
  using Tester<T,U>::TestErrorCount;
  using Tester<T,U>::TestErrorCodes;
  using Tester<T,U>::GetOffsets;
  // Test settings for the regular test. Append to these lists in case more tests are required.
  // The offsets and the alpha/beta values depend on 'full_test_' of the Tester base class.
  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
  const std::vector<size_t> kIncrements = { 1, 2, 7 };
  const std::vector<size_t> kMatrixDims = { 7, 64 };
  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
  const std::vector<size_t> kOffsets = GetOffsets();
  const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
  const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
  // Test settings for the invalid tests. The size lists hold an empty size, a size that is one
  // element short of kBufferSize (squared for matrices), and the exact size.
  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
  const size_t kBufferSize = 64;
  const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
  const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
  // The layout/transpose/triangle/side/diagonal options to test with
  const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
  const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
  const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
  const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
  static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
  // Shorthand for the routine-specific functions passed to the tester:
  //  - Routine: runs a routine on the given arguments, buffers and queue; returns its status code
  //  - ResultGet: returns a vector of T for the given arguments, buffers and queue
  //  - ResultIndex: returns a size_t from the arguments and two size_t values (presumably the
  //    position of a result element in that vector; confirm against the .cc-file)
  //  - ResultIterator: returns a size_t from the arguments (presumably a loop bound for one of
  //    the two values given to ResultIndex; confirm against the .cc-file)
  using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
  using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers&, CommandQueue&)>;
  using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
  using ResultIterator = std::function<size_t(const Arguments<U>&)>;
  // Constructor, initializes the base class tester and input data
  TestBlas(int argc, char *argv[], const bool silent,
           const std::string &name, const std::vector<std::string> &options,
           const Routine run_routine, const Routine run_reference, const ResultGet get_result,
           const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
  // The test functions. Both take a list of argument-sets to test plus a name for the test; the
  // list is passed by non-const reference.
  void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
  void TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name);
 private:
  // Source data to test with
  std::vector<T> x_source_;
  std::vector<T> y_source_;
  std::vector<T> a_source_;
  std::vector<T> b_source_;
  std::vector<T> c_source_;
  // The routine-specific functions passed to the tester
  Routine run_routine_;
  Routine run_reference_;
  ResultGet get_result_;
  ResultIndex get_index_;
  ResultIterator get_id1_;
  ResultIterator get_id2_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
 #endif
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@ -21,21 +21,11 @@
 namespace clblast {
 // =================================================================================================
 // The layouts and transpose-options to test with (data-type dependent)
 // Both layouts are tested for every data-type
 template <typename T>
 const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
 // Real data-types test plain and transposed; complex data-types additionally test the conjugate
 template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
 template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
 // =================================================================================================
 // General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
 // the clBLAS library for reference.
-template <typename T>
+template <typename T, typename U>
-Tester<T>::Tester(int argc, char *argv[], const bool silent,
+Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options):
+                    const std::string &name, const std::vector<std::string> &options):
    help_("Options given/available:\n"),
    platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
    device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
@ -61,7 +51,7 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
          kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
  // Checks whether the precision is supported
-  if (!PrecisionSupported()) {
+  if (!PrecisionSupported<T>(device_)) {
    fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
            kPrintWarning.c_str(), kPrintEnd.c_str());
    return;
@ -86,9 +76,9 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
 }
 // Destructor prints the summary of the test cases and cleans-up the clBLAS library
-template <typename T>
+template <typename T, typename U>
-Tester<T>::~Tester() {
+Tester<T,U>::~Tester() {
-  if (PrecisionSupported()) {
+  if (PrecisionSupported<T>(device_)) {
    fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
    fprintf(stdout, "   %lu test(s) passed\n", tests_passed_);
    if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
@ -104,8 +94,8 @@ Tester<T>::~Tester() {
 // Function called at the start of each test. This prints a header with information about the
 // test and re-initializes all test data-structures.
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
+void Tester<T,U>::TestStart(const std::string &test_name, const std::string &test_configuration) {
  // Prints the header
  fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
@ -123,8 +113,8 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
 // Function called at the end of each test. This prints errors if any occurred. It also prints a
 // summary of the number of sub-tests passed/failed.
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::TestEnd() {
+void Tester<T,U>::TestEnd() {
  fprintf(stdout, "\n");
  tests_passed_ += num_passed_;
  tests_failed_ += num_skipped_;
@ -147,6 +137,7 @@ void Tester<T>::TestEnd() {
      if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
      if (o == kArgSide)     { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
      if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
      if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
      if (o == kArgXInc)     { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
      if (o == kArgYInc)     { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
      if (o == kArgXOffset)  { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
@ -181,45 +172,9 @@ void Tester<T>::TestEnd() {
 // =================================================================================================
 // Compares two floating point values and returns whether they are within an acceptable error
 // margin. This replaces GTest's EXPECT_NEAR().
 template <typename T>
 bool Tester<T>::TestSimilarity(const T val1, const T val2) {
  // Identical values always match (this also covers equal infinities)
  if (val1 == val2) { return true; }
  // Distance between the values and the margins, all expressed in the data-type under test
  const auto distance = std::fabs(val1 - val2);
  const auto absolute_margin = static_cast<T>(kErrorMarginAbsolute);
  const auto relative_margin = static_cast<T>(kErrorMarginRelative);
  // A zero or a tiny distance: the relative error is less meaningful, use the absolute margin
  if (val1 == 0 || val2 == 0 || distance < absolute_margin) {
    return (distance < absolute_margin);
  }
  // Everything else: the distance relative to the summed magnitudes of both values
  const auto magnitude = std::fabs(val1) + std::fabs(val2);
  return (distance / magnitude) < relative_margin;
 }
 // Specialisations for complex data-types
 // Single-precision complex: both the real and the imaginary part have to be similar
 template <>
 bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
  if (!Tester<float>::TestSimilarity(val1.real(), val2.real())) { return false; }
  return Tester<float>::TestSimilarity(val1.imag(), val2.imag());
 }
 // Double-precision complex: both the real and the imaginary part have to be similar
 template <>
 bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
  if (!Tester<double>::TestSimilarity(val1.real(), val2.real())) { return false; }
  return Tester<double>::TestSimilarity(val1.imag(), val2.imag());
 }
 // =================================================================================================
 // Handles a 'pass' or 'error' depending on whether there are any errors
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
+void Tester<T,U>::TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args) {
  // Finished successfully
  if (errors == 0) {
@ -237,9 +192,9 @@ void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arg
 // Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
 // (CLBlast reported a compilation error), or an error (they are different).
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
+void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
-                            const Arguments<T> &args) {
+                                 const Arguments<U> &args) {
  // Finished successfully
  if (clblas_status == clblast_status) {
@ -270,62 +225,26 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
 // =================================================================================================
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 // Single-precision real: the smoke test uses one value, the full test adds zero and one
 template <>
 const std::vector<float> Tester<float>::GetExampleScalars() {
  if (!full_test_) { return {3.14f}; }
  return {0.0f, 1.0f, 3.14f};
 }
 // Double-precision real: same values as the single-precision case
 template <>
 const std::vector<double> Tester<double>::GetExampleScalars() {
  if (!full_test_) { return {3.14}; }
  return {0.0, 1.0, 3.14};
 }
 // Single-precision complex: the smoke test uses one value, the full test uses three
 template <>
 const std::vector<float2> Tester<float2>::GetExampleScalars() {
  if (!full_test_) { return {{2.42f, 3.14f}}; }
  return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}};
 }
 // Double-precision complex: same values as the single-precision complex case
 template <>
 const std::vector<double2> Tester<double2>::GetExampleScalars() {
  if (!full_test_) { return {{2.42, 3.14}}; }
  return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}};
 }
 // Retrieves the offset values to test with
-template <typename T>
+template <typename T, typename U>
-const std::vector<size_t> Tester<T>::GetOffsets() {
+const std::vector<size_t> Tester<T,U>::GetOffsets() const {
  // The full test-suite additionally tests a non-zero offset; the smoke test uses offset zero only
  if (full_test_) { return {0, 10}; }
  else { return {0}; }
 }
 // =================================================================================================
 // Single precision (real and complex) is always supported
 template <> bool Tester<float>::PrecisionSupported() const { return true; }
 template <> bool Tester<float2>::PrecisionSupported() const { return true; }
 // Double precision (real and complex) requires the double-precision extension on the device
 template <> bool Tester<double>::PrecisionSupported() const {
  const auto supported_extensions = device_.Extensions();
  const auto position = supported_extensions.find(kKhronosDoublePrecision);
  return position != std::string::npos;
 }
 template <> bool Tester<double2>::PrecisionSupported() const {
  const auto supported_extensions = device_.Extensions();
  const auto position = supported_extensions.find(kKhronosDoublePrecision);
  return position != std::string::npos;
 }
 // =================================================================================================
 // A test can either pass, be skipped, or fail
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::ReportPass() {
+void Tester<T,U>::ReportPass() {
  // Counts one more passed sub-test
  num_passed_++;
 }
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::ReportSkipped() {
+void Tester<T,U>::ReportSkipped() {
  // Counts one more skipped sub-test
  num_skipped_++;
 }
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
+void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
  // Stores the details of the failure in the error log and counts one more failed sub-test
  error_log_.push_back(error_log_entry);
  num_failed_++;
 }
@ -334,8 +253,8 @@ void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
 // Prints the test-result symbol to screen. This function limits the maximum number of symbols per
 // line by printing newlines once every so many calls.
-template <typename T>
+template <typename T, typename U>
-void Tester<T>::PrintTestResult(const std::string &message) {
+void Tester<T,U>::PrintTestResult(const std::string &message) {
  if (print_count_ == kResultsPerLine) {
    print_count_ = 0;
    fprintf(stdout, "\n   ");
@ -345,13 +264,98 @@ void Tester<T>::PrintTestResult(const std::string &message) {
  print_count_++;
 }
 // =================================================================================================
 // Below are the non-member functions (separated because of otherwise required partial class
 // template specialization)
 // =================================================================================================
 // Compares two floating point values and returns whether they are within an acceptable error
 // margin. This replaces GTest's EXPECT_NEAR().
 template <typename T>
 bool TestSimilarity(const T val1, const T val2) {
  // Allowed error margins for floating-point comparisons, converted to the data-type under test
  constexpr auto kErrorMarginRelative = 1.0e-2;
  constexpr auto kErrorMarginAbsolute = 1.0e-10;
  const auto relative_margin = static_cast<T>(kErrorMarginRelative);
  const auto absolute_margin = static_cast<T>(kErrorMarginAbsolute);
  // Identical values always match (this also covers equal infinities)
  if (val1 == val2) { return true; }
  // A zero or a tiny distance: the relative error is less meaningful, use the absolute margin
  const auto distance = std::fabs(val1 - val2);
  if (val1 == 0 || val2 == 0 || distance < absolute_margin) {
    return (distance < absolute_margin);
  }
  // Everything else: the distance relative to the summed magnitudes of both values
  const auto magnitude = std::fabs(val1) + std::fabs(val2);
  return (distance / magnitude) < relative_margin;
 }
 // Compiles the default case for non-complex data-types. The template is defined in this file
 // only, so these explicit instantiations provide the float and double versions to other files.
 template bool TestSimilarity<float>(const float, const float);
 template bool TestSimilarity<double>(const double, const double);
 // Specialisations for complex data-types
 // Single-precision complex: both the real and the imaginary part have to be similar
 template <>
 bool TestSimilarity(const float2 val1, const float2 val2) {
  if (!TestSimilarity(val1.real(), val2.real())) { return false; }
  return TestSimilarity(val1.imag(), val2.imag());
 }
 // Double-precision complex: both the real and the imaginary part have to be similar
 template <>
 bool TestSimilarity(const double2 val1, const double2 val2) {
  if (!TestSimilarity(val1.real(), val2.real())) { return false; }
  return TestSimilarity(val1.imag(), val2.imag());
 }
 // =================================================================================================
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 // Single-precision real: the smoke test uses one value, the full test adds zero and one
 template <> const std::vector<float> GetExampleScalars(const bool full_test) {
  if (!full_test) { return {3.14f}; }
  return {0.0f, 1.0f, 3.14f};
 }
 // Double-precision real: same values as the single-precision case
 template <> const std::vector<double> GetExampleScalars(const bool full_test) {
  if (!full_test) { return {3.14}; }
  return {0.0, 1.0, 3.14};
 }
 // Single-precision complex: the smoke test uses one value, the full test uses three
 template <> const std::vector<float2> GetExampleScalars(const bool full_test) {
  if (!full_test) { return {{2.42f, 3.14f}}; }
  return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}};
 }
 // Double-precision complex: same values as the single-precision complex case
 template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
  if (!full_test) { return {{2.42, 3.14}}; }
  return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}};
 }
 // =================================================================================================
 // Returns false if this precision is not supported by the device
 // Single precision (real and complex) is always supported, the device is not inspected
 template <> bool PrecisionSupported<float>(const Device &) { return true; }
 template <> bool PrecisionSupported<float2>(const Device &) { return true; }
 // Double precision (real and complex) requires the double-precision extension on the device
 template <> bool PrecisionSupported<double>(const Device &device) {
  const auto supported_extensions = device.Extensions();
  const auto position = supported_extensions.find(kKhronosDoublePrecision);
  return position != std::string::npos;
 }
 template <> bool PrecisionSupported<double2>(const Device &device) {
  const auto supported_extensions = device.Extensions();
  const auto position = supported_extensions.find(kKhronosDoublePrecision);
  return position != std::string::npos;
 }
 // =================================================================================================
 // Compiles the templated class
-template class Tester<float>;
+template class Tester<float, float>;
-template class Tester<double>;
+template class Tester<double, double>;
-template class Tester<float2>;
+template class Tester<float2, float2>;
-template class Tester<double2>;
+template class Tester<double2, double2>;
 // Complex memory buffers (T) combined with real-valued alpha and beta arguments (U)
 template class Tester<float2, float>;
 template class Tester<double2, double>;
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.h
@ -10,6 +10,8 @@
 // This file implements the Tester class, providing a test-framework. GTest was used before, but
 // was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
 // custom replacement.
 // Typename T: the data-type of the routine's memory buffers (==precision)
 // Typename U: the data-type of the alpha and beta arguments
 //
 // =================================================================================================
@ -30,7 +32,7 @@ namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
-template <typename T>
+template <typename T, typename U>
 class Tester {
 public:
@ -43,10 +45,6 @@ class Tester {
  // Error percentage is not applicable: error was caused by an incorrect status
  static constexpr auto kStatusError = -1.0f;
  // Set the allowed error margin for floating-point comparisons
  static constexpr auto kErrorMarginRelative = 1.0e-2;
  static constexpr auto kErrorMarginAbsolute = 1.0e-10;
  // Constants holding start and end strings for terminal-output in colour
  const std::string kPrintError{"\x1b[31m"};
  const std::string kPrintSuccess{"\x1b[32m"};
@ -62,16 +60,12 @@ class Tester {
  const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
  const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
  // The layouts and transpose-options to test with
  static const std::vector<Layout> kLayouts;
  static const std::vector<Transpose> kTransposes;
  // This structure combines the above log-entry with a status code and an error percentage
  struct ErrorLogEntry {
    StatusCode status_expect;
    StatusCode status_found;
    float error_percentage;
-    Arguments<T> args;
+    Arguments<U> args;
  };
  // Creates an instance of the tester, running on a particular OpenCL platform and device. It
@ -84,25 +78,13 @@ class Tester {
  void TestStart(const std::string &test_name, const std::string &test_configuration);
  void TestEnd();
  // Compares two floating point values for similarity. Allows for a certain relative error margin.
  static bool TestSimilarity(const T val1, const T val2);
  // Tests either an error count (should be zero) or two error codes (must match)
-  void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
+  void TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args);
  void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
-                      const Arguments<T> &args);
+                      const Arguments<U> &args);
 protected:
  // Retrieves a list of example scalars of the right type
  const std::vector<T> GetExampleScalars();
  // Retrieves a list of offset values to test
  const std::vector<size_t> GetOffsets();
  // Returns false if this precision is not supported by the device
  bool PrecisionSupported() const;
  // The help-message
  std::string help_;
@ -112,6 +94,12 @@ class Tester {
  Context context_;
  CommandQueue queue_;
  // Whether or not to run the full test-suite or just a smoke test
  bool full_test_;
  // Retrieves the offset values to test with
  const std::vector<size_t> GetOffsets() const;
 private:
  // Internal methods to report a passed, skipped, or failed test
@ -122,9 +110,6 @@ class Tester {
  // Prints the error or success symbol to screen
  void PrintTestResult(const std::string &message);
  // Whether or not to run the full test-suite or just a smoke test
  bool full_test_;
  // Logging and counting occurrences of errors
  std::vector<ErrorLogEntry> error_log_;
  size_t num_passed_;
@ -143,6 +128,25 @@ class Tester {
  std::vector<std::string> options_;
 };
 // =================================================================================================
 // Below are the non-member functions (separated because of otherwise required partial class
 // template specialization)
 // =================================================================================================
 // Compares two floating point values and returns whether they are within an acceptable error
 // margin. This replaces GTest's EXPECT_NEAR(). The definition in the .cc-file uses a relative
 // margin of 1.0e-2 and an absolute margin of 1.0e-10; the complex specialisations compare the
 // real and the imaginary parts separately.
 template <typename T>
 bool TestSimilarity(const T val1, const T val2);
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types. With 'full_test' set the
 // specialisations return several values, otherwise a single one.
 template <typename T>
 const std::vector<T> GetExampleScalars(const bool full_test);
 // Returns false if this precision is not supported by the device. The specialisations always
 // return true for single precision and look for the double-precision extension otherwise.
 template <typename T>
 bool PrecisionSupported(const Device &device);
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/testxy.cc
+++ b/test/correctness/testxy.cc
@ -1,176 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the TestXY class (see the header for information about the class).
 //
 // =================================================================================================
 #include <algorithm>
 #include "correctness/testxy.h"
 namespace clblast {
 // =================================================================================================
 // Constructor, initializes the base class tester and input data
 template <typename T>
 TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
                  const std::string &name, const std::vector<std::string> &options,
                  const Routine clblast_lambda, const Routine clblas_lambda):
    Tester<T>{argc, argv, silent, name, options},
    clblast_lambda_(clblast_lambda),
    clblas_lambda_(clblas_lambda) {
  // Finds the largest dimension, increment and offset out of all the test settings
  const auto largest_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
  const auto largest_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
  // A single pair of source vectors of this size is large enough for every test configuration
  const auto source_size = largest_dim*largest_inc + largest_offset;
  x_source_.resize(source_size);
  y_source_.resize(source_size);
  // Fills the source vectors with test input data
  PopulateVector(x_source_);
  PopulateVector(y_source_);
 }
 // ===============================================================================================
 // Tests the routine for a wide variety of parameters
 // Runs both the reference (clBLAS) and the CLBlast lambda for every combination of the vector
 // dimensions, increments, offsets and alpha values, and compares the status codes and the
 // resulting y-vectors. The fields n, x_inc, x_offset, y_inc, y_offset and alpha of 'args' are
 // overwritten; 'name' is forwarded to TestStart. Does nothing if the precision is unsupported.
 template <typename T>
 void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("regular behaviour", name);
  // Iterates over the vector dimension
  for (auto &n: kVectorDims) {
    args.n = n;
    // Iterates over the increment-values and the offsets
    for (auto &x_inc: kIncrements) {
      args.x_inc = x_inc;
      for (auto &x_offset: kOffsets) {
        args.x_offset = x_offset;
        for (auto &y_inc: kIncrements) {
          args.y_inc = y_inc;
          for (auto &y_offset: kOffsets) {
            args.y_offset = y_offset;
            // Computes the buffer sizes (in elements) and skips configurations with empty buffers
            auto x_size = n * x_inc + x_offset;
            auto y_size = n * y_inc + y_offset;
            if (x_size < 1 || y_size < 1) { continue; }
            // Creates the OpenCL buffers: the input x, plus one y-buffer for the reference
            // routine (r_vec) and one y-buffer for the CLBlast routine (s_vec)
            auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
            auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
            auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
            // Iterates over the values for alpha
            for (auto &alpha: kAlphaValues) {
              args.alpha = alpha;
              // Runs the reference clBLAS code on freshly uploaded source data
              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
              r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
              auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
              // Runs the CLBlast code; x is uploaded again so both runs start from the same data
              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
              s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
              auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
              // If either routine did not succeed, only the two status codes are compared and
              // the results are not inspected
              if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
                TestErrorCodes(status1, status2, args);
                continue;
              }
              // Downloads the results
              std::vector<T> r_result(y_size, static_cast<T>(0));
              std::vector<T> s_result(y_size, static_cast<T>(0));
              r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
              s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
              // Checks for differences in the output. Only the n elements addressed through the
              // y-increment and y-offset are compared.
              auto errors = size_t{0};
              for (auto idn=size_t{0}; idn<n; ++idn) {
                auto index = idn*y_inc + y_offset;
                if (!TestSimilarity(r_result[index], s_result[index])) {
                  errors++;
                }
              }
              // Tests the error count (should be zero)
              TestErrorCount(errors, n, args);
            }
          }
        }
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
 // does not test for results (if any).
 // Runs both lambdas with n fixed to kBufferSize while the x- and y-buffers are empty, one
 // element short of kBufferSize or exactly kBufferSize elements, combined with every pair of
 // increments from kInvalidIncrements. Only the two returned status codes are compared. The
 // fields n, x_offset, y_offset, x_inc and y_inc of 'args' are overwritten.
 template <typename T>
 void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("invalid buffer sizes", name);
  // Sets example test parameters
  args.n = kBufferSize;
  args.x_offset = 0;
  args.y_offset = 0;
  // Iterates over test buffer sizes (in elements)
  const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
  for (auto &x_size: kBufferSizes) {
    for (auto &y_size: kBufferSizes) {
      // Iterates over test increments
      for (auto &x_inc: kInvalidIncrements) {
        args.x_inc = x_inc;
        for (auto &y_inc: kInvalidIncrements) {
          args.y_inc = y_inc;
          // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
          // want to be able to create invalid buffers (no error checking here). The error-code
          // output argument of clCreateBuffer is a nullptr, so a failed creation goes unnoticed.
          auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
          auto x_vec = Buffer(x);
          auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
          auto r_vec = Buffer(r);
          auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
          auto s_vec = Buffer(s);
          // Runs the two routines: the reference writes to r_vec, CLBlast writes to s_vec
          auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
          auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
          // Tests for equality of the two status codes
          TestErrorCodes(status1, status2, args);
        }
      }
    }
  }
  TestEnd();
 }
 // =================================================================================================
 // Compiles the templated class
 // One explicit instantiation per tested data-type: real and complex, single and double precision
 template class TestXY<float>;
 template class TestXY<double>;
 template class TestXY<float2>;
 template class TestXY<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/test/correctness/testxy.h
+++ b/test/correctness/testxy.h
@ -1,84 +0,0 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under the MIT license. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
 // all sorts of input combinations, and one deliberately testing with invalid values.
 //
 // =================================================================================================
 #ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
 #define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
 #include <vector>
 #include <string>
 #include "correctness/tester.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Tester for vector-vector (X,Y) routines. It holds two callables with the same signature (one
 // for CLBlast and one for the clBLAS reference) and the settings used to generate test-cases.
 // Typename T: the data-type of the vectors and of the arguments structure.
 template <typename T>
 class TestXY: public Tester<T> {
 public:
  // Uses several variables from the Tester class
  using Tester<T>::context_;
  using Tester<T>::queue_;
  // Uses several helper functions from the Tester class
  using Tester<T>::TestStart;
  using Tester<T>::TestEnd;
  using Tester<T>::TestSimilarity;
  using Tester<T>::TestErrorCount;
  using Tester<T>::TestErrorCodes;
  using Tester<T>::GetExampleScalars;
  using Tester<T>::GetOffsets;
  using Tester<T>::PrecisionSupported;
  // Test settings for the regular test. Append to this list in case more tests are required.
  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
  const std::vector<size_t> kOffsets = GetOffsets();
  const std::vector<size_t> kIncrements = { 1, 2, 7 };
  const std::vector<T> kAlphaValues = GetExampleScalars();
  // Test settings for the invalid test: the increments to try for both vectors, and the value used
  // both as the vector length 'n' and as the basis for the tested buffer sizes.
  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
  const size_t kBufferSize = 512;
  // Shorthand for a BLAS routine: takes the arguments, the X and Y buffers and a command queue,
  // and returns the status code of the library call.
  using Routine = std::function<StatusCode(const Arguments<T>&,
                                           const Buffer&, const Buffer&,
                                           CommandQueue&)>;
  // Constructor, initializes the base class tester and input data
  TestXY(int argc, char *argv[], const bool silent,
         const std::string &name, const std::vector<std::string> &options,
         const Routine clblast_lambda, const Routine clblas_lambda);
  // The test functions. Both take the arguments structure (non-const: the tests overwrite its
  // fields while iterating over the test settings) and the name under which the test is reported.
  void TestRegular(Arguments<T> &args, const std::string &name);
  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
 private:
  // Source data to test with
  std::vector<T> x_source_;
  std::vector<T> y_source_;
  // The routines to test: the CLBlast routine and the clBLAS routine it is compared against
  Routine clblast_lambda_;
  Routine clblas_lambda_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_TEST_CORRECTNESS_TESTXY_H_
 #endif
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@ -21,249 +21,36 @@
 namespace clblast {
 // =================================================================================================
-// This is the vector-vector variant of the set-up/tear-down client routine.
+// Constructor
-template <typename T>
+template <typename T, typename U>
-void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
+Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
-              const std::vector<std::string> &options) {
+                    const std::vector<std::string> &options,
-
+                    const GetMetric get_flops, const GetMetric get_bytes):
-  // Function to determine how to find the default value of the leading dimension of matrix A.
+  run_routine_(run_routine),
-  // Note: this is not relevant for this client but given anyway.
+  run_reference_(run_reference),
-  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
+  options_(options),
-
+  get_flops_(get_flops),
-  // Simple command line argument parser with defaults
+  get_bytes_(get_bytes) {
  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
  if (args.print_help) { return; }
  // Prints the header of the output table
  PrintTableHeader(args.silent, options);
  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, kDeviceType, args.device_id);
  auto context = Context(device);
  auto queue = CommandQueue(context, device);
  if (args.compare_clblas) { clblasSetup(); }
  // Iterates over all "num_step" values jumping by "step" each time
  auto s = size_t{0};
  while(true) {
    // Computes the data sizes
    auto x_size = args.n*args.x_inc + args.x_offset;
    auto y_size = args.n*args.y_inc + args.y_offset;
    // Populates input host vectors with random data
    std::vector<T> x_source(x_size);
    std::vector<T> y_source(y_size);
    PopulateVector(x_source);
    PopulateVector(y_source);
    // Creates the vectors on the device
    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
    // Runs the routine-specific code
    client_routine(args, x_buffer, y_buffer, queue);
    // Makes the jump to the next step
    ++s;
    if (s >= args.num_steps) { break; }
    args.n += args.step;
  }
  // Cleans-up and returns
  if (args.compare_clblas) { clblasTeardown(); }
 }
 // Compiles the above function
 template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
 template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
 template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
 template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
 // =================================================================================================
 // Set-up/tear-down client for matrix-vector-vector (A,X,Y) routines. It parses the command-line
 // arguments, initializes OpenCL (and clBLAS if a comparison is requested), and then calls the
 // routine-specific 'client_routine' once per step with freshly populated device buffers.
 template <typename T>
 void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
               const std::vector<std::string> &options) {
  // The leading dimension of matrix A defaults to the value of 'n'
  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
  // Parses the command-line arguments; there is nothing left to do if only help was requested
  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
  if (args.print_help) { return; }
  // Output starts with the table header
  PrintTableHeader(args.silent, options);
  // Sets up the OpenCL platform, device, context and queue, followed by the reference library
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, kDeviceType, args.device_id);
  auto context = Context(device);
  auto queue = CommandQueue(context, device);
  if (args.compare_clblas) { clblasSetup(); }
  // Runs at least one step, and continues until "num_steps" steps have been performed
  for (auto step_id = size_t{1}; ; ++step_id) {
    // The second dimension of the matrix depends on the layout
    const auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
    // The vector lengths are swapped when the matrix is transposed
    auto x_length = args.n;
    auto y_length = args.m;
    if (args.a_transpose == Transpose::kYes) {
      x_length = args.m;
      y_length = args.n;
    }
    // Number of elements in each of the three buffers
    const auto a_size = a_two * args.a_ld + args.a_offset;
    const auto x_size = x_length*args.x_inc + args.x_offset;
    const auto y_size = y_length*args.y_inc + args.y_offset;
    // Host-side input data, filled with random values
    auto a_source = std::vector<T>(a_size);
    auto x_source = std::vector<T>(x_size);
    auto y_source = std::vector<T>(y_size);
    PopulateVector(a_source);
    PopulateVector(x_source);
    PopulateVector(y_source);
    // Device-side buffers, initialized with the host data
    const auto a_bytes = a_size*sizeof(T);
    const auto x_bytes = x_size*sizeof(T);
    const auto y_bytes = y_size*sizeof(T);
    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_bytes);
    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_bytes);
    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_bytes);
    a_buffer.WriteBuffer(queue, a_bytes, a_source);
    x_buffer.WriteBuffer(queue, x_bytes, x_source);
    y_buffer.WriteBuffer(queue, y_bytes, y_source);
    // Hands over to the routine-specific code
    client_routine(args, a_buffer, x_buffer, y_buffer, queue);
    // Stops after the final step; otherwise grows the problem by "step" for the next one
    if (step_id >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.a_ld += args.step;
  }
  // Tears down the reference library again
  if (args.compare_clblas) { clblasTeardown(); }
 }
 // Compiles the above function
 template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
 template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
 template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
 template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
 // =================================================================================================
 // This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
 template <typename T>
 void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
                     const std::vector<std::string> &options) {
  // Function to determine how to find the default value of the leading dimension of matrix A
  auto default_ld_a = [](const Arguments<T> args) { return args.m; };
  // Simple command line argument parser with defaults
  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
  if (args.print_help) { return; }
  // Prints the header of the output table
  PrintTableHeader(args.silent, options);
  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, kDeviceType, args.device_id);
  auto context = Context(device);
  auto queue = CommandQueue(context, device);
  if (args.compare_clblas) { clblasSetup(); }
  // Computes whether or not the matrices are transposed. Note that we assume a default of
  // column-major and no-transpose. If one of them is different (but not both), then rotated
  // is considered true.
  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
  auto c_rotated = (args.layout == Layout::kRowMajor);
  // Iterates over all "num_step" values jumping by "step" each time
  auto s = size_t{0};
  while(true) {
    // Computes the data sizes
    auto a_two = (a_rotated) ? args.m : args.k;
    auto b_two = (b_rotated) ? args.k : args.n;
    auto c_two = (c_rotated) ? args.m : args.n;
    auto a_size = a_two * args.a_ld + args.a_offset;
    auto b_size = b_two * args.b_ld + args.b_offset;
    auto c_size = c_two * args.c_ld + args.c_offset;
    // Populates input host matrices with random data
    std::vector<T> a_source(a_size);
    std::vector<T> b_source(b_size);
    std::vector<T> c_source(c_size);
    PopulateVector(a_source);
    PopulateVector(b_source);
    PopulateVector(c_source);
    // Creates the matrices on the device
    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
    auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
    auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
    b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
    c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
    // Runs the routine-specific code
    client_routine(args, a_buffer, b_buffer, c_buffer, queue);
    // Makes the jump to the next step
    ++s;
    if (s >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.k += args.step;
    args.a_ld += args.step;
    args.b_ld += args.step;
    args.c_ld += args.step;
  }
  // Cleans-up and returns
  if (args.compare_clblas) { clblasTeardown(); }
 }
 // Compiles the above function
 template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
 template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
 template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
 template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
 // =================================================================================================
 // Parses all arguments available for the CLBlast client testers. Some arguments might not be
 // applicable, but are searched for anyway to be able to create one common argument parser. All
 // arguments have a default value in case they are not found.
-template <typename T>
+template <typename T, typename U>
-Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
+Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
-                            const std::function<size_t(const Arguments<T>)> default_ld_a) {
+                                         const GetMetric default_b_ld, const GetMetric default_c_ld) {
-  auto args = Arguments<T>{};
+  auto args = Arguments<U>{};
  auto help = std::string{"Options given/available:\n"};
  // These are the options which are not for every client: they are optional
-  for (auto &o: options) {
+  for (auto &o: options_) {
    // Data-sizes
-    if (o == kArgM) { args.m = args.k  = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgM) { args.m  = GetArgument(argc, argv, help, kArgM, 512UL); }
-    if (o == kArgN) { args.n           = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgN) { args.n  = GetArgument(argc, argv, help, kArgN, 512UL); }
-    if (o == kArgK) { args.k           = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgK) { args.k  = GetArgument(argc, argv, help, kArgK, 512UL); }
    // Data-layouts
    if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -271,6 +58,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
    if (o == kArgBTransp)  { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
    if (o == kArgSide)     { args.side        = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
    if (o == kArgTriangle) { args.triangle    = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
    if (o == kArgDiagonal) { args.diagonal    = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); }
    // Vector arguments
    if (o == kArgXInc)    { args.x_inc    = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
@ -279,16 +67,16 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
    if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
    // Matrix arguments
-    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
+    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
-    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
+    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
-    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
+    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
    if (o == kArgAOffset)  { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
    if (o == kArgBOffset)  { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
    if (o == kArgCOffset)  { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
    // Scalar values 
-    if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
+    if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
-    if (o == kArgBeta)  { args.beta  = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
+    if (o == kArgBeta)  { args.beta  = GetArgument(argc, argv, help, kArgBeta, GetScalar<U>()); }
  }
  // These are the options common to all routines
@ -313,16 +101,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
 // =================================================================================================
 // This is the main performance tester. It prints the table header, sets up OpenCL (and clBLAS if
 // a comparison is requested), and then times the routine for a number of steps, printing one
 // table row per step.
 //   args:      the parsed arguments. They are modified in-place: 'set_sizes' fills in the buffer
 //              sizes, and m, n, k, a_ld, b_ld and c_ld grow by 'args.step' after each step.
 //   set_sizes: routine-specific function setting the x/y/a/b/c buffer sizes in 'args'.
 // Errors reported by the libraries are turned into exceptions by 'TimedExecution'.
 template <typename T, typename U>
 void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
  // Prints the header of the output table
  PrintTableHeader(args.silent, options_);
  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, kDeviceType, args.device_id);
  auto context = Context(device);
  auto queue = CommandQueue(context, device);
  if (args.compare_clblas) { clblasSetup(); }
  // Iterates over all "num_step" values jumping by "step" each time. Note that at least one step
  // is always executed, since the step count is only checked at the end of the loop body.
  auto s = size_t{0};
  while(true) {
    // Sets the buffer sizes (routine-specific)
    set_sizes(args);
    // Populates input host matrices with random data
    std::vector<T> x_source(args.x_size);
    std::vector<T> y_source(args.y_size);
    std::vector<T> a_source(args.a_size);
    std::vector<T> b_source(args.b_size);
    std::vector<T> c_source(args.c_size);
    PopulateVector(x_source);
    PopulateVector(y_source);
    PopulateVector(a_source);
    PopulateVector(b_source);
    PopulateVector(c_source);
    // Creates the matrices on the device
    auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
    auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
    auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
    auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
    auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
    x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
    y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
    a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
    b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
    c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
    auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
    // Runs the routines and collects the timings. The clBLAS library is only initialized when a
    // comparison is requested (see above), so the reference is also only executed in that case.
    // A time of zero makes 'PrintTableRow' report zero for the clBLAS GFLOPS and GB/s metrics.
    auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
    auto ms_clblas = 0.0;
    if (args.compare_clblas) {
      ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
    }
    // Prints the performance of both libraries
    PrintTableRow(args, ms_clblast, ms_clblas);
    // Makes the jump to the next step
    ++s;
    if (s >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.k += args.step;
    args.a_ld += args.step;
    args.b_ld += args.step;
    args.c_ld += args.step;
  }
  // Cleans-up and returns
  if (args.compare_clblas) { clblasTeardown(); }
 }
 // =================================================================================================
 // Creates a vector of timing results, filled with execution times of the 'main computation'. The
 // timing is performed using the milliseconds chrono functions. The function returns the minimum
 // value found in the vector of timing results. The return value is in milliseconds.
-double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
+template <typename T, typename U>
 double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
                                   const Buffers &buffers, CommandQueue &queue,
                                   Routine run_blas, const std::string &library_name) {
  auto timings = std::vector<double>(num_runs);
  for (auto &timing: timings) {
    auto start_time = std::chrono::steady_clock::now();
    // Executes the main computation
-    main_computation();
+    auto status = run_blas(args, buffers, queue);
    if (status != StatusCode::kSuccess) {
      throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
    }
    // Records and stores the end-time
    auto elapsed_time = std::chrono::steady_clock::now() - start_time;
@ -334,7 +198,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
 // =================================================================================================
 // Prints the header of the performance table
-void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
+template <typename T, typename U>
 void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
  if (!silent) {
    for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
    fprintf(stdout, " | <--       CLBlast       --> | <--      clBLAS      --> |\n");
@ -345,29 +210,60 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
 }
 // Print a performance-result row
-void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
+template <typename T, typename U>
-                   const bool no_abbrv, const double ms_clblast, const double ms_clblas,
+void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
-                   const unsigned long long flops, const unsigned long long bytes) {
+                                const double ms_clblas) {
  // Creates a vector of relevant variables
  auto integers = std::vector<size_t>{};
  for (auto &o: options_) {
    if      (o == kArgM) {        integers.push_back(args.m); }
    if      (o == kArgN) {        integers.push_back(args.n); }
    else if (o == kArgK) {        integers.push_back(args.k); }
    else if (o == kArgLayout) {   integers.push_back(static_cast<size_t>(args.layout)); }
    else if (o == kArgSide) {     integers.push_back(static_cast<size_t>(args.side)); }
    else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
    else if (o == kArgATransp) {  integers.push_back(static_cast<size_t>(args.a_transpose)); }
    else if (o == kArgBTransp) {  integers.push_back(static_cast<size_t>(args.b_transpose)); }
    else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
    else if (o == kArgXInc) {     integers.push_back(args.x_inc); }
    else if (o == kArgYInc) {     integers.push_back(args.y_inc); }
    else if (o == kArgXOffset) {  integers.push_back(args.x_offset); }
    else if (o == kArgYOffset) {  integers.push_back(args.y_offset); }
    else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
    else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
    else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
    else if (o == kArgAOffset) {  integers.push_back(args.a_offset); }
    else if (o == kArgBOffset) {  integers.push_back(args.b_offset); }
    else if (o == kArgCOffset) {  integers.push_back(args.c_offset); }
  }
  auto strings = std::vector<std::string>{};
  for (auto &o: options_) {
    if      (o == kArgAlpha) {    strings.push_back(ToString(args.alpha)); }
    else if (o == kArgBeta) {     strings.push_back(ToString(args.beta)); }
  }
  // Computes the GFLOPS and GB/s metrics
  auto flops = get_flops_(args);
  auto bytes = get_bytes_(args);
  auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
  auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
  auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
  auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
  // Outputs the argument values
-  for (auto &argument: args_int) {
+  for (auto &argument: integers) {
-    if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
+    if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
      fprintf(stdout, "%8luM;", argument/(1024*1024));
    }
-    else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
+    else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
      fprintf(stdout, "%8luK;", argument/1024);
    }
    else {
      fprintf(stdout, "%9lu;", argument);
    }
  }
-  for (auto &argument: args_string) {
+  for (auto &argument: strings) {
    fprintf(stdout, "%9s;", argument.c_str());
  }
@ -377,5 +273,15 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
          ms_clblas, gflops_clblas, gbs_clblas);
 }
 // =================================================================================================
 // Compiles the templated class
 template class Client<float,float>;
 template class Client<double,double>;
 template class Client<float2,float2>;
 template class Client<double2,double2>;
 template class Client<float2,float>;
 template class Client<double2,double>;
 // =================================================================================================
 } // namespace clblast
--- a/test/performance/client.h
+++ b/test/performance/client.h
@ -7,7 +7,14 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file provides common function declarations to be used with the test clients.
+// This class implements the performance-test client. It is generic for all CLBlast routines by
 // taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
 // or how to get the FLOPS count.
 // Typename T: the data-type of the routine's memory buffers (==precision)
 // Typename U: the data-type of the alpha and beta arguments
 //
 // This file also provides the common interface to the performance client (see the 'RunClient'
 // function for details).
 //
 // =================================================================================================
@ -26,61 +33,71 @@
 namespace clblast {
 // =================================================================================================
-// Types of devices to consider
+// See comment at top of file for a description of the class
-const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+template <typename T, typename U>
 class Client {
 public:
  // Types of devices to consider
  const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
  // Shorthand for the routine-specific functions passed to the tester:
  //   Routine:   runs a BLAS routine on the given buffers and queue, returning its status code
  //   SetMetric: modifies the arguments in-place (used to fill in the buffer sizes)
  //   GetMetric: computes a size_t value from the arguments (used for the FLOPS count, the number
  //              of bytes, and the default leading dimensions)
  using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
  using SetMetric = std::function<void(Arguments<U>&)>;
  using GetMetric = std::function<size_t(const Arguments<U>&)>;
  // The constructor: stores the routine to benchmark, the reference routine to compare against,
  // the names of the routine-specific options, and the functions computing the FLOPS and bytes.
  Client(const Routine run_routine, const Routine run_reference,
         const std::vector<std::string> &options,
         const GetMetric get_flops, const GetMetric get_bytes);
  // Parses all command-line arguments, filling in the arguments structure. If no command-line
  // argument is given for a particular argument, it is filled in with a default value. The three
  // functions supply the default leading dimensions of the A, B and C matrices.
  Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
                              const GetMetric default_b_ld, const GetMetric default_c_ld);
  // The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it
  // calls the client routines. The arguments are modified: 'set_sizes' is applied to them and the
  // dimensions grow after each step.
  void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);
 private:
  // Runs a function a given number of times and returns the execution time of the shortest instance
  // in milliseconds. Throws a std::runtime_error (prefixed with 'library_name') if the function
  // returns a status code other than success.
  double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers &buffers,
                        CommandQueue &queue, Routine run_blas, const std::string &library_name);
  // Prints the header of a performance-data table
  void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
  // Prints a row of performance data, including results of two libraries
  void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
  // The routine-specific functions passed to the tester
  const Routine run_routine_;
  const Routine run_reference_;
  const std::vector<std::string> options_;
  const GetMetric get_flops_;
  const GetMetric get_bytes_;
 };
 // =================================================================================================
-// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
+// The interface to the performance client. This is a separate function in the header such that it
-template <typename T>
+// is automatically compiled for each routine, templated by the parameter "C".
-using Routine2 = std::function<void(const Arguments<T>&,
+template <typename C, typename T, typename U>
-                                    const Buffer&, const Buffer&,
+void RunClient(int argc, char *argv[]) {
                                    CommandQueue&)>;
 template <typename T>
 using Routine3 = std::function<void(const Arguments<T>&,
                                    const Buffer&, const Buffer&, const Buffer&,
                                    CommandQueue&)>;
-// =================================================================================================
+  // Creates a new client
  auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
                            C::GetFlops, C::GetBytes);
-// These are the main client functions, setting-up arguments, matrices, OpenCL buffers, etc. After
+  // Simple command line argument parser with defaults
-// set-up, they call the client routine, passed as argument to this function.
+  auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
-template <typename T>
+  if (args.print_help) { return; }
 void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
              const std::vector<std::string> &options);
 template <typename T>
 void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
               const std::vector<std::string> &options);
 template <typename T>
 void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
               const std::vector<std::string> &options);
-// =================================================================================================
+  // Runs the client
-
+  client.PerformanceTest(args, C::SetSizes);
-// Parses all command-line arguments, filling in the arguments structure. If no command-line
+}
 // argument is given for a particular argument, it is filled in with a default value.
 template <typename T>
 Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
                            const std::function<size_t(const Arguments<T>)> default_ld_a);
 // Retrieves only the precision command-line argument, since the above function is templated based
 // on the precision
 Precision GetPrecision(int argc, char *argv[]);
 // =================================================================================================
 // Runs a function a given number of times and returns the execution time of the shortest instance
 double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
 // =================================================================================================
 // Prints the header of a performance-data table
 void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
 // Prints a row of performance data, including results of two libraries
 void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
                   const bool abbreviations, const double ms_clblast, const double ms_clblas,
                   const unsigned long long flops, const unsigned long long bytes);
 // =================================================================================================
 } // namespace clblast
--- a/test/performance/graphs/common.r
+++ b/test/performance/graphs/common.r
@ -83,7 +83,16 @@ main <- function(routine_name, precision, test_names, test_values,
      params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
      arguments <- paste(devices_string, params_string, options_string, sep=" ")
      print(paste("Running", executable, arguments, sep=" "))
-      result_string <- system2(command=executable, args=arguments, stdout=TRUE)
+      raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
      # Filter the string: only lines containing a ";" can be valid lines
      result_string <- c()
      for (line in raw_result_string) {
        if (grepl(";",line)) {
          result_string <-
           c(result_string, line)
        }
      }
      # Reads the result into a dataframe
      command_db <- read.csv(text=result_string, sep=";")
--- a/test/performance/graphs/xgemm.r
+++ b/test/performance/graphs/xgemm.r
@ -35,10 +35,10 @@ test_names <- list(
 # Defines the test-cases
 test_values <- list(
-  list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 128,  128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
-  list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
-  list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c( 512,  512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
  list(
    c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
    c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
  ),
  list(
-    c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+    c(   8,    8,    8, 1, 0, 0, 1, 0, num_runs, precision),
-    c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16,   16, 1, 0, 0, 1, 0, num_runs, precision),
-    c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32,   32, 1, 0, 0, 1, 0, num_runs, precision),
-    c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64,   64, 1, 0, 0, 1, 0, num_runs, precision),
-    c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128,  128, 1, 0, 0, 1, 0, num_runs, precision),
-    c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256,  256, 1, 0, 0, 1, 0, num_runs, precision),
-    c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512,  512, 1, 0, 0, 1, 0, num_runs, precision),
-    c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
-    c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
-    c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
-    c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+    c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
  )
 )
--- a/test/performance/graphs/xsymm.r
+++ b/test/performance/graphs/xsymm.r
@ -19,7 +19,7 @@ source(file.path(dirname(thisfile), "common.r"))
 # Settings
 routine_name <- "xsymm"
-parameters <- c("-m","-n","-layout","-triangle","-side",
+parameters <- c("-m","-n","-layout","-side","-triangle",
                "-num_steps","-step","-runs","-precision")
 precision <- 32
@ -29,16 +29,16 @@ test_names <- list(
  "multiples of 128 (+1)",
  "around m=n=512",
  "around m=n=2048",
-  "layouts and triangle/side (m=n=1024)",
+  "layouts and side/triangle (m=n=1024)",
  "powers of 2"
 )
 # Defines the test-cases
 test_values <- list(
-  list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
-  list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
-  list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c( 512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
  list(
    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
  ),
  list(
-    c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+    c(   8,    8, 1, 0, 0, 1, 0, num_runs, precision),
-    c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16, 1, 0, 0, 1, 0, num_runs, precision),
-    c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32, 1, 0, 0, 1, 0, num_runs, precision),
-    c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64, 1, 0, 0, 1, 0, num_runs, precision),
-    c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128, 1, 0, 0, 1, 0, num_runs, precision),
-    c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256, 1, 0, 0, 1, 0, num_runs, precision),
-    c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512, 1, 0, 0, 1, 0, num_runs, precision),
-    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
-    c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
-    c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
-    c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+    c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
  )
 )
@ -70,7 +70,7 @@ test_xlabels <- list(
  "matrix sizes (m=n)",
  "matrix sizes (m=n)",
  "matrix sizes (m=n)",
-  "layout (row/col), triangle (up/lo), side (l/r)",
+  "layout (row/col), side (l/r), triangle (up/lo)",
  "matrix sizes (m=n)"
 )
@ -80,8 +80,8 @@ test_xaxis <- list(
  c("m", ""),
  c("m", ""),
  c("m", ""),
-  list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
+  list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
-              "col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
+              "col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
  c("m", "x")
 )
--- a/test/performance/graphs/xsyr2k.r
+++ b/test/performance/graphs/xsyr2k.r
@ -0,0 +1,94 @@
 # ==================================================================================================
 # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 # project uses a tab-size of two spaces and a max-width of 100 characters per line.
 #
 # Author(s):
 #   Cedric Nugteren <www.cedricnugteren.nl>
 #
 # This file implements the performance script for the Xsyr2k routine
 #
 # ==================================================================================================
 # Includes the common functions
 # The full command line is searched for the "--file=<path>" entry (the form in which Rscript
 # reports the script being run); its directory is then used to locate common.r, so the include
 # does not depend on the current working directory.
 args <- commandArgs(trailingOnly = FALSE)
 thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
 source(file.path(dirname(thisfile), "common.r"))
 # ==================================================================================================
 # Settings
 routine_name <- "xsyr2k"
 # Command-line flags of the client; every test-case vector in test_values holds one value per
 # flag, in exactly this order (9 flags, 9 values)
 parameters <- c("-n","-k","-layout","-triangle","-transA",
                 "-num_steps","-step","-runs","-precision")
 # Value used for the "-precision" flag in every test-case below
 precision <- 32
 # Sets the names of the test-cases
 # Six entries: one per test-case, in the same order as the entries of test_values
 test_names <- list(
  "multiples of 128",
  "multiples of 128 (+1)",
  "around n=k=512",
  "around n=k=1536",
  "layouts and transposing (n=k=1024)",
  "powers of 2"
 )
 # Defines the test-cases
 # Each vector follows the order of 'parameters':
 #   n, k, layout, triangle, transA, num_steps, step, runs, precision
 # NOTE(review): num_runs is not defined in this script; presumably it is provided by the sourced
 # common.r - confirm.
 test_values <- list(
  # Test-cases 1-4: a single command each, with num_steps=16 and a step of 128 or 1
  list(c( 128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
  list(c( 129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
  list(c( 512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
  list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
  # Test-case 5: all 8 combinations of layout, triangle and transA (each 0 or 1) at n=k=1024
  list(
    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
    c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
    c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
  ),
  # Test-case 6: one command per power of 2 from 8 up to 8192 (num_steps=1, step=0)
  list(
    c(   8,    8, 1, 0, 0, 1, 0, num_runs, precision),
    c(  16,   16, 1, 0, 0, 1, 0, num_runs, precision),
    c(  32,   32, 1, 0, 0, 1, 0, num_runs, precision),
    c(  64,   64, 1, 0, 0, 1, 0, num_runs, precision),
    c( 128,  128, 1, 0, 0, 1, 0, num_runs, precision),
    c( 256,  256, 1, 0, 0, 1, 0, num_runs, precision),
    c( 512,  512, 1, 0, 0, 1, 0, num_runs, precision),
    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
    c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
    c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
    c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
  )
 )
 # Defines the x-labels corresponding to the test-cases
 # Six entries, one per test-case; only the fifth (the layout/triangle/transA sweep) differs
 test_xlabels <- list(
  "matrix sizes (n=k)",
  "matrix sizes (n=k)",
  "matrix sizes (n=k)",
  "matrix sizes (n=k)",
  "layout (row/col), triangle (u/l), transA (n/y)",
  "matrix sizes (n=k)"
 )
 # Defines the x-axis of the test-cases
 # Entries 1-4 and 6 are pairs of strings; entry 5 instead pairs the positions 1:8 with one text
 # label per layout/triangle/transA combination, listed in the same order as the 8 commands of the
 # fifth test-case in test_values.
 # NOTE(review): how the string pairs are interpreted is decided by the plotting code that is
 # sourced from common.r and is not visible here - confirm there.
 test_xaxis <- list(
  c("n", ""),
  c("n", ""),
  c("n", ""),
  c("n", ""),
  list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
              "col,u,n", "col,u,y", "col,l,n", "col,l,y")),
  c("n", "x")
 )
 # ==================================================================================================
 # Start the script
 # Passes all settings defined above to main(), which is not defined in this file and is expected
 # to come from the sourced common.r.
 # NOTE(review): metric_gflops=TRUE presumably selects GFLOPS as the plotted metric - confirm.
 main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
      test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
 # ==================================================================================================
--- a/Show more
+++ b/Show more