Merge pull request #15 from CNugteren/development

Update to version 0.3.0
Cedric Nugteren 2015-07-24 08:30:41 +02:00
commit db6846b791
127 changed files with 6575 additions and 2664 deletions

View file

@ -1,4 +1,16 @@
Version 0.3.0
- Re-organized test/client infrastructure to avoid code duplication
- Added an optional bypass for pre/post-processing kernels in level-3 routines
- Significantly improved performance of level-3 routines on AMD GPUs
- Added level-3 routines:
* CHEMM/ZHEMM
* SSYRK/DSYRK/CSYRK/ZSYRK
* CHERK/ZHERK
* SSYR2K/DSYR2K/CSYR2K/ZSYR2K
* CHER2K/ZHER2K
* STRMM/DTRMM/CTRMM/ZTRMM
Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 2)
set(clblast_VERSION_MINOR 3)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(ROUTINES_XY xaxpy)
set(ROUTINES_AXY xgemv)
set(ROUTINES_ABC xgemm xsymm)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
set(LEVEL1_ROUTINES xaxpy)
set(LEVEL2_ROUTINES xgemv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
# ==================================================================================================
# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
foreach(ROUTINE ${ROUTINES})
set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
endforeach()
# Creates and links the library
@ -168,33 +174,23 @@ if(TESTS)
include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT test/correctness/tester.cc)
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
add_library(test_correctness_common OBJECT
test/correctness/tester.cc test/correctness/testblas.cc)
# Compiles the correctness-tests
foreach(ROUTINE ${ROUTINES_XY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_xy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_AXY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_axy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_ABC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_abc>
test/correctness/routines/${ROUTINE}.cc)
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
@ -203,10 +199,19 @@ if(TESTS)
add_library(test_performance_common OBJECT test/performance/client.cc)
# Compiles the performance-tests
set(TEST_PERF_COMM )
foreach(ROUTINE ${ROUTINES})
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/${ROUTINE}.cc)
test/performance/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS client_${ROUTINE} DESTINATION bin)
endforeach()

View file

@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview version supports only a minimal number of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview version doesn't support all routines yet: more will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Why CLBlast and not clBLAS or cuBLAS?
@ -109,13 +109,13 @@ Performance remarks
The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included, performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
* The library reaches a high peak performance for large matrix sizes, in some cases a factor of 2 more than clBLAS.
* The performance for non-power-of-2 sizes (e.g. 1000) is roughly equal to that of power-of-2 sizes (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor of 2.
* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
The graphs also show the current weak points of CLBlast: its performance for smaller matrix sizes is not very good. Furthermore, although the GEMM kernels perform well on AMD GPUs, the supporting copy and transpose kernels do not.
The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
| xSWAP | | | | | |
| xSCAL | | | | | +CS +ZD |
| xCOPY | | | | | |
| xAXPY |`x`|`x`|`x`|`x`| |
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
| xDOT | | | - | - | +DS |
| xDOTU | - | - | | | |
| xDOTC | - | - | | | |
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV |`x`|`x`|`x`|`x`| |
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
@ -175,14 +175,14 @@ CLBlast is in active development and currently does not support the full set of
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM |`x`|`x`|`x`|`x`| |
| xSYMM |`x`|`x`|`x`|`x`| |
| xHEMM | - | - | | | |
| xSYRK | | | | | |
| xHERK | - | - | | | |
| xSYR2K | | | | | |
| xHER2K | - | - | | | |
| xTRMM | | | | | |
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
| xHEMM | - | - | ✔ | ✔ | |
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
| xHERK | - | - | ✔ | ✔ | |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
| xHER2K | - | - | ✔ | ✔ | |
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
| xTRSM | | | | | |
@ -214,8 +214,6 @@ To-do list before release of version 1.0
- Improve host performance:
* Allow initialization to pre-compile kernels and store to disk
- Improve device performance:
* Enable 'mad()' for AMD devices
* Improve the performance of the copy and transpose kernels
* Tune for a wider range of devices
* Allow users to define custom tuned parameters
- Improve the tuning

16 binary files changed (contents not shown)

View file

@ -75,6 +75,7 @@ enum class Layout { kRowMajor, kColMajor };
enum class Transpose { kNo, kYes, kConjugate };
enum class Side { kLeft, kRight };
enum class Triangle { kUpper, kLower };
enum class Diagonal { kUnit, kNonUnit };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@ -95,7 +96,7 @@ StatusCode Axpy(const size_t n, const T alpha,
// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -107,9 +108,9 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -118,7 +119,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
@ -129,6 +130,81 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM
/*
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
*/
// =================================================================================================
} // namespace clblast
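
To make the intended use of these new level-3 entry points concrete, below is a minimal host-side sketch that calls the templated Syrk routine declared above. It is not part of this commit: the OpenCL set-up (first platform and device, a default in-order queue), the matrix contents, and the choice to flush the queue instead of waiting on the returned event are illustrative assumptions, and error checking is omitted.

// Minimal sketch (not from this commit): single-precision SYRK via the new API.
#include <vector>
#include <CL/cl.h>
#include "clblast.h"

int main() {
  const size_t n = 512, k = 256;

  // Plain OpenCL host set-up (first platform/device, default queue)
  cl_platform_id platform; clGetPlatformIDs(1, &platform, nullptr);
  cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

  // A is n-by-k, C is n-by-n (column-major, leading dimension equals the number of rows)
  std::vector<float> host_a(n * k, 1.0f), host_c(n * n, 0.0f);
  cl_mem a = clCreateBuffer(context, CL_MEM_READ_WRITE, host_a.size() * sizeof(float), nullptr, nullptr);
  cl_mem c = clCreateBuffer(context, CL_MEM_READ_WRITE, host_c.size() * sizeof(float), nullptr, nullptr);
  clEnqueueWriteBuffer(queue, a, CL_TRUE, 0, host_a.size() * sizeof(float), host_a.data(), 0, nullptr, nullptr);
  clEnqueueWriteBuffer(queue, c, CL_TRUE, 0, host_c.size() * sizeof(float), host_c.data(), 0, nullptr, nullptr);

  // C := 2*A*A^T + 1*C, updating only the upper triangle of C
  cl_event event = nullptr;
  auto status = clblast::Syrk<float>(clblast::Layout::kColMajor, clblast::Triangle::kUpper,
                                     clblast::Transpose::kNo, n, k,
                                     2.0f, a, 0, n,
                                     1.0f, c, 0, n,
                                     &queue, &event);
  if (status == clblast::StatusCode::kSuccess) { clFinish(queue); }

  clReleaseMemObject(a); clReleaseMemObject(c);
  clReleaseCommandQueue(queue); clReleaseContext(context);
  return static_cast<int>(status);
}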

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::CopySingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadTraSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadTraDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadTraComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
}
},

View file

@ -18,24 +18,24 @@ const Database::DatabaseEntry Database::TraSingle = {
"Transpose", Precision::kSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -47,14 +47,14 @@ const Database::DatabaseEntry Database::TraDouble = {
"Transpose", Precision::kDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::TraDouble = {
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -75,24 +75,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -104,14 +104,14 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
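
Each database entry above is just a set of name/value pairs selected by vendor and device name. As a hypothetical illustration (the function name and container type are not the library's actual interface), such a parameter set could be turned into OpenCL compile-time defines like this:

// Illustrative only: convert tuned parameters into "-DNAME=value" kernel build options.
#include <cstddef>
#include <map>
#include <string>

std::string AsBuildOptions(const std::map<std::string, std::size_t> &parameters) {
  std::string options;
  for (const auto &parameter : parameters) {
    options += "-D" + parameter.first + "=" + std::to_string(parameter.second) + " ";
  }
  return options;
}

// For {{"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2}} this yields
// "-DTRA_DIM=16 -DTRA_PAD=0 -DTRA_SHUFFLE=1 -DTRA_WPT=2 ".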

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
}
},
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},

View file

@ -25,8 +25,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
}
},
{ // Intel GPUs
@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
@ -84,13 +84,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
}
},
{ // Default
@ -114,7 +114,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},

View file

@ -34,20 +34,14 @@ class Routine {
Program program;
std::string device_name;
Precision precision;
std::vector<std::string> routines;
std::string routine_name_;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
const std::vector<std::string> &ref_routines) {
auto ref_size = ref_routines.size();
if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
auto found_match = true;
for (auto i=size_t{0}; i<ref_size; ++i) {
if (routines[i] != ref_routines[i]) { found_match = false; }
}
return found_match;
}
return false;
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
@ -58,11 +52,11 @@ class Routine {
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor
explicit Routine(CommandQueue &queue, Event &event,
explicit Routine(CommandQueue &queue, Event &event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
StatusCode SetUp(const std::string &routine_source);
StatusCode SetUp();
protected:
@ -84,15 +78,18 @@ class Routine {
StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
const size_t inc, const size_t data_size);
// Copies/transposes a matrix and pads/unpads it
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments.
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program);
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false);
// Queries the cache and retrieves either a matching program or a boolean indicating whether a
// match exists. The first assumes that the program is available in the cache and will throw an exception
@ -104,6 +101,10 @@ class Routine {
// a derived class.
const Precision precision_;
// The routine's name and its kernel-source in string form
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
CommandQueue queue_;
Event event_;
@ -118,7 +119,6 @@ class Routine {
// Connection to the database for all the device-specific parameters
const Database db_;
const std::vector<std::string> routines_;
};
// =================================================================================================
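
The net effect of the interface change above is that the program cache is now keyed on a single routine name, the device name, and the precision, rather than on a whole vector of routine names. A stand-alone sketch of that matching logic (the type names here are illustrative, not the library's own):

// Illustrative analogue of the simplified cache matching described above.
#include <string>
#include <vector>

enum class Precision { kSingle = 32, kDouble = 64 };

struct CacheEntry {
  std::string device_name;
  Precision precision;
  std::string routine_name;
  // ... the compiled program would be stored alongside these keys ...
};

bool ProgramInCache(const std::vector<CacheEntry> &cache, const std::string &device,
                    const Precision precision, const std::string &routine) {
  for (const auto &entry : cache) {
    if (entry.device_name == device && entry.precision == precision &&
        entry.routine_name == routine) { return true; }
  }
  return false;
}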

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
// routine (Xgemm). The implementation is very similar to the Xsymm routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHEMM_H_
#define CLBLAST_ROUTINES_XHEMM_H_
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xhemm: public Xgemm<T> {
public:
// Uses several variables from the Routine class
using Routine::db_;
using Routine::context_;
// Uses several helper functions from the Routine class
using Routine::RunKernel;
using Routine::ErrorIn;
using Routine::TestMatrixA;
using Routine::GetProgramFromCache;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
// Constructor
Xhemm(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHEMM_H_
#endif

View file

@ -0,0 +1,48 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2k routine. The precision is implemented using the template argument
// 'T', whereas the real-valued beta argument is of type 'U'. The implementation is very similar to
// the Xsyr2k routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER2K_H_
#define CLBLAST_ROUTINES_XHER2K_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class Xher2k: public Routine {
public:
Xher2k(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER2K_H_
#endif

View file

@ -0,0 +1,47 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xherk routine. The precision is implemented using the template argument
// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
// Xsyrk routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHERK_H_
#define CLBLAST_ROUTINES_XHERK_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class Xherk: public Routine {
public:
Xherk(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHERK_H_
#endif

View file

@ -17,7 +17,7 @@
#ifndef CLBLAST_ROUTINES_XSYMM_H_
#define CLBLAST_ROUTINES_XSYMM_H_
#include "internal/routines/xgemm.h"
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================

View file

@ -0,0 +1,48 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
#define CLBLAST_ROUTINES_XSYR2K_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyr2k: public Routine {
public:
Xsyr2k(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYR2K_H_
#endif
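
One way to realize the two-call scheme described in the header comment, with alpha and beta written out explicitly, is:

$$C \leftarrow \alpha A B^{T} + \beta C, \qquad C \leftarrow \alpha B A^{T} + C, \qquad \text{so that overall } C = \alpha\,(A B^{T} + B A^{T}) + \beta C.$$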

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
// performance reasons, as the actual masking is done later (see the first point).
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYRK_H_
#define CLBLAST_ROUTINES_XSYRK_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyrk: public Routine {
public:
Xsyrk(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYRK_H_
#endif
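
As a hypothetical host-side analogue of point 1 above (the actual device kernel is not shown in this diff), assuming column-major storage:

// Illustrative only: copy the padded result back into C, writing a single triangle.
#include <cstddef>
#include <vector>

void UnpadTriangular(const std::vector<float> &c_padded, const std::size_t n_padded,
                     std::vector<float> &c, const std::size_t c_ld, const std::size_t n,
                     const bool upper) {
  for (std::size_t j = 0; j < n; ++j) {      // column index
    for (std::size_t i = 0; i < n; ++i) {    // row index
      const bool in_triangle = upper ? (i <= j) : (i >= j);
      if (in_triangle) { c[j*c_ld + i] = c_padded[j*n_padded + i]; }
    }
  }
}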

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrmm routine. The implementation is based on first transforming the
// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
// routine. Therefore, this class inherits from the Xgemm class.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRMM_H_
#define CLBLAST_ROUTINES_XTRMM_H_
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrmm: public Xgemm<T> {
public:
// Uses several variables from the Routine class
using Routine::db_;
using Routine::context_;
// Uses several helper functions from the Routine class
using Routine::RunKernel;
using Routine::ErrorIn;
using Routine::TestMatrixA;
using Routine::GetProgramFromCache;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
// Constructor
Xtrmm(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRMM_H_
#endif
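
For reference, the operation and the transformation described in the header comment can be summarized as (Side::kLeft case):

$$B \leftarrow \alpha\,\mathrm{op}(A)\,B, \qquad \mathrm{op}(A) \in \{A,\; A^{T},\; A^{H}\},$$

where, before the GEMM call, the non-referenced triangle of A is taken as zero and, for Diagonal::kUnit, its diagonal as one.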

View file

@ -46,6 +46,7 @@ constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
constexpr auto kArgSide = "side";
constexpr auto kArgTriangle = "triangle";
constexpr auto kArgDiagonal = "diagonal";
constexpr auto kArgXInc = "incx";
constexpr auto kArgYInc = "incy";
constexpr auto kArgXOffset = "offx";
@ -93,6 +94,7 @@ struct Arguments {
Transpose b_transpose = Transpose::kNo;
Side side = Side::kLeft;
Triangle triangle = Triangle::kUpper;
Diagonal diagonal = Diagonal::kUnit;
size_t x_inc = 1;
size_t y_inc = 1;
size_t x_offset = 0;
@ -105,6 +107,11 @@ struct Arguments {
size_t c_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
size_t x_size = 1;
size_t y_size = 1;
size_t a_size = 1;
size_t b_size = 1;
size_t c_size = 1;
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
@ -123,6 +130,15 @@ struct Arguments {
bool no_abbrv = false;
};
// Structure containing all possible buffers for test clients
struct Buffers {
Buffer x_vec;
Buffer y_vec;
Buffer a_mat;
Buffer b_mat;
Buffer c_mat;
};
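
A minimal sketch of how a test client might fill in this structure from previously created cl_mem objects (the helper name is illustrative and assumes the Buffer(cl_mem) wrapper constructor used elsewhere in this commit):

// Illustrative only: wrap raw OpenCL buffers for use by a test client.
Buffers MakeBuffers(cl_mem x_mem, cl_mem y_mem, cl_mem a_mem, cl_mem b_mem, cl_mem c_mem) {
  return Buffers{Buffer(x_mem), Buffer(y_mem), Buffer(a_mem), Buffer(b_mem), Buffer(c_mem)};
}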
// =================================================================================================
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast

View file

@ -18,14 +18,20 @@
#include "clblast.h"
// BLAS level-1 includes
#include "internal/routines/xaxpy.h"
#include "internal/routines/level1/xaxpy.h"
// BLAS level-2 includes
#include "internal/routines/xgemv.h"
#include "internal/routines/level2/xgemv.h"
// BLAS level-3 includes
#include "internal/routines/xgemm.h"
#include "internal/routines/xsymm.h"
#include "internal/routines/level3/xgemm.h"
#include "internal/routines/level3/xsymm.h"
#include "internal/routines/level3/xhemm.h"
#include "internal/routines/level3/xsyrk.h"
#include "internal/routines/level3/xherk.h"
#include "internal/routines/level3/xsyr2k.h"
#include "internal/routines/level3/xher2k.h"
#include "internal/routines/level3/xtrmm.h"
namespace clblast {
// =================================================================================================
@ -41,10 +47,8 @@ StatusCode Axpy(const size_t n, const T alpha,
auto event_cpp = Event(*event);
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string kernel_source =
#include "kernels/xaxpy.opencl"
auto status = routine.SetUp(kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
@ -74,7 +78,7 @@ template StatusCode Axpy<double2>(const size_t, const double2,
// GEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
@ -85,14 +89,12 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
auto event_cpp = Event(*event);
auto routine = Xgemv<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string kernel_source =
#include "kernels/xgemv.opencl"
auto status = routine.SetUp(kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoGemv(layout, transpose_a, m, n, alpha,
return routine.DoGemv(layout, a_transpose, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(x_buffer), x_offset, x_inc, beta,
Buffer(y_buffer), y_offset, y_inc);
@ -127,7 +129,7 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
// GEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
@ -137,23 +139,12 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
auto event_cpp = Event(*event);
auto routine = Xgemm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
@ -197,19 +188,8 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
auto event_cpp = Event(*event);
auto routine = Xsymm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
@ -244,4 +224,302 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
cl_command_queue*, cl_event*);
// =================================================================================================
// HEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhemm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHemm(layout, side, triangle, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Hemm<double2>(const Layout, const Side, const Triangle,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// HERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Herk<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// HER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Her2k<double2,double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// TRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrmm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld);
}
template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<double>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<float2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<double2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// TRSM
/*
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrsm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld);
}
template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<double>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<float2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<double2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
*/
// =================================================================================================
} // namespace clblast

View file

@ -39,6 +39,7 @@ R"(
typedef float8 real8;
typedef float16 real16;
#define ZERO 0.0f
#define ONE 1.0f
// Double-precision
#elif PRECISION == 64
@ -48,6 +49,7 @@ R"(
typedef double8 real8;
typedef double16 real16;
#define ZERO 0.0
#define ONE 1.0
// Complex single-precision
#elif PRECISION == 3232
@ -61,6 +63,7 @@ R"(
real s8; real s9; real sA; real sB;
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0f
#define ONE 1.0f
// Complex Double-precision
#elif PRECISION == 6464
@ -74,12 +77,16 @@ R"(
real s8; real s9; real sA; real sB;
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0
#define ONE 1.0
#endif
// =================================================================================================
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
#define USE_CL_MAD 0
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
// devices, this is enabled (see src/routine.cc).
#ifndef USE_CL_MAD
#define USE_CL_MAD 0
#endif
// Sets a variable to zero
#if PRECISION == 3232 || PRECISION == 6464
@ -88,6 +95,20 @@ R"(
#define SetToZero(a) a = ZERO
#endif
// Sets a variable to zero (only the imaginary part)
#if PRECISION == 3232 || PRECISION == 6464
#define ImagToZero(a) a.y = ZERO
#else
#define ImagToZero(a)
#endif
// Sets a variable to one
#if PRECISION == 3232 || PRECISION == 6464
#define SetToOne(a) a.x = ONE; a.y = ZERO
#else
#define SetToOne(a) a = ONE
#endif
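// Worked example of the helpers above: given 'real v;', the following expansions apply.
//   SetToOne(v);    // real precisions:    v = ONE;
//                   // complex precisions: v.x = ONE; v.y = ZERO;
//   ImagToZero(v);  // real precisions:    (no-op)
//                   // complex precisions: v.y = ZERO;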
// Multiply two complex variables (used in the define below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@ -122,6 +143,6 @@ R"(
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -68,6 +68,6 @@ __kernel void CopyMatrix(const int ld,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -86,7 +86,9 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -95,11 +97,20 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_two && id_one < dest_one) {
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_two >= id_one); }
else if (lower == 1) { condition = (id_two <= id_one); }
if (condition) {
// Copies the value into the destination matrix. This is always within bounds of the source
// matrix, as we know that the destination matrix is smaller than the source.
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
if (id_two < dest_two && id_one < dest_one) {
real value = src[id_two*src_ld + id_one + src_offset];
if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
dest[id_two*dest_ld + id_one + dest_offset] = value;
}
}
}
}
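// Worked example of the masking above for a 3x3 destination with 'upper == 1': only the elements
// with id_two >= id_one are written, i.e. the (id_one, id_two) pairs (0,0), (0,1), (0,2), (1,1),
// (1,2) and (2,2); the remaining part of the destination is left untouched. When
// 'diagonal_imag_zero == 1', the imaginary part of the diagonal elements (0,0), (1,1) and (2,2)
// is additionally forced to zero (used, for example, when the output of a HERK-type routine must
// have a real diagonal).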
@ -127,15 +138,15 @@ __kernel void SymmLowerToSquared(const int src_dim,
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-symmetric matrix
real value;
SetToZero(value);
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
else { value = src[id_one*src_ld + id_two + src_offset]; }
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
else { result = src[id_one*src_ld + id_two + src_offset]; }
}
// Stores the value in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = value;
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
@ -160,15 +171,171 @@ __kernel void SymmUpperToSquared(const int src_dim,
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-symmetric matrix
real value;
SetToZero(value);
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
else { value = src[id_one*src_ld + id_two + src_offset]; }
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
else { result = src[id_one*src_ld + id_two + src_offset]; }
}
// Stores the value in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = value;
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
// =================================================================================================
#if PRECISION == 3232 || PRECISION == 6464
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-hermitian matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) {
result = src[id_two*src_ld + id_one + src_offset];
if (id_one == id_two) { result.y = ZERO; }
}
else {
result = src[id_one*src_ld + id_two + src_offset];
COMPLEX_CONJUGATE(result);
}
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
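// Worked 2x2 example of the kernel above: if the source's lower triangle holds the complex
// values d0 and d1 on the diagonal and b at (id_one, id_two) == (1, 0), then the destination
// receives d0 and d1 with their imaginary parts zeroed on the diagonal, b at (1, 0), and the
// complex conjugate of b at (0, 1). The upper triangle is thus reconstructed by conjugating the
// mirrored lower-triangle element.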
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-hermitian matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) {
result = src[id_two*src_ld + id_one + src_offset];
if (id_one == id_two) { result.y = ZERO; }
}
else {
result = src[id_one*src_ld + id_two + src_offset];
COMPLEX_CONJUGATE(result);
}
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
#endif
// =================================================================================================
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TrmmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-triangular matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
if (id_two == id_one && unit_diagonal) { SetToOne(result); }
// Else: result is zero
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
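// Worked example of the kernel above: for a 3x3 lower-triangular source, the destination keeps
// the lower triangle, zeros the strictly-upper part, and, when 'unit_diagonal == 1' (the
// unit-diagonal case of TRMM), overwrites the diagonal with ONE regardless of what is stored
// there, so the source's diagonal entries need not be initialised.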
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TrmmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-triangular matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
if (id_one == id_two && unit_diagonal) { SetToOne(result); }
// Else: result is zero
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
@ -177,6 +344,6 @@ __kernel void SymmUpperToSquared(const int src_dim,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -100,7 +100,9 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +139,18 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
if (condition) {
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
}
}
}
}
@ -149,6 +159,6 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -20,13 +20,16 @@ R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRA_DIM
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
#endif
#ifndef TRA_WPT
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
#endif
#ifndef TRA_PAD
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#endif
#ifndef TRA_SHUFFLE
#define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts
#endif
// =================================================================================================
@ -53,116 +56,94 @@ __kernel void TransposeMatrix(const int ld,
__global const realT* restrict src,
__global realT* dest) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.
const int gid0 = get_group_id(0);
#if TRA_SHUFFLE == 1
const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
#else
const int gid1 = get_group_id(1);
#endif
// Loop over the work per thread
// Local memory to store a tile of the matrix (for coalescing)
__local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
// Loops over the work per thread
#pragma unroll
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
// Computes the identifiers for the source matrix. Note that the local and global dimensions
// do not correspond to each other!
const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
const int id_one = gid1 * TRA_DIM + get_local_id(0);
const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
// Loads data into the local memory
realT value = src[id_two*(ld/TRA_WPT) + id_one];
#if TRA_WPT == 1
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
#elif TRA_WPT == 2
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
#elif TRA_WPT == 4
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
#elif TRA_WPT == 8
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
#elif TRA_WPT == 16
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
#endif
tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over the work per thread
// Loads transposed data from the local memory
realT v[TRA_WPT];
#pragma unroll
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
}
// Performs the register-level transpose of the vectorized data
realT results[TRA_WPT];
#if TRA_WPT == 1
results[0] = v[0];
#elif TRA_WPT == 2
results[0] = (realT) (v[0].x, v[1].x);
results[1] = (realT) (v[0].y, v[1].y);
#elif TRA_WPT == 4
results[0] = (realT) (v[0].x, v[1].x, v[2].x, v[3].x);
results[1] = (realT) (v[0].y, v[1].y, v[2].y, v[3].y);
results[2] = (realT) (v[0].z, v[1].z, v[2].z, v[3].z);
results[3] = (realT) (v[0].w, v[1].w, v[2].w, v[3].w);
#elif TRA_WPT == 8
results[0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0);
results[1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1);
results[2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2);
results[3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3);
results[4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4);
results[5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5);
results[6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6);
results[7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7);
#elif TRA_WPT == 16
results[ 0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0);
results[ 1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1);
results[ 2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2);
results[ 3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3);
results[ 4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4);
results[ 5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5);
results[ 6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6);
results[ 7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7);
results[ 8] = (realT) (v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8);
results[ 9] = (realT) (v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9);
results[10] = (realT) (v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA);
results[11] = (realT) (v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB);
results[12] = (realT) (v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC);
results[13] = (realT) (v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD);
results[14] = (realT) (v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE);
results[15] = (realT) (v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF);
#endif
// Stores the results into the destination matrix
#pragma unroll
for (int w_two=0; w_two<TRA_WPT; ++w_two) {
// Computes the identifiers for the destination matrix
const int id_one = get_global_id(0);
const int id_two = get_global_id(1)*TRA_WPT + w_two;
// Stores the transposed value in the destination matrix
realT value;
#if TRA_WPT == 1
value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
#elif TRA_WPT == 2
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
#elif TRA_WPT == 4
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
#elif TRA_WPT == 8
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
#elif TRA_WPT == 16
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
#endif
dest[id_two*(ld/TRA_WPT) + id_one] = value;
const int id_one = gid0*TRA_DIM + get_local_id(0);
const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
}
}
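// Worked example of the register-level transpose above for TRA_WPT == 2: if a thread loads
// v[0] = (a, b) and v[1] = (c, d) from the local tile, the x components of the two inputs form
// results[0] = (a, c) and the y components form results[1] = (b, d). Combined with the
// local-memory tile, this yields a full transpose while both the loads and the stores remain
// vectorised and coalesced.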
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -123,6 +123,6 @@ __kernel void XaxpyFast(const int n, const real alpha,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -127,6 +127,55 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
// =================================================================================================
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// The vectorised multiply-add function
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
// =================================================================================================
// Main entry of the kernel. This function contains the basic skeleton; the functionality is
// provided by the inlined functions above
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
// Combined thread identifier
// The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM]
#if SA == 1 && SB == 1
, __local realM* alm, __local realN* blm
#elif SA == 1
, __local realM* alm
#elif SB == 1
, __local realN* blm
#endif
) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
// Combined thread identifier (volatile to disable caching)
#if SA == 1 || SB == 1
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
#endif
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
realM cpm[NWI][MWI/VWM];
// Initializes the accumulation registers
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
InitAccRegisters(cpm);
// Loops over all workgroup tiles
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
#if SB == 1
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
#endif
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
@ -552,20 +554,130 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
MultiplyAccumulate(cpm, apm, bpm);
}
}
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// End of the C++11 raw string literal
)";
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip this workgroup if it does not contain threads contributing to the upper-triangle
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip this workgroup if it does not contain threads contributing to the lower-triangle
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
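// Worked example of the workgroup-level masking above, assuming square tiles with
// MWG == NWG == 64 (illustrative values): workgroup (get_group_id(0), get_group_id(1)) == (2, 1)
// covers a tile starting at row 128 and column 64, which lies entirely below the diagonal, so
// XgemmUpper returns immediately (1*64 < 2*64) while XgemmLower computes it. Diagonal workgroups
// such as (1, 1) are computed by either kernel; the exact per-element triangle is masked later
// by the unpad kernels' 'upper'/'lower' arguments (see pad.opencl above).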
// =================================================================================================
// If not using a triangular version, include the regular kernel
#else
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -368,6 +368,6 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -22,9 +22,10 @@ namespace clblast {
std::vector<Routine::ProgramCache> Routine::program_cache_;
// Constructor: not much here, because no status codes can be returned
Routine::Routine(CommandQueue &queue, Event &event,
Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
queue_(queue),
event_(event),
context_(queue_.GetContext()),
@ -33,14 +34,13 @@ Routine::Routine(CommandQueue &queue, Event &event,
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
max_work_item_sizes_(device_.MaxWorkItemSizes()),
max_work_group_size_(device_.MaxWorkGroupSize()),
db_(queue_, routines, precision_),
routines_(routines) {
db_(queue_, routines, precision_) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp(const std::string &routine_source) {
StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
// be built and added to the cache.
@ -63,12 +63,24 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = defines + common_header + routine_source;
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
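// As an illustration (values are hypothetical), for a single-precision GEMM routine on an AMD
// device the assembled 'source_string' would start roughly with the tuning parameters from the
// database (e.g. "#define MWG 64"), followed by
//   #define PRECISION 32
//   #define ROUTINE_GEMM
//   #define USE_CL_MAD 1
// and then the contents of common.opencl and the routine's own kernel files.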
// Compiles the kernel
try {
@ -85,7 +97,7 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
// Store the compiled program in the cache
program_cache_.push_back({program, device_name_, precision_, routines_});
program_cache_.push_back({program, device_name_, precision_, routine_name_});
} catch (...) { return StatusCode::kBuildProgramFailure; }
}
@ -202,19 +214,22 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
// =================================================================================================
// Copies a matrix and pads it with zeros
// Copies or transposes a matrix and pads/unpads it with zeros
StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program) {
const bool upper, const bool lower,
const bool diagonal_imag_zero) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
(upper == false) && (lower == false) && (diagonal_imag_zero == false);
// Determines the right kernel
auto kernel_name = std::string{};
@ -227,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
}
else {
use_fast_kernel = false;
kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
}
}
else {
@ -239,7 +254,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
}
else {
use_fast_kernel = false;
kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
}
}
@ -264,9 +279,14 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
if (pad) {
if (do_pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(10, static_cast<int>(upper));
kernel.SetArgument(11, static_cast<int>(lower));
kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
@ -310,7 +330,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
// otherwise.
const Program& Routine::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
return cached_program.program;
}
}
@ -320,7 +340,7 @@ const Program& Routine::GetProgramFromCache() const {
// Queries the cache to see whether or not the compiled kernel is already there
bool Routine::ProgramIsInCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
}
return false;
}

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xaxpy.h"
#include "internal/routines/level1/xaxpy.h"
#include <string>
#include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
Routine(queue, event, {"Xaxpy"}, precision_) {
Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/xaxpy.opencl"
;
}
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xgemv.h"
#include "internal/routines/level2/xgemv.h"
#include <string>
#include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
Routine(queue, event, {"Xgemv"}, precision_) {
Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/xgemv.opencl"
;
}
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xgemm.h"
#include "internal/routines/level3/xgemm.h"
#include <string>
#include <vector>
@ -30,7 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
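// Note on the include pattern above: each .opencl file starts with 'R"(' and now ends with ')"'
// without a semicolon, so consecutive #include directives expand to adjacent raw string
// literals, which the compiler concatenates into a single string. A minimal sketch of the
// mechanism, using hypothetical file names:
//
//   // part1.opencl contains:  R"( __kernel void KernelA() {} )"
//   // part2.opencl contains:  R"( __kernel void KernelB() {} )"
//   std::string source =
//   #include "part1.opencl"
//   #include "part2.opencl"
//   ;  // 'source' now holds the text of both kernels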
// =================================================================================================
@ -95,31 +102,48 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Allocates space on the device for padded and/or transposed input and output matrices.
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
// Loads the program from the database
auto& program = GetProgramFromCache();
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
a_do_transpose, a_conjugate, true, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
b_do_transpose, b_conjugate, true, program);
if (ErrorIn(status)) { return status; }
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
b_do_transpose == false && b_conjugate == false;
auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
c_do_transpose == false;
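// Example of the bypass above (illustrative case): for a column-major, non-transposed GEMM in
// which m, n and k are already multiples of MWG, NWG and KWG, the offsets are zero, and the
// leading dimensions equal the ceiled sizes, all three *_no_temp flags are true, so no temporary
// buffers are created and the pre/post-processing kernels are skipped entirely.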
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
program, true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
}
// As above, but now for matrix B
if (!b_no_temp) {
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_do_transpose, false, true, program);
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
program, true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
@@ -133,9 +157,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, alpha);
kernel.SetArgument(4, beta);
kernel.SetArgument(5, temp_a());
kernel.SetArgument(6, temp_b());
kernel.SetArgument(7, temp_c());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
@@ -148,11 +172,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
c_do_transpose, false, false, program);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
program, false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
// Successfully finished the computation
return StatusCode::kSuccess;
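
A minimal stand-alone sketch of how the ceiled dimensions and the new a_no_temp test above fit together. The Ceil/CeilDiv helpers are re-implemented here with the round-up semantics the routine appears to rely on, and the MWG/KWG values are invented for the example; the real values come from the per-device tuning database.

#include <cstddef>
#include <cstdio>

// Assumed semantics of the CLBlast helpers: round x up to the next multiple of y.
static std::size_t CeilDiv(std::size_t x, std::size_t y) { return (x + y - 1) / y; }
static std::size_t Ceil(std::size_t x, std::size_t y) { return CeilDiv(x, y) * y; }

int main() {
  const std::size_t MWG = 64, KWG = 16;  // illustrative tuning parameters

  // A 100x30 column-major matrix A with ld == 100, no offset, no transpose, no conjugation.
  const std::size_t a_one = 100, a_two = 30, a_ld = 100, a_offset = 0;
  const bool a_do_transpose = false, a_conjugate = false;

  const std::size_t m_ceiled = Ceil(a_one, MWG);  // 128: rounded up to a multiple of MWG
  const std::size_t k_ceiled = Ceil(a_two, KWG);  //  32: rounded up to a multiple of KWG

  // Mirrors the a_no_temp test in DoGemm: the pre-processing kernel and its temporary
  // buffer can be skipped only if A already has the padded shape and layout.
  const bool a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled &&
                         a_offset == 0 && !a_do_transpose && !a_conjugate;

  std::printf("m_ceiled=%zu k_ceiled=%zu a_no_temp=%d\n", m_ceiled, k_ceiled, a_no_temp);
  return 0;  // prints: m_ceiled=128 k_ceiled=32 a_no_temp=0
}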

View file

@@ -0,0 +1,130 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemm class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xhemm.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
Xgemm<T>(queue, event) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
// Temporary buffer for a copy of the hermitian matrix
try {
auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_herm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// hermitian-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_herm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_herm, 0, k,
beta,
c_buffer, c_offset, c_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xhemm<float2>;
template class Xhemm<double2>;
// =================================================================================================
} // namespace clblast
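
A rough host-side illustration of what the HermLowerToSquared step is meant to produce before DoGemm runs: the stored lower triangle of a k x k column-major Hermitian matrix is expanded into a full general matrix by conjugate mirroring. The real kernel operates on OpenCL buffers and handles leading dimensions and offsets; treating the diagonal as real is the usual HEMM convention and an assumption here.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

// Expands the lower triangle of a k x k column-major Hermitian matrix into a full matrix:
// upper elements become the conjugates of their mirrored lower elements.
std::vector<std::complex<float>> HermLowerToSquared(const std::vector<std::complex<float>> &a,
                                                    std::size_t k) {
  std::vector<std::complex<float>> full(k * k);
  for (std::size_t j = 0; j < k; ++j) {
    for (std::size_t i = 0; i < k; ++i) {
      if (i > j)        full[i + j * k] = a[i + j * k];                 // stored lower part
      else if (i == j)  full[i + j * k] = {a[i + j * k].real(), 0.0f};  // diagonal assumed real
      else              full[i + j * k] = std::conj(a[j + i * k]);      // mirrored upper part
    }
  }
  return full;
}

int main() {
  const std::size_t k = 2;
  // Column-major 2x2 input; only the lower triangle (and the diagonal's real part) matters.
  std::vector<std::complex<float>> a = {{1.0f, 9.0f}, {2.0f, 3.0f},   // column 0
                                        {0.0f, 0.0f}, {4.0f, 0.0f}};  // column 1
  auto full = HermLowerToSquared(a, k);
  for (std::size_t i = 0; i < k; ++i)
    std::printf("(%g,%g) (%g,%g)\n", full[i].real(), full[i].imag(),
                full[i + k].real(), full[i + k].imag());
  return 0;  // prints: (1,0) (2,-3)  then  (2,3) (4,0)
}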

View file

@@ -0,0 +1,207 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2k class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xher2k.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
auto ab_conjugate = (ab_transpose != Transpose::kNo);
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
(layout == Layout::kRowMajor && !ab_conjugate);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A and B matrices taking the layout into account
auto ab_one = (ab_rotated) ? k : n;
auto ab_two = (ab_rotated) ? n : k;
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
// Creates the temporary matrices
auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads them with
// zeros to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
program, true, ab_rotated, ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!a2_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
program, true, ab_rotated, !ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b1_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
program, true, ab_rotated, ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b2_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
program, true, ab_rotated, !ab_conjugate);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
kernel.SetArgument(2, conjugate_alpha);
kernel.SetArgument(3, complex_one);
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
// Runs the kernel again
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xher2k<float2,float>;
template class Xher2k<double2,double>;
// =================================================================================================
} // namespace clblast
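
Since the routine above realises HER2K as two launches of the Xgemm kernel (the second with the A/B operands swapped, alpha conjugated and beta fixed to one), a scalar reference of the intended end result can help when reading it: C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C on the stored triangle only, with a real diagonal. This is plain textbook HER2K semantics for the no-transpose, column-major, lower-triangle case, not CLBlast code.

#include <complex>
#include <cstddef>
#include <vector>

// Scalar reference for CHER2K/ZHER2K, no-transpose case, column-major, lower triangle stored.
// C (n x n) := alpha*A*B^H + conj(alpha)*B*A^H + beta*C, with A and B of size n x k.
void Her2kLowerReference(std::size_t n, std::size_t k,
                         std::complex<double> alpha, double beta,
                         const std::vector<std::complex<double>> &a,
                         const std::vector<std::complex<double>> &b,
                         std::vector<std::complex<double>> &c) {
  for (std::size_t j = 0; j < n; ++j) {
    for (std::size_t i = j; i < n; ++i) {  // only the lower triangle of C is touched
      std::complex<double> sum = 0.0;
      for (std::size_t l = 0; l < k; ++l) {
        sum += alpha * a[i + l * n] * std::conj(b[j + l * n]) +
               std::conj(alpha) * b[i + l * n] * std::conj(a[j + l * n]);
      }
      c[i + j * n] = beta * c[i + j * n] + sum;
      if (i == j) c[i + j * n].imag(0.0);  // the diagonal of a Hermitian result is real
    }
  }
}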

View file

@@ -0,0 +1,175 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xherk class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xherk.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
auto a_conjugate = (a_transpose != Transpose::kNo);
auto b_conjugate = (a_transpose == Transpose::kNo);
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
(layout == Layout::kRowMajor && !a_conjugate);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A matrix taking the layout into account
auto a_one = (a_rotated) ? k : n;
auto a_two = (a_rotated) ? n : k;
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && b_conjugate == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, a_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, a_rotated, b_conjugate);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, complex_alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xherk<float2,float>;
template class Xherk<double2,double>;
// =================================================================================================
} // namespace clblast
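
One detail worth spelling out: judging from how the temporaries are used here, the XgemmUpper/XgemmLower kernel multiplies two independent n x k operands as M1*M2^T, so HERK obtains A*A^H by feeding the padded A as the first operand and a conjugated copy of A as the second (hence the a_temp/b_temp pair above). A small numerical check of that identity, with a hypothetical GemmNT helper standing in for the kernel; the kernel contract is an inference from this file, not a documented API.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

using cd = std::complex<double>;

// Hypothetical stand-in for the kernel's assumed contract: C (n x n) := M1 * M2^T,
// with M1 and M2 stored column-major as n x k.
std::vector<cd> GemmNT(std::size_t n, std::size_t k,
                       const std::vector<cd> &m1, const std::vector<cd> &m2) {
  std::vector<cd> c(n * n, 0.0);
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i)
      for (std::size_t l = 0; l < k; ++l)
        c[i + j * n] += m1[i + l * n] * m2[j + l * n];
  return c;
}

int main() {
  const std::size_t n = 2, k = 2;
  std::vector<cd> a = {{1, 2}, {3, -1}, {0, 1}, {2, 0}};  // 2x2, column-major

  // A conjugated copy of A: this is what the second pre-processing pass (b_temp) produces.
  std::vector<cd> a_conj(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) a_conj[i] = std::conj(a[i]);

  // A * conj(A)^T equals A * A^H, which is exactly the HERK product.
  auto c = GemmNT(n, k, a, a_conj);
  for (std::size_t i = 0; i < n; ++i)
    std::printf("(%g,%g) (%g,%g)\n", c[i].real(), c[i].imag(),
                c[i + n].real(), c[i + n].imag());
  return 0;  // prints: (6,0) (1,9)  then  (1,-9) (14,0)
}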

View file

@@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xsymm.h"
#include "internal/routines/level3/xsymm.h"
#include <string>
#include <vector>
@@ -42,14 +42,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
size_t k = (side == Side::kLeft) ? m : n;
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
@@ -75,7 +75,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
kernel.SetArgument(7, temp_symm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// symmetry-to-squared kernel uses the same parameters.
// symmetric-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
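
A quick worked example of the thread configuration reused above (and by the new hermitian- and triangular-to-squared kernels): the element count is divided by the work-per-thread and then rounded up to a whole number of work-groups. The helper semantics and the PAD_* values below are assumptions for illustration; the real values come from the tuning database.

#include <cstddef>
#include <cstdio>

static std::size_t CeilDiv(std::size_t x, std::size_t y) { return (x + y - 1) / y; }  // round-up division
static std::size_t Ceil(std::size_t x, std::size_t y) { return CeilDiv(x, y) * y; }   // round up to a multiple

int main() {
  const std::size_t k = 1000;                     // matrix dimension handled by the kernel
  const std::size_t PAD_WPTX = 2, PAD_DIMX = 16;  // illustrative tuning parameters
  const std::size_t PAD_WPTY = 1, PAD_DIMY = 8;

  // Same shape as the code above: each thread handles WPT elements, and the thread count is
  // rounded up so that it divides evenly into DIMX x DIMY work-groups.
  const std::size_t global_x = Ceil(CeilDiv(k, PAD_WPTX), PAD_DIMX);  // CeilDiv(1000,2)=500 -> 512
  const std::size_t global_y = Ceil(CeilDiv(k, PAD_WPTY), PAD_DIMY);  // CeilDiv(1000,1)=1000 -> 1000
  std::printf("global = {%zu, %zu}, local = {%zu, %zu}\n", global_x, global_y, PAD_DIMX, PAD_DIMY);
  return 0;
}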

View file

@@ -0,0 +1,186 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xsyr2k.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A and B matrices taking the layout into account
auto ab_one = (ab_rotated) ? k : n;
auto ab_two = (ab_rotated) ? n : k;
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false;
auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads them with
// zeros to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, one);
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
// Runs the kernel again
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyr2k<float>;
template class Xsyr2k<double>;
template class Xsyr2k<float2>;
template class Xsyr2k<double2>;
// =================================================================================================
} // namespace clblast
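
The SYR2K path above reuses the same Xgemm kernel twice: a first launch for alpha*A*B^T + beta*C, then a second launch with the operands swapped and beta set to one, which adds alpha*B*A^T on top. A small host-side check of that equivalence, assuming the kernel's contract is C := alpha*M1*M2^T + beta*C on column-major n x k operands (an inference from this file, not a documented API):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical scalar stand-in for one launch of the XgemmUpper/XgemmLower kernel:
// C := alpha * M1 * M2^T + beta * C, all matrices column-major, M1/M2 of size n x k.
void GemmNT(std::size_t n, std::size_t k, double alpha, double beta,
            const std::vector<double> &m1, const std::vector<double> &m2,
            std::vector<double> &c) {
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i) {
      double sum = 0.0;
      for (std::size_t l = 0; l < k; ++l) sum += m1[i + l * n] * m2[j + l * n];
      c[i + j * n] = alpha * sum + beta * c[i + j * n];
    }
}

int main() {
  const std::size_t n = 2, k = 3;
  const double alpha = 2.0, beta = 0.5;
  const std::vector<double> a = {1, 2, 3, 4, 5, 6};  // 2x3, column-major
  const std::vector<double> b = {6, 5, 4, 3, 2, 1};
  const std::vector<double> c = {1, 1, 1, 1};        // 2x2

  // Two passes, exactly as in DoSyr2k: swap the operands and use beta = 1 the second time.
  std::vector<double> two_pass = c;
  GemmNT(n, k, alpha, beta, a, b, two_pass);  // alpha*A*B^T + beta*C
  GemmNT(n, k, alpha, 1.0, b, a, two_pass);   // ... + alpha*B*A^T

  // Direct SYR2K reference: C := alpha*(A*B^T + B*A^T) + beta*C.
  std::vector<double> reference = c;
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i) {
      double sum = 0.0;
      for (std::size_t l = 0; l < k; ++l)
        sum += a[i + l * n] * b[j + l * n] + b[i + l * n] * a[j + l * n];
      reference[i + j * n] = alpha * sum + beta * reference[i + j * n];
    }

  double max_diff = 0.0;
  for (std::size_t i = 0; i < n * n; ++i)
    max_diff = std::fmax(max_diff, std::fabs(two_pass[i] - reference[i]));
  std::printf("max difference: %g\n", max_diff);  // prints: max difference: 0
  return 0;
}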

View file

@@ -0,0 +1,163 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xsyrk.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A matrix taking the layout into account
auto a_one = (a_rotated) ? k : n;
auto a_two = (a_rotated) ? n : k;
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, false);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyrk<float>;
template class Xsyrk<double>;
template class Xsyrk<float2>;
template class Xsyrk<double2>;
// =================================================================================================
} // namespace clblast
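
The upper/lower flags handed to the post-processing call above exist because SYRK, like the other rank-k updates in this commit, may only touch the requested triangle of the user's C: the kernel works on a full padded copy, and only the stored triangle is written back. A scalar sketch of that write-back idea; the flag semantics are inferred from the call sites here, not taken from a documented interface.

#include <cstddef>
#include <vector>

// Copies only the requested triangle of an n_ceiled x n_ceiled temporary back into the user's
// n x n matrix C (column-major, leading dimension c_ld), leaving the other triangle untouched.
// Illustrative only; in CLBlast this happens inside an OpenCL kernel.
void WriteBackTriangle(std::size_t n, std::size_t n_ceiled, bool upper,
                       const std::vector<double> &c_temp,
                       std::vector<double> &c, std::size_t c_ld) {
  for (std::size_t j = 0; j < n; ++j) {
    for (std::size_t i = 0; i < n; ++i) {
      const bool in_triangle = upper ? (i <= j) : (i >= j);
      if (in_triangle) c[i + j * c_ld] = c_temp[i + j * n_ceiled];
    }
  }
}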

View file

@@ -0,0 +1,135 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrmm class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xtrmm.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
Xgemm<T>(queue, event) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
// Computes the k dimension. This is based on whether the triangular matrix appears as A (on the
// left) or as B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
// Determines whether or not the triangular matrix is unit-diagonal
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
// Temporary buffer for a copy of the triangular matrix
try {
auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
// Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
// routine afterwards
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the triangular-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_triangular());
kernel.SetArgument(8, static_cast<int>(unit_diagonal));
// Uses the common padding kernel's thread configuration. This is allowed, since the
// triangular-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the regular Xgemm code with either "B := alpha*A*B" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, a_transpose, Transpose::kNo,
m, n, k,
alpha,
temp_triangular, 0, k,
b_buffer, b_offset, b_ld,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
}
// ... with "B := alpha*B*A". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, a_transpose,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_triangular, 0, k,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xtrmm<float>;
template class Xtrmm<double>;
template class Xtrmm<float2>;
template class Xtrmm<double2>;
// =================================================================================================
} // namespace clblast
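
As with HEMM, TRMM first expands the stored triangle of A into a full general matrix so that the regular Xgemm path can be reused; the extra kernel argument above selects whether the diagonal is taken from A or forced to one. A host-side sketch of the intended effect of TrmmLowerToSquared (padding, offsets and leading dimensions omitted; the exact kernel behaviour is an assumption).

#include <cstddef>
#include <vector>

// Expands the lower triangle of a k x k column-major matrix into a full general matrix:
// elements outside the triangle become zero, and for a unit-diagonal matrix the diagonal
// is set to one regardless of what is stored there.
std::vector<double> TrmmLowerToSquared(const std::vector<double> &a, std::size_t k,
                                       bool unit_diagonal) {
  std::vector<double> full(k * k, 0.0);
  for (std::size_t j = 0; j < k; ++j) {
    for (std::size_t i = j; i < k; ++i) {
      full[i + j * k] = a[i + j * k];          // lower triangle, including the diagonal
    }
    if (unit_diagonal) full[j + j * k] = 1.0;  // Diagonal::kUnit: the stored diagonal is not referenced
  }
  return full;
}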

View file

@@ -30,11 +30,10 @@ void CopyTune(const Arguments<T> &args,
// This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
// of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
// chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/copy.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/copy.opencl"
;
auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
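
The tuner changes above (and the routine constructors earlier in this commit) rely on one trick: each .opencl file appears to wrap its body in a C++11 raw string literal, so a row of #include directives concatenates adjacent string literals into a single source string, terminated by the lone semicolon on the next line. A minimal reproduction of that pattern; the file contents and the R"( wrapping shown here are assumptions for illustration.

#include <cstdio>
#include <string>

// A hypothetical "my_kernel.opencl" would contain, verbatim:
//
//   R"(
//   __kernel void Dummy(__global float* x) { x[get_global_id(0)] += 1.0f; }
//   )"
//
// so that
//
//   std::string sources =
//   #include "common.opencl"
//   #include "my_kernel.opencl"
//   ;
//
// expands to one concatenated string literal. The same concatenation, spelled out inline:
int main() {
  std::string sources =
      R"(
      // contents of common.opencl would go here
      )"
      R"(
      __kernel void Dummy(__global float* x) { x[get_global_id(0)] += 1.0f; }
      )";
  std::printf("%zu characters of OpenCL source\n", sources.size());
  return 0;
}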

View file

@@ -30,11 +30,10 @@ void PadTune(const Arguments<T> &args,
// This points to the PadMatrix kernel as found in the CLBlast library. This is just one
// example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/pad.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/pad.opencl"
;
auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});

View file

@@ -30,11 +30,10 @@ void PadTransposeTune(const Arguments<T> &args,
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/padtranspose.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/padtranspose.opencl"
;
auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});

View file

@@ -30,11 +30,10 @@ void TransposeTune(const Arguments<T> &args,
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/transpose.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/transpose.opencl"
;
auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
@@ -42,6 +41,7 @@ void TransposeTune(const Arguments<T> &args,
tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
tuner.AddParameter(id, "TRA_PAD", {0, 1});
tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
// Tests for a specific precision
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});

View file

@@ -34,11 +34,10 @@ void XaxpyTune(const Arguments<T> &args,
}
// This points to the XaxpyFast kernel as found in the CLBlast library
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xaxpy.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xaxpy.opencl"
;
auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});

View file

@@ -30,11 +30,10 @@ void XgemmTune(const Arguments<T> &args,
cltune::Tuner &tuner) {
// This points to the Xgemm kernel as found in the CLBlast library and its golden reference
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xgemm.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xgemm.opencl"
;
auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});

View file

@@ -36,11 +36,10 @@ void XgemvTune(const Arguments<T> &args, const size_t variation,
auto a_rotated = (variation == 3) ? 1 : 0;
// This points to the Xgemv kernel as found in the CLBlast library
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xgemv.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xgemv.opencl"
;
auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});

View file

@@ -79,6 +79,13 @@ std::string ToString(Triangle value) {
}
}
template <>
std::string ToString(Diagonal value) {
switch(value) {
case Diagonal::kUnit: return ToString(static_cast<int>(value))+" (unit)";
case Diagonal::kNonUnit: return ToString(static_cast<int>(value))+" (non-unit)";
}
}
template <>
std::string ToString(Precision value) {
switch(value) {
case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
@@ -143,6 +150,7 @@ template Layout GetArgument<Layout>(const int, char **, std::string&, const std:
template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
template Diagonal GetArgument<Diagonal>(const int, char **, std::string&, const std::string&, const Diagonal);
template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
// =================================================================================================

View file

@@ -0,0 +1,81 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xaxpy routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level1/xaxpy.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kVectorDims) { args.n = n;
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
args.x_size = TestXaxpy<T>::GetSizeX(args);
args.y_size = TestXaxpy<T>::GetSizeY(args);
if (args.x_size<1 || args.y_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = tester.kBufferSize;
args.x_inc = args.y_inc = 1;
args.x_offset = args.y_offset = 0;
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = "default";
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SAXPY");
clblast::RunTest<double>(argc, argv, true, "DAXPY");
clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,99 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemv routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level2/xgemv.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixVectorDims) { args.m = m;
for (auto &n: tester.kMatrixVectorDims) { args.n = n;
for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXgemv<T>::GetSizeA(args);
args.x_size = TestXgemv<T>::GetSizeX(args);
args.y_size = TestXgemv<T>::GetSizeY(args);
if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = tester.kBufferSize;
args.x_inc = args.y_inc = 1;
args.a_offset = args.x_offset = args.y_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SGEMV");
clblast::RunTest<double>(argc, argv, true, "DGEMV");
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
return 0;
}
// =================================================================================================
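For reference, the operation these combinations exercise is GEMV: y := alpha*op(A)*x + beta*y. The tester's reference results come from clBLAS (see the removed lambda-based XgemvTest further down in this diff); purely as an illustration of the mathematics being verified, and not as code from this commit, the column-major, non-transposed case can be written out as follows (function name and signature are hypothetical):

#include <cstddef>
#include <vector>

// Illustrative sketch only: y := alpha*A*x + beta*y for a column-major,
// non-transposed m-by-n matrix A with leading dimension a_ld, honouring the same
// offset and increment parameters that the test loops above iterate over.
void reference_gemv_colmajor_notrans(std::size_t m, std::size_t n, float alpha,
                                     const std::vector<float> &a, std::size_t a_offset,
                                     std::size_t a_ld,
                                     const std::vector<float> &x, std::size_t x_offset,
                                     std::size_t x_inc, float beta,
                                     std::vector<float> &y, std::size_t y_offset,
                                     std::size_t y_inc) {
  for (std::size_t i = 0; i < m; ++i) {
    auto sum = 0.0f;
    for (std::size_t j = 0; j < n; ++j) {
      sum += a[a_offset + j*a_ld + i] * x[x_offset + j*x_inc];  // A(i,j) * x(j)
    }
    y[y_offset + i*y_inc] = alpha*sum + beta*y[y_offset + i*y_inc];
  }
}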

View file

@@ -0,0 +1,102 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXgemm<T>::GetSizeA(args);
args.b_size = TestXgemm<T>::GetSizeB(args);
args.c_size = TestXgemm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SGEMM");
clblast::RunTest<double>(argc, argv, true, "DGEMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
return 0;
}
// =================================================================================================
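The matrix-size helpers (TestXgemm<T>::GetSizeA/GetSizeB/GetSizeC) are likewise defined in headers outside this diff. The removed TestABC implementation near the bottom of this commit spells out the rule they presumably follow: a matrix counts as "rotated" when exactly one of row-major layout or transposition applies, and its buffer then spans second-dimension * leading-dimension + offset elements. A hedged sketch of that computation for GEMM's m-by-k input matrix A (the function below is purely illustrative):

#include <cstddef>

// Sketch of the assumed buffer-size rule for GEMM's m-by-k input matrix A,
// mirroring the a_rotated/a_two logic of the removed TestABC::TestRegular below.
// With the default of column-major storage and no transpose, A occupies k columns
// of a_ld elements each; if exactly one of "row-major" or "transposed" applies,
// the roles of m and k swap.
std::size_t assumed_gemm_a_buffer_size(bool row_major, bool a_transposed,
                                       std::size_t m, std::size_t k,
                                       std::size_t a_ld, std::size_t a_offset) {
  const bool a_rotated = (row_major != a_transposed);  // exactly one deviation
  const std::size_t a_two = a_rotated ? m : k;         // the non-leading dimension
  return a_two * a_ld + a_offset;
}

The B (k-by-n) and C (m-by-n) matrices follow the same pattern with their own dimensions, as the removed TestABC code shows.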

View file

@@ -0,0 +1,98 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xhemm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xhemm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXhemm<T>::GetSizeA(args);
args.b_size = TestXhemm<T>::GetSizeB(args);
args.c_size = TestXhemm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2>(argc, argv, true, "CHEMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
return 0;
}
// =================================================================================================
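HEMM is the Hermitian counterpart of SYMM, which is why the main() above only instantiates the complex precisions CHEMM and ZHEMM. The operation under test is the standard BLAS definition:

  side == Side::kLeft :  C := alpha*A*B + beta*C   (A an m-by-m Hermitian matrix)
  side == Side::kRight:  C := alpha*B*A + beta*C   (A an n-by-n Hermitian matrix)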

View file

@@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xher2k routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xher2k.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T, typename U>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<U>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
args.a_transpose = ab_transpose; // valid BLAS option
args.b_transpose = ab_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<U>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXher2k<T,U>::GetSizeA(args);
args.b_size = TestXher2k<T,U>::GetSizeB(args);
args.c_size = TestXher2k<T,U>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,92 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xherk routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xherk.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T, typename U>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<U>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
args.a_transpose = a_transpose; // valid BLAS option
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<U>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXherk<T,U>::GetSizeA(args);
args.c_size = TestXherk<T,U>::GetSizeC(args);
if (args.a_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
return 0;
}
// =================================================================================================
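The restricted transpose loops in the Xher2k and Xherk testers above (only Transpose::kNo and Transpose::kConjugate, as the inline comments note) follow directly from the BLAS definitions of the Hermitian rank-k updates, which only exist in conjugate-transpose form:

  xHERK :  C := alpha*A*A^H + beta*C                        (Transpose::kNo)
           C := alpha*A^H*A + beta*C                        (Transpose::kConjugate)
  xHER2K:  C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C    (Transpose::kNo)
           C := alpha*A^H*B + conj(alpha)*B^H*A + beta*C    (Transpose::kConjugate)

Beta (and, for HERK, alpha as well) is real-valued in these definitions, which is presumably why these testers are parameterized over both a complex data type T and a real scalar type U.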

View file

@@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsymm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsymm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsymm<T>::GetSizeA(args);
args.b_size = TestXsymm<T>::GetSizeB(args);
args.c_size = TestXsymm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYMM");
clblast::RunTest<double>(argc, argv, true, "DSYMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,102 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyr2k routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsyr2k.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
args.a_transpose = ab_transpose; // is not supported by clBLAS
args.b_transpose = ab_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsyr2k<T>::GetSizeA(args);
args.b_size = TestXsyr2k<T>::GetSizeB(args);
args.c_size = TestXsyr2k<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYR2K");
clblast::RunTest<double>(argc, argv, true, "DSYR2K");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,94 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyrk routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsyrk.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
args.a_transpose = a_transpose; // is not supported by clBLAS
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsyrk<T>::GetSizeA(args);
args.c_size = TestXsyrk<T>::GetSizeC(args);
if (args.a_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYRK");
clblast::RunTest<double>(argc, argv, true, "DSYRK");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
return 0;
}
// =================================================================================================
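Conversely, the Xsyr2k and Xsyrk testers above loop over Transpose::kNo and Transpose::kYes only. The inline comments attribute the missing conjugate case to clBLAS; it also matches the BLAS standard, where the complex symmetric rank-k updates are defined for the 'N' and 'T' forms only, the conjugate-transpose variants being covered by the HERK/HER2K family instead:

  xSYRK:  C := alpha*A*A^T + beta*C    (Transpose::kNo)
          C := alpha*A^T*A + beta*C    (Transpose::kYes)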

View file

@@ -0,0 +1,96 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xtrmm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xtrmm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
args.a_size = TestXtrmm<T>::GetSizeA(args);
args.b_size = TestXtrmm<T>::GetSizeB(args);
if (args.a_size<1 || args.b_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = tester.kBufferSize;
args.a_offset = args.b_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
ToString(a_transpose)+" "+ToString(diagonal);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "STRMM");
clblast::RunTest<double>(argc, argv, true, "DTRMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
return 0;
}
// =================================================================================================
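Unlike the other level-3 testers above, the Xtrmm loops carry no beta values and use only two buffers, because TRMM overwrites its B operand in place:

  side == Side::kLeft :  B := alpha*op(A)*B
  side == Side::kRight:  B := alpha*B*op(A)

Here op(A) is A, A^T or A^H as selected by a_transpose, and A is an upper or lower triangular matrix with a unit or non-unit diagonal, hence the extra loop over kDiagonals.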

View file

@@ -1,75 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testxy.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Axpy(args.n, args.alpha,
x_vec(), args.x_offset, args.x_inc,
y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXaxpy(args.n, args.alpha,
x_vec(), args.x_offset, args.x_inc,
y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
kArgXOffset, kArgYOffset, kArgAlpha};
// Creates a tester
TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Runs the tests
const auto case_name = "default";
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
return 0;
}
// =================================================================================================

View file

@@ -1,98 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Gemm(args.layout, args.a_transpose, args.b_transpose,
args.m, args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
static_cast<clblasTranspose>(args.a_transpose),
static_cast<clblasTranspose>(args.b_transpose),
args.m, args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
kArgATransp, kArgBTransp,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &a_transpose: tester.kTransposes) {
args.a_transpose = a_transpose;
for (auto &b_transpose: tester.kTransposes) {
args.b_transpose = b_transpose;
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
return 0;
}
// =================================================================================================

View file

@@ -1,88 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
a_mat(), args.a_offset, args.a_ld,
x_vec(), args.x_offset, args.x_inc, args.beta,
y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
static_cast<clblasTranspose>(args.a_transpose),
args.m, args.n, args.alpha,
a_mat(), args.a_offset, args.a_ld,
x_vec(), args.x_offset, args.x_inc, args.beta,
y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
kArgALeadDim, kArgXInc, kArgYInc,
kArgAOffset, kArgXOffset, kArgYOffset};
// Creates a tester
TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &a_transpose: tester.kTransposes) {
args.a_transpose = a_transpose;
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
return 0;
}
// =================================================================================================

View file

@@ -1,98 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Symm(args.layout, args.side, args.triangle,
args.m, args.n,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
static_cast<clblasSide>(args.side),
static_cast<clblasUplo>(args.triangle),
args.m, args.n,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
kArgSide, kArgTriangle,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &side: {Side::kLeft, Side::kRight}) {
args.side = side;
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
args.triangle = triangle;
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
return 0;
}
// =================================================================================================

View file

@@ -1,217 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestABC class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
b_source_.resize(max_dim*max_ld + max_offset);
c_source_.resize(max_dim*max_ld + max_offset);
PopulateVector(a_source_);
PopulateVector(b_source_);
PopulateVector(c_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Determines whether each matrix is "rotated" in memory. The assumed default is
// column-major storage without transposition; if exactly one of those two properties
// deviates from that default (but not both), the matrix is considered rotated.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
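// Example: a row-major, non-transposed A is considered rotated, so its second
// dimension (a_two below) becomes m rather than k: the buffer then spans m rows
// of a_ld elements instead of k columns.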
// Iterates over the matrix dimensions
for (auto &m: kMatrixDims) {
args.m = m;
for (auto &n: kMatrixDims) {
args.n = n;
for (auto &k: kMatrixDims) {
args.k = k;
// Computes the second dimensions of the matrices taking the rotation into account
auto a_two = (a_rotated) ? m : k;
auto b_two = (b_rotated) ? k : n;
auto c_two = (c_rotated) ? m : n;
// Iterates over the leading-dimension values and the offsets
for (auto &a_ld: kMatrixDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
for (auto &b_ld: kMatrixDims) {
args.b_ld = b_ld;
for (auto &b_offset: kOffsets) {
args.b_offset = b_offset;
for (auto &c_ld: kMatrixDims) {
args.c_ld = c_ld;
for (auto &c_offset: kOffsets) {
args.c_offset = c_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto b_size = b_two * b_ld + b_offset;
auto c_size = c_two * c_ld + c_offset;
if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(c_size, static_cast<T>(0));
std::vector<T> s_result(c_size, static_cast<T>(0));
r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idm=size_t{0}; idm<m; ++idm) {
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = (args.layout == Layout::kRowMajor) ?
idm*args.c_ld + idn + args.c_offset:
idn*args.c_ld + idm + args.c_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, m*n, args);
}
}
}
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Only the returned status
// codes are checked; the results (if any) are not compared.
template <typename T>
void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.k = kBufferSize;
args.a_ld = kBufferSize;
args.b_ld = kBufferSize;
args.c_ld = kBufferSize;
args.a_offset = 0;
args.b_offset = 0;
args.c_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
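// Note: a size of zero elements yields an invalid (null) cl_mem handle below, whereas the
// other two sizes are real buffers that are respectively one element short of and exactly
// equal to what the m = n = k = ld = kBufferSize configuration above requires.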
for (auto &a_size: kBufferSizes) {
for (auto &b_size: kBufferSizes) {
for (auto &c_size: kBufferSizes) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto b = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
auto b_mat = Buffer(b);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto r_mat = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto s_mat = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestABC<float>;
template class TestABC<double>;
template class TestABC<float2>;
template class TestABC<double2>;
// =================================================================================================
} // namespace clblast

View file

@@ -1,86 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestABC: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking the routine arguments and a test-case name as inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> b_source_;
std::vector<T> c_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#endif

View file

@@ -1,213 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestAXY class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_ld = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
x_source_.resize(max_dim*max_inc + max_offset);
y_source_.resize(max_dim*max_inc + max_offset);
PopulateVector(a_source_);
PopulateVector(x_source_);
PopulateVector(y_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Iterates over the dimension for the matrix and vectors
for (auto &m: kMatrixVectorDims) {
args.m = m;
for (auto &n: kMatrixVectorDims) {
args.n = n;
// Computes the second dimension of the matrix taking the rotation into account
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
// Computes the vector sizes in case the matrix is transposed
auto a_transposed = (args.a_transpose == Transpose::kYes);
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
// Iterates over the leading-dimension values and the offsets of the matrix
for (auto &a_ld: kMatrixVectorDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
// Iterates over the increment-values and the offsets of the vectors
for (auto &x_inc: kIncrements) {
args.x_inc = x_inc;
for (auto &x_offset: kOffsets) {
args.x_offset = x_offset;
for (auto &y_inc: kIncrements) {
args.y_inc = y_inc;
for (auto &y_offset: kOffsets) {
args.y_offset = y_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto x_size = n_real * x_inc + x_offset;
auto y_size = m_real * y_inc + y_offset;
if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(y_size, static_cast<T>(0));
std::vector<T> s_result(y_size, static_cast<T>(0));
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idm=size_t{0}; idm<m_real; ++idm) {
auto index = idm*y_inc + y_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, m_real, args);
}
}
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.a_ld = kBufferSize;
args.a_offset = 0;
args.x_offset = 0;
args.y_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kMatrixSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};
for (auto &a_size: kMatrixSizes) {
for (auto &x_size: kVectorSizes) {
for (auto &y_size: kVectorSizes) {
// Iterates over test increments
for (auto &x_inc: kInvalidIncrements) {
args.x_inc = x_inc;
for (auto &y_inc: kInvalidIncrements) {
args.y_inc = y_inc;
// Creates the OpenCL buffers. Note: we are not using the C++ version since we
// explicitly want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
auto x_vec = Buffer(x);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto r_vec = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto s_vec = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestAXY<float>;
template class TestAXY<double>;
template class TestAXY<float2>;
template class TestAXY<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -1,88 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestAXY: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<size_t> kIncrements = { 1, 2 };
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestAXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> x_source_;
std::vector<T> y_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#endif

View file

@ -0,0 +1,189 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestBlas class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testblas.h"
namespace clblast {
// =================================================================================================
// The transpose-options to test with (data-type dependent)
template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T, typename U>
TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine run_routine, const Routine run_reference,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2):
Tester<T,U>{argc, argv, silent, name, options},
run_routine_(run_routine),
run_reference_(run_reference),
get_result_(get_result),
get_index_(get_index),
get_id1_(get_id1),
get_id2_(get_id2) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
PopulateVector(x_source_);
PopulateVector(y_source_);
PopulateVector(a_source_);
PopulateVector(b_source_);
PopulateVector(c_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T, typename U>
void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
if (!PrecisionSupported<T>(device_)) { return; }
TestStart("regular behaviour", name);
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
// Runs the reference clBLAS code
auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1};
auto status1 = run_reference_(args, buffers1, queue_);
// Runs the CLBlast code
auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2};
auto status2 = run_routine_(args, buffers2, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
auto result1 = get_result_(args, buffers1, queue_);
auto result2 = get_result_(args, buffers2, queue_);
// Checks for differences in the output
auto errors = size_t{0};
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
auto index = get_index_(args, id1, id2);
if (!TestSimilarity(result1[index], result2[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T, typename U>
void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
if (!PrecisionSupported<T>(device_)) { return; }
TestStart("invalid buffer sizes", name);
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
auto x_vec1 = Buffer(x1);
auto y_vec1 = Buffer(y1);
auto a_mat1 = Buffer(a1);
auto b_mat1 = Buffer(b1);
auto c_mat1 = Buffer(c1);
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
auto x_vec2 = Buffer(x2);
auto y_vec2 = Buffer(y2);
auto a_mat2 = Buffer(a2);
auto b_mat2 = Buffer(b2);
auto c_mat2 = Buffer(c2);
// Runs the two routines
auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_);
auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestBlas<float, float>;
template class TestBlas<double, double>;
template class TestBlas<float2, float2>;
template class TestBlas<double2, double2>;
template class TestBlas<float2, float>;
template class TestBlas<double2, double>;
// =================================================================================================
} // namespace clblast

test/correctness/testblas.h (new file, 106 lines)
View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
// input combinations, and one deliberately testing with invalid values.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class TestBlas: public Tester<T,U> {
public:
// Uses several variables from the Tester class
using Tester<T,U>::context_;
using Tester<T,U>::queue_;
using Tester<T,U>::full_test_;
using Tester<T,U>::device_;
// Uses several helper functions from the Tester class
using Tester<T,U>::TestStart;
using Tester<T,U>::TestEnd;
using Tester<T,U>::TestErrorCount;
using Tester<T,U>::TestErrorCodes;
using Tester<T,U>::GetOffsets;
// Test settings for the regular test. Append to these lists in case more tests are required.
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
// Test settings for the invalid tests
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 64;
const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
// The layout/transpose/triangle options to test with
const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers&, CommandQueue&)>;
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
// Constructor, initializes the base class tester and input data
TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine run_routine, const Routine run_reference, const ResultGet get_result,
const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
// The test functions, taking no inputs
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
void TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name);
private:
// Source data to test with
std::vector<T> x_source_;
std::vector<T> y_source_;
std::vector<T> a_source_;
std::vector<T> b_source_;
std::vector<T> c_source_;
// The routine-specific functions passed to the tester
Routine run_routine_;
Routine run_reference_;
ResultGet get_result_;
ResultIndex get_index_;
ResultIterator get_id1_;
ResultIterator get_id2_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#endif
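
A hypothetical usage sketch (not taken from the CLBlast sources) of the generic tester declared above: a per-routine test source constructs a TestBlas<T,U> with four functors that run the routine, read back the checked output, and describe its indexing. The T/U split exists for routines such as CHERK, whose buffers are float2 while alpha and beta are plain float (hence the TestBlas<float2,float> instantiation). The function name RunExampleTests and the placeholder lambdas are invented for this sketch; real tests wrap the CLBlast routine and the clBLAS reference instead.

#include "correctness/testblas.h"

namespace clblast {

template <typename T>
void RunExampleTests(int argc, char *argv[]) {
  // Placeholder functors matching the Routine/ResultGet/ResultIndex/ResultIterator signatures
  auto run_placeholder = [](const Arguments<T> &, const Buffers &, CommandQueue &) {
    return StatusCode::kSuccess;  // a real test calls the CLBlast routine or clBLAS reference here
  };
  auto get_result = [](const Arguments<T> &args, Buffers &, CommandQueue &) {
    return std::vector<T>(args.y_size, static_cast<T>(0));  // a real test reads back y (or C)
  };
  auto get_index = [](const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;  // how the checked output elements are laid out
  };
  auto get_id1 = [](const Arguments<T> &args) { return args.n; };
  auto get_id2 = [](const Arguments<T> &) { return size_t{1}; };

  // Runs both test modes; building the argument combinations is omitted from this sketch
  TestBlas<T,T> tester{argc, argv, false, "EXAMPLE", {kArgN, kArgXInc, kArgYInc},
                       run_placeholder, run_placeholder, get_result, get_index,
                       get_id1, get_id2};
  auto test_vector = std::vector<Arguments<T>>{};
  tester.TestRegular(test_vector, "example");
  tester.TestInvalid(test_vector, "example");
}

} // namespace clblast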

View file

@ -21,21 +21,11 @@
namespace clblast {
// =================================================================================================
// The layouts and transpose-options to test with (data-type dependent)
template <typename T>
const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
// =================================================================================================
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
// the clBLAS library for reference.
template <typename T>
Tester<T>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
template <typename T, typename U>
Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
help_("Options given/available:\n"),
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
@ -61,7 +51,7 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
// Checks whether the precision is supported
if (!PrecisionSupported()) {
if (!PrecisionSupported<T>(device_)) {
fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
kPrintWarning.c_str(), kPrintEnd.c_str());
return;
@ -86,9 +76,9 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
}
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
template <typename T>
Tester<T>::~Tester() {
if (PrecisionSupported()) {
template <typename T, typename U>
Tester<T,U>::~Tester() {
if (PrecisionSupported<T>(device_)) {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
@ -104,8 +94,8 @@ Tester<T>::~Tester() {
// Function called at the start of each test. This prints a header with information about the
// test and re-initializes all test data-structures.
template <typename T>
void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
template <typename T, typename U>
void Tester<T,U>::TestStart(const std::string &test_name, const std::string &test_configuration) {
// Prints the header
fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
@ -123,8 +113,8 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
// Function called at the end of each test. This prints errors if any occurred. It also prints a
// summary of the number of sub-tests passed/failed.
template <typename T>
void Tester<T>::TestEnd() {
template <typename T, typename U>
void Tester<T,U>::TestEnd() {
fprintf(stdout, "\n");
tests_passed_ += num_passed_;
tests_failed_ += num_skipped_;
@ -147,6 +137,7 @@ void Tester<T>::TestEnd() {
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
@ -181,45 +172,9 @@ void Tester<T>::TestEnd() {
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool Tester<T>::TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
}
// Use relative error
else {
return (difference / (std::fabs(val1)+std::fabs(val2))) < static_cast<T>(kErrorMarginRelative);
}
}
// Specialisations for complex data-types
template <>
bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
auto real = Tester<float>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
template <>
bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
auto real = Tester<double>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
// =================================================================================================
// Handles a 'pass' or 'error' depending on whether there are any errors
template <typename T>
void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
template <typename T, typename U>
void Tester<T,U>::TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args) {
// Finished successfully
if (errors == 0) {
@ -237,9 +192,9 @@ void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arg
// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
// (CLBlast reported a compilation error), or an error (they are different).
template <typename T>
void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<T> &args) {
template <typename T, typename U>
void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<U> &args) {
// Finished successfully
if (clblas_status == clblast_status) {
@ -270,62 +225,26 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
// =================================================================================================
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <>
const std::vector<float> Tester<float>::GetExampleScalars() {
if (full_test_) { return {0.0f, 1.0f, 3.14f}; }
else { return {3.14f}; }
}
template <>
const std::vector<double> Tester<double>::GetExampleScalars() {
if (full_test_) { return {0.0, 1.0, 3.14}; }
else { return {3.14}; }
}
template <>
const std::vector<float2> Tester<float2>::GetExampleScalars() {
if (full_test_) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
else { return {{2.42f, 3.14f}}; }
}
template <>
const std::vector<double2> Tester<double2>::GetExampleScalars() {
if (full_test_) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
else { return {{2.42, 3.14}}; }
}
// Retrieves the offset values to test with
template <typename T>
const std::vector<size_t> Tester<T>::GetOffsets() {
template <typename T, typename U>
const std::vector<size_t> Tester<T,U>::GetOffsets() const {
if (full_test_) { return {0, 10}; }
else { return {0}; }
}
// =================================================================================================
template <> bool Tester<float>::PrecisionSupported() const { return true; }
template <> bool Tester<float2>::PrecisionSupported() const { return true; }
template <> bool Tester<double>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
template <> bool Tester<double2>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// =================================================================================================
// A test can either pass, be skipped, or fail
template <typename T>
void Tester<T>::ReportPass() {
template <typename T, typename U>
void Tester<T,U>::ReportPass() {
num_passed_++;
}
template <typename T>
void Tester<T>::ReportSkipped() {
template <typename T, typename U>
void Tester<T,U>::ReportSkipped() {
num_skipped_++;
}
template <typename T>
void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
template <typename T, typename U>
void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
error_log_.push_back(error_log_entry);
num_failed_++;
}
@ -334,8 +253,8 @@ void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
// Prints the test-result symbol to screen. This function limits the maximum number of symbols per
// line by printing newlines once every so many calls.
template <typename T>
void Tester<T>::PrintTestResult(const std::string &message) {
template <typename T, typename U>
void Tester<T,U>::PrintTestResult(const std::string &message) {
if (print_count_ == kResultsPerLine) {
print_count_ = 0;
fprintf(stdout, "\n ");
@ -345,13 +264,98 @@ void Tester<T>::PrintTestResult(const std::string &message) {
print_count_++;
}
// =================================================================================================
// Below are the non-member functions (separated because of otherwise required partial class
// template specialization)
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = 1.0e-2;
constexpr auto kErrorMarginAbsolute = 1.0e-10;
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
}
// Use relative error
else {
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
}
}
// Compiles the default case for non-complex data-types
template bool TestSimilarity<float>(const float, const float);
template bool TestSimilarity<double>(const double, const double);
// Specialisations for complex data-types
template <>
bool TestSimilarity(const float2 val1, const float2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
template <>
bool TestSimilarity(const double2 val1, const double2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
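// For illustration (hypothetical values, not from the CLBlast sources): how the two margins above
// work out for T == float, with inputs chosen to fall just inside/outside each threshold:
//   TestSimilarity(1.000f, 1.005f) == true    relative error 0.005/2.005 ~ 0.0025 < 1.0e-2
//   TestSimilarity(1.000f, 1.050f) == false   relative error 0.050/2.050 ~ 0.0244 > 1.0e-2
//   TestSimilarity(0.0f, 5.0e-11f) == true    absolute difference 5.0e-11 < 1.0e-10
//   TestSimilarity(0.0f, 1.0e-9f)  == false   absolute difference 1.0e-9  > 1.0e-10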
// =================================================================================================
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <> const std::vector<float> GetExampleScalars(const bool full_test) {
if (full_test) { return {0.0f, 1.0f, 3.14f}; }
else { return {3.14f}; }
}
template <> const std::vector<double> GetExampleScalars(const bool full_test) {
if (full_test) { return {0.0, 1.0, 3.14}; }
else { return {3.14}; }
}
template <> const std::vector<float2> GetExampleScalars(const bool full_test) {
if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
else { return {{2.42f, 3.14f}}; }
}
template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
else { return {{2.42, 3.14}}; }
}
// =================================================================================================
// Returns false if this precision is not supported by the device
template <> bool PrecisionSupported<float>(const Device &) { return true; }
template <> bool PrecisionSupported<float2>(const Device &) { return true; }
template <> bool PrecisionSupported<double>(const Device &device) {
auto extensions = device.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
template <> bool PrecisionSupported<double2>(const Device &device) {
auto extensions = device.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// =================================================================================================
// Compiles the templated class
template class Tester<float>;
template class Tester<double>;
template class Tester<float2>;
template class Tester<double2>;
template class Tester<float, float>;
template class Tester<double, double>;
template class Tester<float2, float2>;
template class Tester<double2, double2>;
template class Tester<float2, float>;
template class Tester<double2, double>;
// =================================================================================================
} // namespace clblast

View file

@ -10,6 +10,8 @@
// This file implements the Tester class, providing a test-framework. GTest was used before, but
// was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
// custom replacement.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// =================================================================================================
@ -30,7 +32,7 @@ namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
template <typename T, typename U>
class Tester {
public:
@ -43,10 +45,6 @@ class Tester {
// Error percentage is not applicable: error was caused by an incorrect status
static constexpr auto kStatusError = -1.0f;
// Set the allowed error margin for floating-point comparisons
static constexpr auto kErrorMarginRelative = 1.0e-2;
static constexpr auto kErrorMarginAbsolute = 1.0e-10;
// Constants holding start and end strings for terminal-output in colour
const std::string kPrintError{"\x1b[31m"};
const std::string kPrintSuccess{"\x1b[32m"};
@ -62,16 +60,12 @@ class Tester {
const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
// The layouts and transpose-options to test with
static const std::vector<Layout> kLayouts;
static const std::vector<Transpose> kTransposes;
// This structure combines the above log-entry with a status code an error percentage
struct ErrorLogEntry {
StatusCode status_expect;
StatusCode status_found;
float error_percentage;
Arguments<T> args;
Arguments<U> args;
};
// Creates an instance of the tester, running on a particular OpenCL platform and device. It
@ -84,25 +78,13 @@ class Tester {
void TestStart(const std::string &test_name, const std::string &test_configuration);
void TestEnd();
// Compares two floating point values for similarity. Allows for a certain relative error margin.
static bool TestSimilarity(const T val1, const T val2);
// Tests either an error count (should be zero) or two error codes (must match)
void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
void TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args);
void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<T> &args);
const Arguments<U> &args);
protected:
// Retrieves a list of example scalars of the right type
const std::vector<T> GetExampleScalars();
// Retrieves a list of offset values to test
const std::vector<size_t> GetOffsets();
// Returns false if this precision is not supported by the device
bool PrecisionSupported() const;
// The help-message
std::string help_;
@ -112,6 +94,12 @@ class Tester {
Context context_;
CommandQueue queue_;
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
// Retrieves the offset values to test with
const std::vector<size_t> GetOffsets() const;
private:
// Internal methods to report a passed, skipped, or failed test
@ -122,9 +110,6 @@ class Tester {
// Prints the error or success symbol to screen
void PrintTestResult(const std::string &message);
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
// Logging and counting occurrences of errors
std::vector<ErrorLogEntry> error_log_;
size_t num_passed_;
@ -143,6 +128,25 @@ class Tester {
std::vector<std::string> options_;
};
// =================================================================================================
// Below are the non-member functions (separated because of otherwise required partial class
// template specialization)
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool TestSimilarity(const T val1, const T val2);
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <typename T>
const std::vector<T> GetExampleScalars(const bool full_test);
// Returns false if this precision is not supported by the device
template <typename T>
bool PrecisionSupported(const Device &device);
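// For illustration (hypothetical code, not from the CLBlast sources): the free-function form is
// needed because, with the extra U parameter, a per-type variant of a member such as
//
//   template <typename U>
//   bool Tester<float2,U>::TestSimilarity(const float2, const float2);
//
// is not valid C++: a member cannot be specialised for T = float2 with U left open without
// partially specialising the entire Tester class. A free function template, by contrast, can
// simply be fully specialised for float2 and double2, as done in tester.cc.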
// =================================================================================================
} // namespace clblast

View file

@ -1,176 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestXY class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testxy.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
x_source_.resize(max_dim*max_inc + max_offset);
y_source_.resize(max_dim*max_inc + max_offset);
PopulateVector(x_source_);
PopulateVector(y_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Iterates over the vector dimension
for (auto &n: kVectorDims) {
args.n = n;
// Iterates over the increment-values and the offsets
for (auto &x_inc: kIncrements) {
args.x_inc = x_inc;
for (auto &x_offset: kOffsets) {
args.x_offset = x_offset;
for (auto &y_inc: kIncrements) {
args.y_inc = y_inc;
for (auto &y_offset: kOffsets) {
args.y_offset = y_offset;
// Computes the buffer sizes
auto x_size = n * x_inc + x_offset;
auto y_size = n * y_inc + y_offset;
if (x_size < 1 || y_size < 1) { continue; }
// Creates the OpenCL buffers
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
// Iterates over the values for alpha
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
// Runs the reference clBLAS code
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
// Runs the CLBlast code
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(y_size, static_cast<T>(0));
std::vector<T> s_result(y_size, static_cast<T>(0));
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = idn*y_inc + y_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, n, args);
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.n = kBufferSize;
args.x_offset = 0;
args.y_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
for (auto &x_size: kBufferSizes) {
for (auto &y_size: kBufferSizes) {
// Iterates over test increments
for (auto &x_inc: kInvalidIncrements) {
args.x_inc = x_inc;
for (auto &y_inc: kInvalidIncrements) {
args.y_inc = y_inc;
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
auto x_vec = Buffer(x);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto r_vec = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto s_vec = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestXY<float>;
template class TestXY<double>;
template class TestXY<float2>;
template class TestXY<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -1,84 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXY: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<T> kAlphaValues = GetExampleScalars();
// Test settings for the invalid test
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 512;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> x_source_;
std::vector<T> y_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#endif

View file

@ -21,249 +21,36 @@
namespace clblast {
// =================================================================================================
// This is the vector-vector variant of the set-up/tear-down client routine.
template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A.
// Note: this is not relevant for this client but given anyway.
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the data sizes
auto x_size = args.n*args.x_inc + args.x_offset;
auto y_size = args.n*args.y_inc + args.y_offset;
// Populates input host vectors with random data
std::vector<T> x_source(x_size);
std::vector<T> y_source(y_size);
PopulateVector(x_source);
PopulateVector(y_source);
// Creates the vectors on the device
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
// Runs the routine-specific code
client_routine(args, x_buffer, y_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.n += args.step;
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
// Constructor
template <typename T, typename U>
Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes):
run_routine_(run_routine),
run_reference_(run_reference),
options_(options),
get_flops_(get_flops),
get_bytes_(get_bytes) {
}
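// For illustration (hypothetical, not from the CLBlast sources): assuming GetMetric maps the
// parsed Arguments to an operation or byte count, a GEMM client could supply metrics such as
//   get_flops = [](const Arguments<T> &args) { return 2 * args.m * args.n * args.k; };
//   get_bytes = [](const Arguments<T> &args) {  // reads A, B and C, writes C back
//     return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
//   };
// which the client can then divide by the measured runtime to report GFLOPS and GB/s.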
// Compiles the above function
template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the second dimension of the matrix taking the rotation into account
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
// Computes the vector sizes in case the matrix is transposed
auto a_transposed = (args.a_transpose == Transpose::kYes);
auto m_real = (a_transposed) ? args.n : args.m;
auto n_real = (a_transposed) ? args.m : args.n;
// Computes the data sizes
auto a_size = a_two * args.a_ld + args.a_offset;
auto x_size = n_real*args.x_inc + args.x_offset;
auto y_size = m_real*args.y_inc + args.y_offset;
// Populates input host vectors with random data
std::vector<T> a_source(a_size);
std::vector<T> x_source(x_size);
std::vector<T> y_source(y_size);
PopulateVector(a_source);
PopulateVector(x_source);
PopulateVector(y_source);
// Creates the vectors on the device
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
// Runs the routine-specific code
client_routine(args, a_buffer, x_buffer, y_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.a_ld += args.step;
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// Compiles the above function
template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the data sizes
auto a_two = (a_rotated) ? args.m : args.k;
auto b_two = (b_rotated) ? args.k : args.n;
auto c_two = (c_rotated) ? args.m : args.n;
auto a_size = a_two * args.a_ld + args.a_offset;
auto b_size = b_two * args.b_ld + args.b_offset;
auto c_size = c_two * args.c_ld + args.c_offset;
// Populates input host matrices with random data
std::vector<T> a_source(a_size);
std::vector<T> b_source(b_size);
std::vector<T> c_source(c_size);
PopulateVector(a_source);
PopulateVector(b_source);
PopulateVector(c_source);
// Creates the matrices on the device
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
// Runs the routine-specific code
client_routine(args, a_buffer, b_buffer, c_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.k += args.step;
args.a_ld += args.step;
args.b_ld += args.step;
args.c_ld += args.step;
}
// Cleans up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// Compiles the above function
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
// =================================================================================================
// Parses all arguments available for the CLBlast client testers. Some arguments might not be
// applicable to a particular routine, but they are searched for anyway so that a single common
// argument parser can be used. All arguments have a default value in case they are not found.
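// For example (an illustrative invocation, not prescribed by this commit; the binary name is an
// assumption), running a client as
//   ./client_xgemm -m 2048 -n 2048 -precision 32
// would use the given values for m and n, while k, the layout, the leading dimensions, etc. fall
// back to their defaults.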
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a) {
auto args = Arguments<T>{};
template <typename T, typename U>
Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
const GetMetric default_b_ld, const GetMetric default_c_ld) {
auto args = Arguments<U>{};
auto help = std::string{"Options given/available:\n"};
// These are the options that are not relevant for every client: they are optional
for (auto &o: options) {
for (auto &o: options_) {
// Data-sizes
if (o == kArgM) { args.m = args.k = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -271,6 +58,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
if (o == kArgDiagonal) { args.diagonal = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); }
// Vector arguments
if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
@ -279,16 +67,16 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
// Matrix arguments
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
// Scalar values
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<U>()); }
}
// These are the options common to all routines
@ -313,16 +101,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
// =================================================================================================
// This is the main performance tester
template <typename T, typename U>
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
// Prints the header of the output table
PrintTableHeader(args.silent, options_);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_steps" values, jumping by "step" each time
auto s = size_t{0};
while(true) {
// Sets the buffer sizes (routine-specific)
set_sizes(args);
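// For illustration (a hypothetical example, not taken from this commit), an AXPY-style routine
// could provide a SetMetric functor along the lines of:
//   [](Arguments<float> &args) {
//     args.x_size = args.n*args.x_inc + args.x_offset;
//     args.y_size = args.n*args.y_inc + args.y_offset;
//   }
// (the unused matrix sizes would be given some small non-zero placeholder), so that the buffer
// allocations below match the sizes requested on the command line.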
// Populates the input host vectors and matrices with random data
std::vector<T> x_source(args.x_size);
std::vector<T> y_source(args.y_size);
std::vector<T> a_source(args.a_size);
std::vector<T> b_source(args.b_size);
std::vector<T> c_source(args.c_size);
PopulateVector(x_source);
PopulateVector(y_source);
PopulateVector(a_source);
PopulateVector(b_source);
PopulateVector(c_source);
// Creates the vectors and matrices on the device
auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
// Runs the routines and collects the timings
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
// Prints the performance of both libraries
PrintTableRow(args, ms_clblast, ms_clblas);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.k += args.step;
args.a_ld += args.step;
args.b_ld += args.step;
args.c_ld += args.step;
}
// Cleans up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// =================================================================================================
// Creates a vector of timing results, filled with execution times of the 'main computation'. The
// timing is performed using the C++11 steady-clock chrono functions. The function returns the
// minimum value found in the vector of timing results, in milliseconds.
double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
const Buffers &buffers, CommandQueue &queue,
Routine run_blas, const std::string &library_name) {
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
main_computation();
auto status = run_blas(args, buffers, queue);
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
// Records and stores the end-time
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
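// The remainder of this function is elided by the diff. Per the comment above, each elapsed time
// is stored in milliseconds and the minimum is returned; a sketch of that reduction (for
// illustration only, not the code from this commit) could be:
//   timing = std::chrono::duration<double, std::milli>(elapsed_time).count();
//   ...
//   return *std::min_element(timings.begin(), timings.end());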
@ -334,7 +198,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
// =================================================================================================
// Prints the header of the performance table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
template <typename T, typename U>
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
if (!silent) {
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
@ -345,29 +210,60 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
}
// Prints a performance-result row
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
const bool no_abbrv, const double ms_clblast, const double ms_clblas,
const unsigned long long flops, const unsigned long long bytes) {
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
const double ms_clblas) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
for (auto &o: options_) {
if (o == kArgM) { integers.push_back(args.m); }
if (o == kArgN) { integers.push_back(args.n); }
else if (o == kArgK) { integers.push_back(args.k); }
else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
else if (o == kArgATransp) { integers.push_back(static_cast<size_t>(args.a_transpose)); }
else if (o == kArgBTransp) { integers.push_back(static_cast<size_t>(args.b_transpose)); }
else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
else if (o == kArgXInc) { integers.push_back(args.x_inc); }
else if (o == kArgYInc) { integers.push_back(args.y_inc); }
else if (o == kArgXOffset) { integers.push_back(args.x_offset); }
else if (o == kArgYOffset) { integers.push_back(args.y_offset); }
else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
else if (o == kArgAOffset) { integers.push_back(args.a_offset); }
else if (o == kArgBOffset) { integers.push_back(args.b_offset); }
else if (o == kArgCOffset) { integers.push_back(args.c_offset); }
}
auto strings = std::vector<std::string>{};
for (auto &o: options_) {
if (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); }
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
}
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas : 0;
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas : 0;
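// Worked example (added for clarity, not part of the original diff): a 1024x1024x1024 SGEMM
// performs roughly 2*1024^3 ~= 2.15e9 flops. If get_flops_ reports that count and CLBlast takes
// 10 ms, then gflops_clblast = (2.15e9*1e-6)/10 ~= 215 GFLOPS; the same scaling of get_bytes_
// gives the GB/s numbers.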
// Outputs the argument values
for (auto &argument: args_int) {
if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
for (auto &argument: integers) {
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
fprintf(stdout, "%8luM;", argument/(1024*1024));
}
else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
fprintf(stdout, "%8luK;", argument/1024);
}
else {
fprintf(stdout, "%9lu;", argument);
}
}
for (auto &argument: args_string) {
for (auto &argument: strings) {
fprintf(stdout, "%9s;", argument.c_str());
}
@ -377,5 +273,15 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
ms_clblas, gflops_clblas, gbs_clblas);
}
// =================================================================================================
// Compiles the templated class
template class Client<float,float>;
template class Client<double,double>;
template class Client<float2,float2>;
template class Client<double2,double2>;
template class Client<float2,float>;
template class Client<double2,double>;
// =================================================================================================
} // namespace clblast

View file

@ -7,7 +7,14 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides common function declarations to be used with the test clients.
// This class implements the performance-test client. It is generic for all CLBlast routines by
// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
// or how to get the FLOPS count.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// This file also provides the common interface to the performance client (see the 'RunClient'
// function for details).
//
// =================================================================================================
@ -26,61 +33,71 @@
namespace clblast {
// =================================================================================================
// Types of devices to consider
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
// See comment at top of file for a description of the class
template <typename T, typename U>
class Client {
public:
// Types of devices to consider
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
// Shorthand for the routine-specific functions passed to the client
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
using SetMetric = std::function<void(Arguments<U>&)>;
using GetMetric = std::function<size_t(const Arguments<U>&)>;
// The constructor
Client(const Routine run_routine, const Routine run_reference,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes);
// Parses all command-line arguments, filling in the arguments structure. If no value is given on
// the command-line for a particular argument, its default value is used instead.
Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
const GetMetric default_b_ld, const GetMetric default_c_ld);
// The main client function, setting up arguments, matrices, OpenCL buffers, etc. After set-up, it
// calls the client routines.
void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);
private:
// Runs a function a given number of times and returns the execution time of the shortest instance
double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers &buffers,
CommandQueue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
// The routine-specific functions passed to the tester
const Routine run_routine_;
const Routine run_reference_;
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
};
// =================================================================================================
// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
template <typename T>
using Routine2 = std::function<void(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
template <typename T>
using Routine3 = std::function<void(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {
// =================================================================================================
// Creates a new client
auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
C::GetFlops, C::GetBytes);
// These are the main client functions, setting up arguments, matrices, OpenCL buffers, etc. After
// set-up, they call the client routine, passed as an argument to this function.
template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
// Simple command line argument parser with defaults
auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
if (args.print_help) { return; }
// =================================================================================================
// Parses all command-line arguments, filling in the arguments structure. If no value is given on
// the command-line for a particular argument, its default value is used instead.
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a);
// Retrieves only the precision command-line argument, since the above function is templated based
// on the precision
Precision GetPrecision(int argc, char *argv[]);
// =================================================================================================
// Runs a function a given number of times and returns the execution time of the shortest instance
double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
// =================================================================================================
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
const bool abbreviations, const double ms_clblast, const double ms_clblas,
const unsigned long long flops, const unsigned long long bytes);
// Runs the client
client.PerformanceTest(args, C::SetSizes);
}
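// Illustrative sketch (not part of this commit): a routine descriptor "C" as consumed by
// RunClient above would expose static members roughly as follows. All names and bodies below are
// assumptions added for demonstration only; the real per-routine classes are defined elsewhere.
class ExampleClientXaxpy {
 public:
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgAlpha};
  }
  static void SetSizes(Arguments<float> &args) {
    args.x_size = args.n*args.x_inc + args.x_offset;
    args.y_size = args.n*args.y_inc + args.y_offset;
    args.a_size = args.b_size = args.c_size = 1;  // unused matrix buffers still get a minimal size
  }
  static size_t DefaultLDA(const Arguments<float> &) { return 1; }  // leading dims unused here
  static size_t DefaultLDB(const Arguments<float> &) { return 1; }
  static size_t DefaultLDC(const Arguments<float> &) { return 1; }
  static size_t GetFlops(const Arguments<float> &args) { return 2*args.n; }          // mul + add
  static size_t GetBytes(const Arguments<float> &args) { return 3*args.n*sizeof(float); }
  static StatusCode RunRoutine(const Arguments<float> &args, const Buffers &buffers,
                               CommandQueue &queue) {
    return StatusCode::kSuccess;  // a real descriptor would call the CLBlast routine here
  }
  static StatusCode RunReference(const Arguments<float> &args, const Buffers &buffers,
                                 CommandQueue &queue) {
    return StatusCode::kSuccess;  // ... and the clBLAS reference routine here
  }
};
// A main() could then simply call: RunClient<ExampleClientXaxpy, float, float>(argc, argv);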
// =================================================================================================
} // namespace clblast

View file

@ -83,7 +83,16 @@ main <- function(routine_name, precision, test_names, test_values,
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
result_string <- system2(command=executable, args=arguments, stdout=TRUE)
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
# Filters the raw output: only lines containing a ";" can be valid result lines
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <- c(result_string, line)
}
}
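# Note: an equivalent vectorized filter (a suggestion for reference, not part of this change)
# would be: result_string <- raw_result_string[grepl(";", raw_result_string)]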
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")

View file

@ -35,10 +35,10 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)

View file

@ -19,7 +19,7 @@ source(file.path(dirname(thisfile), "common.r"))
# Settings
routine_name <- "xsymm"
parameters <- c("-m","-n","-layout","-triangle","-side",
parameters <- c("-m","-n","-layout","-side","-triangle",
"-num_steps","-step","-runs","-precision")
precision <- 32
@ -29,16 +29,16 @@ test_names <- list(
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and triangle/side (m=n=1024)",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)
@ -70,7 +70,7 @@ test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), triangle (up/lo), side (l/r)",
"layout (row/col), side (l/r), triangle (up/lo)",
"matrix sizes (m=n)"
)
@ -80,8 +80,8 @@ test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
"col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
"col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
c("m", "x")
)

View file

@ -0,0 +1,94 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyr2k routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyr2k"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=1536",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

Some files were not shown because too many files have changed in this diff