diff --git a/CHANGELOG b/CHANGELOG
index 12d9322e..5f0ce8af 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,16 @@
 
+Version 0.3.0
+- Re-organized test/client infrastructure to avoid code duplication
+- Added an optional bypass for pre/post-processing kernels in level-3 routines
+- Significantly improved performance of level-3 routines on AMD GPUs
+- Added level-3 routines:
+  * CHEMM/ZHEMM
+  * SSYRK/DSYRK/CSYRK/ZSYRK
+  * CHERK/ZHERK
+  * SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+  * CHER2K/ZHER2K
+  * STRMM/DTRMM/CTRMM/ZTRMM
+
 Version 0.2.0
 - Added support for complex conjugate transpose
 - Several host-code performance improvements
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 038e71ae..2bae4662 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,7 @@
 cmake_minimum_required(VERSION 2.8.10)
 project("clblast" CXX)
 set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 2)
+set(clblast_VERSION_MINOR 3)
 set(clblast_VERSION_PATCH 0)
 
 # Options and their default values
@@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
 set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
 set(SAMPLE_PROGRAMS sgemm)
-set(ROUTINES_XY xaxpy)
-set(ROUTINES_AXY xgemv)
-set(ROUTINES_ABC xgemm xsymm)
-set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
+set(LEVEL1_ROUTINES xaxpy)
+set(LEVEL2_ROUTINES xgemv)
+set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
+set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 
 # ==================================================================================================
 
 # Gathers all source-files
 set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
-foreach(ROUTINE ${ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
+foreach(ROUTINE ${LEVEL1_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
+endforeach()
+foreach(ROUTINE ${LEVEL2_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
+endforeach()
+foreach(ROUTINE ${LEVEL3_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
 endforeach()
 
 # Creates and links the library
@@ -168,33 +174,23 @@ if(TESTS)
   include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
 
   # Creates the common correctness-tests objects (requires CMake 2.8.8)
-  add_library(test_correctness_common OBJECT test/correctness/tester.cc)
-  add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
-  add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
-  add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
+  add_library(test_correctness_common OBJECT
+              test/correctness/tester.cc test/correctness/testblas.cc)
 
   # Compiles the correctness-tests
-  foreach(ROUTINE ${ROUTINES_XY})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_xy>
-                   test/correctness/routines/${ROUTINE}.cc)
-    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
-    install(TARGETS test_${ROUTINE} DESTINATION bin)
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level1/${ROUTINE}.cc)
   endforeach()
-  foreach(ROUTINE ${ROUTINES_AXY})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_axy>
-                   test/correctness/routines/${ROUTINE}.cc)
-    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
-    install(TARGETS test_${ROUTINE} DESTINATION bin)
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level2/${ROUTINE}.cc)
   endforeach()
-  foreach(ROUTINE ${ROUTINES_ABC})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_abc>
-                   test/correctness/routines/${ROUTINE}.cc)
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level3/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES})
     target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
     install(TARGETS test_${ROUTINE} DESTINATION bin)
   endforeach()
@@ -203,10 +199,19 @@ if(TESTS)
   add_library(test_performance_common OBJECT test/performance/client.cc)
 
   # Compiles the performance-tests
-  set(TEST_PERF_COMM )
-  foreach(ROUTINE ${ROUTINES})
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
     add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/${ROUTINE}.cc)
+                   test/performance/routines/level1/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
+    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+                   test/performance/routines/level2/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
+    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+                   test/performance/routines/level3/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES})
     target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
     install(TARGETS client_${ROUTINE} DESTINATION bin)
   endforeach()
diff --git a/README.md b/README.md
index 1bed1146..c274a404 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
 
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview version does not support all routines yet; others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
 
 
 Why CLBlast and not clBLAS or cuBLAS?
@@ -109,13 +109,13 @@ Performance remarks
 
 The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
 
-The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
+The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
 
 * The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
 * The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
 * The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
 
-The graphs also show the current weak point of CLBlast: its performance for smaller matrix sizes is not too good. Furthermore, although the GEMM kernels perform well on AMD GPUs, the supporting copy and transpose kernel do not.
+The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
 
 These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
 
@@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------
 
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
+CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
 
 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
 | xSWAP    |   |   |   |   |         |
 | xSCAL    |   |   |   |   | +CS +ZD |
 | xCOPY    |   |   |   |   |         |
-| xAXPY    |`x`|`x`|`x`|`x`|         |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ |         |
 | xDOT     |   |   | - | - | +DS     |
 | xDOTU    | - | - |   |   |         |
 | xDOTC    | - | - |   |   |         |
@@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
 
 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMV    |`x`|`x`|`x`|`x`|         |
+| xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xGBMV    |   |   |   |   |         |
 | xHEMV    | - | - |   |   |         |
 | xHBMV    | - | - |   |   |         |
@@ -175,14 +175,14 @@ CLBlast is in active development and currently does not support the full set of
 
 | Level-3  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMM    |`x`|`x`|`x`|`x`|         |
-| xSYMM    |`x`|`x`|`x`|`x`|         |
-| xHEMM    | - | - |   |   |         |
-| xSYRK    |   |   |   |   |         |
-| xHERK    | - | - |   |   |         |
-| xSYR2K   |   |   |   |   |         |
-| xHER2K   | - | - |   |   |         |
-| xTRMM    |   |   |   |   |         |
+| xGEMM    | ✔ | ✔ | ✔ | ✔ |         |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ |         |
+| xHEMM    | - | - | ✔ | ✔ |         |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ |         |
+| xHERK    | - | - | ✔ | ✔ |         |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ |         |
+| xHER2K   | - | - | ✔ | ✔ |         |
+| xTRMM    | ✔ | ✔ | ✔ | ✔ |         |
 | xTRSM    |   |   |   |   |         |
 
 
@@ -214,8 +214,6 @@ To-do list before release of version 1.0
 - Improve host performance:
   * Allow initialization to pre-compile kernels and store to disk
 - Improve device performance:
-  * Enable 'mad()' for AMD devices
-  * Improve the performance of the copy and transpose kernels
   * Tune for a wider range of devices
   * Allow users to define custom tuned parameters
 - Improve the tuning
diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf
index 29bf0056..6e1c8f5a 100644
Binary files a/doc/performance/GeForce_GTX480/SAXPY.pdf and b/doc/performance/GeForce_GTX480/SAXPY.pdf differ
diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf
index ac6e59c8..f430f880 100644
Binary files a/doc/performance/GeForce_GTX480/SGEMM.pdf and b/doc/performance/GeForce_GTX480/SGEMM.pdf differ
diff --git a/doc/performance/GeForce_GTX480/SGEMV.pdf b/doc/performance/GeForce_GTX480/SGEMV.pdf
new file mode 100644
index 00000000..8cb57124
Binary files /dev/null and b/doc/performance/GeForce_GTX480/SGEMV.pdf differ
diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf
index ca532190..ff5941ad 100644
Binary files a/doc/performance/GeForce_GTX480/SSYMM.pdf and b/doc/performance/GeForce_GTX480/SSYMM.pdf differ
diff --git a/doc/performance/Iris/SAXPY.pdf b/doc/performance/Iris/SAXPY.pdf
index a05219a4..1e32efdf 100644
Binary files a/doc/performance/Iris/SAXPY.pdf and b/doc/performance/Iris/SAXPY.pdf differ
diff --git a/doc/performance/Iris/SGEMM.pdf b/doc/performance/Iris/SGEMM.pdf
index 3a180adb..710ac230 100644
Binary files a/doc/performance/Iris/SGEMM.pdf and b/doc/performance/Iris/SGEMM.pdf differ
diff --git a/doc/performance/Iris/SGEMV.pdf b/doc/performance/Iris/SGEMV.pdf
index cd9b1025..e509a8a8 100644
Binary files a/doc/performance/Iris/SGEMV.pdf and b/doc/performance/Iris/SGEMV.pdf differ
diff --git a/doc/performance/Iris/SSYMM.pdf b/doc/performance/Iris/SSYMM.pdf
index 1b7e2d13..e83cc96c 100644
Binary files a/doc/performance/Iris/SSYMM.pdf and b/doc/performance/Iris/SSYMM.pdf differ
diff --git a/doc/performance/Iris/SSYRK.pdf b/doc/performance/Iris/SSYRK.pdf
new file mode 100644
index 00000000..d9cba08b
Binary files /dev/null and b/doc/performance/Iris/SSYRK.pdf differ
diff --git a/doc/performance/Radeon_HD7950/SAXPY.pdf b/doc/performance/Radeon_HD7950/SAXPY.pdf
new file mode 100644
index 00000000..640a3513
Binary files /dev/null and b/doc/performance/Radeon_HD7950/SAXPY.pdf differ
diff --git a/doc/performance/Radeon_HD7950/SGEMM.pdf b/doc/performance/Radeon_HD7950/SGEMM.pdf
new file mode 100644
index 00000000..c6a1f02e
Binary files /dev/null and b/doc/performance/Radeon_HD7950/SGEMM.pdf differ
diff --git a/doc/performance/Radeon_HD7950/SGEMV.pdf b/doc/performance/Radeon_HD7950/SGEMV.pdf
new file mode 100644
index 00000000..66d80046
Binary files /dev/null and b/doc/performance/Radeon_HD7950/SGEMV.pdf differ
diff --git a/doc/performance/Radeon_HD7950/SSYMM.pdf b/doc/performance/Radeon_HD7950/SSYMM.pdf
new file mode 100644
index 00000000..c53bc759
Binary files /dev/null and b/doc/performance/Radeon_HD7950/SSYMM.pdf differ
diff --git a/doc/performance/Radeon_HD7950/SSYRK.pdf b/doc/performance/Radeon_HD7950/SSYRK.pdf
new file mode 100644
index 00000000..320fb958
Binary files /dev/null and b/doc/performance/Radeon_HD7950/SSYRK.pdf differ
diff --git a/doc/performance/Tesla_K40m/SAXPY.pdf b/doc/performance/Tesla_K40m/SAXPY.pdf
index 778eb94d..ac536dfa 100644
Binary files a/doc/performance/Tesla_K40m/SAXPY.pdf and b/doc/performance/Tesla_K40m/SAXPY.pdf differ
diff --git a/doc/performance/Tesla_K40m/SGEMM.pdf b/doc/performance/Tesla_K40m/SGEMM.pdf
index 0b5891d5..4350edc5 100644
Binary files a/doc/performance/Tesla_K40m/SGEMM.pdf and b/doc/performance/Tesla_K40m/SGEMM.pdf differ
diff --git a/doc/performance/Tesla_K40m/SGEMV.pdf b/doc/performance/Tesla_K40m/SGEMV.pdf
new file mode 100644
index 00000000..0e748e3f
Binary files /dev/null and b/doc/performance/Tesla_K40m/SGEMV.pdf differ
diff --git a/doc/performance/Tesla_K40m/SSYMM.pdf b/doc/performance/Tesla_K40m/SSYMM.pdf
index f62bcc98..8385b541 100644
Binary files a/doc/performance/Tesla_K40m/SSYMM.pdf and b/doc/performance/Tesla_K40m/SSYMM.pdf differ
diff --git a/doc/performance/Tesla_K40m/SSYRK.pdf b/doc/performance/Tesla_K40m/SSYRK.pdf
new file mode 100644
index 00000000..b438b7b3
Binary files /dev/null and b/doc/performance/Tesla_K40m/SSYRK.pdf differ
diff --git a/include/clblast.h b/include/clblast.h
index 231348b8..80ea1707 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -75,6 +75,7 @@ enum class Layout { kRowMajor, kColMajor };
 enum class Transpose { kNo, kYes, kConjugate };
 enum class Side { kLeft, kRight };
 enum class Triangle { kUpper, kLower };
+enum class Diagonal { kUnit, kNonUnit };
 
 // Precision scoped enum (values in bits)
 enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@@ -95,7 +96,7 @@ StatusCode Axpy(const size_t n, const T alpha,
 
 // Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
 template <typename T>
-StatusCode Gemv(const Layout layout, const Transpose transpose_a,
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@@ -107,9 +108,9 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
 
-// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
+// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
 template <typename T>
-StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                 const size_t m, const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@@ -118,7 +119,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event);
 
-// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
+// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
 template <typename T>
 StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                 const size_t m, const size_t n,
@@ -129,6 +130,81 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event);
 
+// Templated-precision Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
+template <typename T>
+StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision rank-K update of a Hermitian matrix: CHERK/ZHERK
+template <typename T>
+StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                 const T beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event);
+
+// Templated-precision rank-2K update of a Hermitian matrix: CHER2K/ZHER2K
+template <typename T, typename U>
+StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                 const U beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event);
+
+// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
+template <typename T>
+StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision matrix equation solver (not yet supported): STRSM/DTRSM/CTRSM/ZTRSM
+/*
+template <typename T>
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event);
+*/
+
 // =================================================================================================
 } // namespace clblast
 
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h
index b9335fc9..dfd69b80 100644
--- a/include/internal/database/copy.h
+++ b/include/internal/database/copy.h
@@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::CopySingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
       }
     },
@@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
       }
     },
@@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
       }
     },
@@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
       }
     },
diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h
index 5af75308..61ec3242 100644
--- a/include/internal/database/pad.h
+++ b/include/internal/database/pad.h
@@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
@@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
@@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
@@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
       }
     },
diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h
index f1127d60..8f6fcba0 100644
--- a/include/internal/database/padtranspose.h
+++ b/include/internal/database/padtranspose.h
@@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadTraSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
       }
     },
@@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadTraDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
       }
     },
@@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
       }
     },
@@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadTraComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
       }
     },
diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h
index 0814eb8a..b348f364 100644
--- a/include/internal/database/transpose.h
+++ b/include/internal/database/transpose.h
@@ -18,24 +18,24 @@ const Database::DatabaseEntry Database::TraSingle = {
   "Transpose", Precision::kSingle, {
     { // NVIDIA GPUs
       CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
       }
     },
     { // Intel GPUs
       CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
+        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // Default
       CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
       }
     },
   }
@@ -47,14 +47,14 @@ const Database::DatabaseEntry Database::TraDouble = {
   "Transpose", Precision::kDouble, {
     { // NVIDIA GPUs
       CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
       }
     },
     { // Intel GPUs
@@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::TraDouble = {
     },
     { // Default
       CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
       }
     },
   }
@@ -75,24 +75,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
   "Transpose", Precision::kComplexSingle, {
     { // NVIDIA GPUs
       CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
       }
     },
     { // Intel GPUs
       CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // Default
       CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
       }
     },
   }
@@ -104,14 +104,14 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
   "Transpose", Precision::kComplexDouble, {
     { // NVIDIA GPUs
       CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
-        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
-        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
+        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
       }
     },
     { // Intel GPUs
@@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
     },
     { // Default
       CL_DEVICE_TYPE_ALL, kDefault, {
-        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
       }
     },
   }
diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h
index c331945a..40747678 100644
--- a/include/internal/database/xaxpy.h
+++ b/include/internal/database/xaxpy.h
@@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",2} } },
       }
     },
@@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS",256}, {"WPT",1}, {"VW",1} } },
       }
     },
@@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
       }
     },
@@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
       }
     },
diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h
index edf41e12..c2fe9bcb 100644
--- a/include/internal/database/xgemm.h
+++ b/include/internal/database/xgemm.h
@@ -25,8 +25,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
-        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
+        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
       }
     },
     { // Intel GPUs
@@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
       }
     },
@@ -84,13 +84,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
       }
     },
     { // Intel GPUs
       CL_DEVICE_TYPE_GPU, "Intel", {
-        { "Iris",             { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+        { "Iris",             { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
       }
     },
     { // Default
@@ -114,7 +114,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
       }
     },
diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h
index ef45f486..0266dd3c 100644
--- a/include/internal/database/xgemv.h
+++ b/include/internal/database/xgemv.h
@@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
       }
     },
@@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
       }
     },
@@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
       }
     },
@@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
       }
     },
     { // AMD GPUs
-      CL_DEVICE_TYPE_GPU, "AMD", {
+      CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
         { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
       }
     },
diff --git a/include/internal/routine.h b/include/internal/routine.h
index a65ced20..911bda49 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -34,20 +34,14 @@ class Routine {
     Program program;
     std::string device_name;
     Precision precision;
-    std::vector<std::string> routines;
+    std::string routine_name_;
 
     // Finds out whether the properties match
-    bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
-                      const std::vector<std::string> &ref_routines) {
-      auto ref_size = ref_routines.size();
-      if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
-        auto found_match = true;
-        for (auto i=size_t{0}; i<ref_size; ++i) {
-          if (routines[i] != ref_routines[i]) { found_match = false; }
-        }
-        return found_match;
-      }
-      return false;
+    bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
+                      const std::string &ref_routine) {
+      return (device_name == ref_device &&
+              precision == ref_precision &&
+              routine_name_ == ref_routine);
     }
   };
 
@@ -58,11 +52,11 @@ class Routine {
   static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
 
   // Base class constructor
-  explicit Routine(CommandQueue &queue, Event &event,
+  explicit Routine(CommandQueue &queue, Event &event, const std::string &name,
                    const std::vector<std::string> &routines, const Precision precision);
 
   // Set-up phase of the kernel
-  StatusCode SetUp(const std::string &routine_source);
+  StatusCode SetUp();
 
  protected:
   
@@ -84,15 +78,18 @@ class Routine {
   StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
                          const size_t inc, const size_t data_size);
 
-  // Copies/transposes a matrix and padds/unpads it
+  // Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
+  // to symmetric and triangular matrices through optional arguments.
   StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
                                     const size_t src_ld, const size_t src_offset,
                                     const Buffer &src,
                                     const size_t dest_one, const size_t dest_two,
                                     const size_t dest_ld, const size_t dest_offset,
                                     const Buffer &dest,
+                                    const Program &program, const bool do_pad,
                                     const bool do_transpose, const bool do_conjugate,
-                                    const bool pad, const Program &program);
+                                    const bool upper = false, const bool lower = false,
+                                    const bool diagonal_imag_zero = false);
   
   // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
   // The first assumes that the program is available in the cache and will throw an exception
@@ -104,6 +101,10 @@ class Routine {
   // a derived class.
   const Precision precision_;
 
+  // The routine's name and its kernel-source in string form
+  const std::string routine_name_;
+  std::string source_string_;
+
   // The OpenCL objects, accessible only from derived classes
   CommandQueue queue_;
   Event event_;
@@ -118,7 +119,6 @@ class Routine {
 
   // Connection to the database for all the device-specific parameters
   const Database db_;
-  const std::vector<std::string> routines_;
 };
 
 // =================================================================================================
diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/level1/xaxpy.h
similarity index 100%
rename from include/internal/routines/xaxpy.h
rename to include/internal/routines/level1/xaxpy.h
diff --git a/include/internal/routines/xgemv.h b/include/internal/routines/level2/xgemv.h
similarity index 100%
rename from include/internal/routines/xgemv.h
rename to include/internal/routines/level2/xgemv.h
diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/level3/xgemm.h
similarity index 100%
rename from include/internal/routines/xgemm.h
rename to include/internal/routines/level3/xgemm.h
diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h
new file mode 100644
index 00000000..6cc9d9ec
--- /dev/null
+++ b/include/internal/routines/level3/xhemm.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The implementation is very similar to the Xsymm routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMM_H_
+#define CLBLAST_ROUTINES_XHEMM_H_
+
+#include "internal/routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhemm: public Xgemm<T> {
+ public:
+
+  // Uses several variables from the Routine class
+  using Routine::db_;
+  using Routine::context_;
+
+  // Uses several helper functions from the Routine class
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Uses the regular Xgemm routine
+  using Xgemm<T>::DoGemm;
+
+  // Constructor
+  Xhemm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMM_H_
+#endif
diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h
new file mode 100644
index 00000000..1836a812
--- /dev/null
+++ b/include/internal/routines/level3/xher2k.h
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k routine. The precision is implemented using the template argument
+// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
+// Xsyr2k routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2K_H_
+#define CLBLAST_ROUTINES_XHER2K_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class Xher2k: public Routine {
+ public:
+  Xher2k(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const U beta,
+                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2K_H_
+#endif
diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h
new file mode 100644
index 00000000..9b361254
--- /dev/null
+++ b/include/internal/routines/level3/xherk.h
@@ -0,0 +1,47 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk routine. The precision is implemented using the template argument
+// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
+// Xsyrk routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHERK_H_
+#define CLBLAST_ROUTINES_XHERK_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class Xherk: public Routine {
+ public:
+  Xherk(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const U alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const U beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHERK_H_
+#endif
diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/level3/xsymm.h
similarity index 98%
rename from include/internal/routines/xsymm.h
rename to include/internal/routines/level3/xsymm.h
index c6545164..2028ceea 100644
--- a/include/internal/routines/xsymm.h
+++ b/include/internal/routines/level3/xsymm.h
@@ -17,7 +17,7 @@
 #ifndef CLBLAST_ROUTINES_XSYMM_H_
 #define CLBLAST_ROUTINES_XSYMM_H_
 
-#include "internal/routines/xgemm.h"
+#include "internal/routines/level3/xgemm.h"
 
 namespace clblast {
 // =================================================================================================
diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h
new file mode 100644
index 00000000..6259313c
--- /dev/null
+++ b/include/internal/routines/level3/xsyr2k.h
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
+// The implementation is very similar to Xsyrk (see header for details), except for the fact that
+// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2K_H_
+#define CLBLAST_ROUTINES_XSYR2K_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyr2k: public Routine {
+ public:
+  Xsyr2k(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const T beta,
+                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h
new file mode 100644
index 00000000..3dab731f
--- /dev/null
+++ b/include/internal/routines/level3/xsyrk.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk routine. The precision is implemented using a template argument.
+// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
+// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
+// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
+//    performance reasons, as the actual masking is done later (see the first point).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYRK_H_
+#define CLBLAST_ROUTINES_XSYRK_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyrk: public Routine {
+ public:
+  Xsyrk(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYRK_H_
+#endif
diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h
new file mode 100644
index 00000000..4f49bebd
--- /dev/null
+++ b/include/internal/routines/level3/xtrmm.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm routine. The implementation is based on first transforming the
+// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
+// routine. Therefore, this class inherits from the Xgemm class.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMM_H_
+#define CLBLAST_ROUTINES_XTRMM_H_
+
+#include "internal/routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmm: public Xgemm<T> {
+ public:
+
+  // Uses several variables from the Routine class
+  using Routine::db_;
+  using Routine::context_;
+
+  // Uses several helper functions from the Routine class
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Uses the regular Xgemm routine
+  using Xgemm<T>::DoGemm;
+
+  // Constructor
+  Xtrmm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMM_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 6ad17a6a..60d70eae 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -46,6 +46,7 @@ constexpr auto kArgATransp = "transA";
 constexpr auto kArgBTransp = "transB";
 constexpr auto kArgSide = "side";
 constexpr auto kArgTriangle = "triangle";
+constexpr auto kArgDiagonal = "diagonal";
 constexpr auto kArgXInc = "incx";
 constexpr auto kArgYInc = "incy";
 constexpr auto kArgXOffset = "offx";
@@ -93,6 +94,7 @@ struct Arguments {
   Transpose b_transpose = Transpose::kNo;
   Side side = Side::kLeft;
   Triangle triangle = Triangle::kUpper;
+  Diagonal diagonal = Diagonal::kUnit;
   size_t x_inc = 1;
   size_t y_inc = 1;
   size_t x_offset = 0;
@@ -105,6 +107,11 @@ struct Arguments {
   size_t c_offset = 0;
   T alpha = T{1.0};
   T beta = T{1.0};
+  size_t x_size = 1;
+  size_t y_size = 1;
+  size_t a_size = 1;
+  size_t b_size = 1;
+  size_t c_size = 1;
   // Tuner-specific arguments
   double fraction = 1.0;
   // Client-specific arguments
@@ -123,6 +130,15 @@ struct Arguments {
   bool no_abbrv = false;
 };
 
+// Structure containing all possible buffers for test clients
+struct Buffers {
+  Buffer x_vec;
+  Buffer y_vec;
+  Buffer a_mat;
+  Buffer b_mat;
+  Buffer c_mat;
+};
+
 // =================================================================================================
 
 // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
diff --git a/src/clblast.cc b/src/clblast.cc
index bb0091a3..6cb4086e 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -18,14 +18,20 @@
 #include "clblast.h"
 
 // BLAS level-1 includes
-#include "internal/routines/xaxpy.h"
+#include "internal/routines/level1/xaxpy.h"
 
 // BLAS level-2 includes
-#include "internal/routines/xgemv.h"
+#include "internal/routines/level2/xgemv.h"
 
 // BLAS level-3 includes
-#include "internal/routines/xgemm.h"
-#include "internal/routines/xsymm.h"
+#include "internal/routines/level3/xgemm.h"
+#include "internal/routines/level3/xsymm.h"
+#include "internal/routines/level3/xhemm.h"
+#include "internal/routines/level3/xsyrk.h"
+#include "internal/routines/level3/xherk.h"
+#include "internal/routines/level3/xsyr2k.h"
+#include "internal/routines/level3/xher2k.h"
+#include "internal/routines/level3/xtrmm.h"
 
 namespace clblast {
 // =================================================================================================
@@ -41,10 +47,8 @@ StatusCode Axpy(const size_t n, const T alpha,
   auto event_cpp = Event(*event);
   auto routine = Xaxpy<T>(queue_cpp, event_cpp);
 
-  // Loads the kernel source-code as an include (C++11 raw string literal)
-  std::string kernel_source =
-  #include "kernels/xaxpy.opencl"
-  auto status = routine.SetUp(kernel_source);
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
   if (status != StatusCode::kSuccess) { return status; }
 
   // Runs the routine
@@ -74,7 +78,7 @@ template StatusCode Axpy<double2>(const size_t, const double2,
 
 // GEMV
 template <typename T>
-StatusCode Gemv(const Layout layout, const Transpose transpose_a,
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
@@ -85,14 +89,12 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
   auto event_cpp = Event(*event);
   auto routine = Xgemv<T>(queue_cpp, event_cpp);
 
-  // Loads the kernel source-code as an include (C++11 raw string literal)
-  std::string kernel_source =
-  #include "kernels/xgemv.opencl"
-  auto status = routine.SetUp(kernel_source);
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
   if (status != StatusCode::kSuccess) { return status; }
 
   // Runs the routine
-  return routine.DoGemv(layout, transpose_a, m, n, alpha,
+  return routine.DoGemv(layout, a_transpose, m, n, alpha,
                         Buffer(a_buffer), a_offset, a_ld,
                         Buffer(x_buffer), x_offset, x_inc, beta,
                         Buffer(y_buffer), y_offset, y_inc);
@@ -127,7 +129,7 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
 
 // GEMM
 template <typename T>
-StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                 const size_t m, const size_t n, const size_t k, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
@@ -137,23 +139,12 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
   auto event_cpp = Event(*event);
   auto routine = Xgemm<T>(queue_cpp, event_cpp);
 
-  // Loads the kernel source-code as an include (C++11 raw string literal)
-  std::string common_source1 =
-  #include "kernels/copy.opencl"
-  std::string common_source2 =
-  #include "kernels/pad.opencl"
-  std::string common_source3 =
-  #include "kernels/transpose.opencl"
-  std::string common_source4 =
-  #include "kernels/padtranspose.opencl"
-  std::string kernel_source =
-  #include "kernels/xgemm.opencl"
-  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
-                              kernel_source);
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
   if (status != StatusCode::kSuccess) { return status; }
 
   // Runs the routine
-  return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
+  return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
                         Buffer(a_buffer), a_offset, a_ld,
                         Buffer(b_buffer), b_offset, b_ld, beta,
                         Buffer(c_buffer), c_offset, c_ld);
@@ -197,19 +188,8 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
   auto event_cpp = Event(*event);
   auto routine = Xsymm<T>(queue_cpp, event_cpp);
 
-  // Loads the kernel source-code as an include (C++11 raw string literal)
-  std::string common_source1 =
-  #include "kernels/copy.opencl"
-  std::string common_source2 =
-  #include "kernels/pad.opencl"
-  std::string common_source3 =
-  #include "kernels/transpose.opencl"
-  std::string common_source4 =
-  #include "kernels/padtranspose.opencl"
-  std::string kernel_source =
-  #include "kernels/xgemm.opencl"
-  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
-                            kernel_source);
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
   if (status != StatusCode::kSuccess) { return status; }
 
   // Runs the routine
@@ -244,4 +224,302 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
                                   cl_command_queue*, cl_event*);
 
 // =================================================================================================
+
+// HEMM
+template <typename T>
+StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n, const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xhemm<T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoHemm(layout, side, triangle, m, n, alpha,
+                        Buffer(a_buffer), a_offset, a_ld,
+                        Buffer(b_buffer), b_offset, b_ld, beta,
+                        Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
+                                 const size_t, const size_t, const float2,
+                                 const cl_mem, const size_t, const size_t,
+                                 const cl_mem, const size_t, const size_t, const float2,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Hemm<double2>(const Layout, const Side, const Triangle,
+                                  const size_t, const size_t, const double2,
+                                  const cl_mem, const size_t, const size_t,
+                                  const cl_mem, const size_t, const size_t, const double2,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// SYRK
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k, const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsyrk<T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
+                        Buffer(a_buffer), a_offset, a_ld, beta,
+                        Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
+                                const size_t, const size_t, const float,
+                                const cl_mem, const size_t, const size_t, const float,
+                                cl_mem, const size_t, const size_t,
+                                cl_command_queue*, cl_event*);
+template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const double,
+                                 const cl_mem, const size_t, const size_t, const double,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const float2,
+                                 const cl_mem, const size_t, const size_t, const float2,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const double2,
+                                  const cl_mem, const size_t, const size_t, const double2,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// HERK
+template <typename T>
+StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k, const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
+                        Buffer(a_buffer), a_offset, a_ld, beta,
+                        Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
+                                const size_t, const size_t, const float,
+                                const cl_mem, const size_t, const size_t, const float,
+                                cl_mem, const size_t, const size_t,
+                                cl_command_queue*, cl_event*);
+template StatusCode Herk<double>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const double,
+                                 const cl_mem, const size_t, const size_t, const double,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// SYR2K
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k, const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
+                         Buffer(a_buffer), a_offset, a_ld,
+                         Buffer(b_buffer), b_offset, b_ld, beta,
+                         Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const float,
+                                 const cl_mem, const size_t, const size_t,
+                                 const cl_mem, const size_t, const size_t, const float,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const double,
+                                  const cl_mem, const size_t, const size_t,
+                                  const cl_mem, const size_t, const size_t, const double,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const float2,
+                                  const cl_mem, const size_t, const size_t,
+                                  const cl_mem, const size_t, const size_t, const float2,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
+                                   const size_t, const size_t, const double2,
+                                   const cl_mem, const size_t, const size_t,
+                                   const cl_mem, const size_t, const size_t, const double2,
+                                   cl_mem, const size_t, const size_t,
+                                   cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// HER2K
+template <typename T, typename U>
+StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k, const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
+                         Buffer(a_buffer), a_offset, a_ld,
+                         Buffer(b_buffer), b_offset, b_ld, beta,
+                         Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
+                                        const size_t, const size_t, const float2,
+                                        const cl_mem, const size_t, const size_t,
+                                        const cl_mem, const size_t, const size_t, const float,
+                                        cl_mem, const size_t, const size_t,
+                                        cl_command_queue*, cl_event*);
+template StatusCode Her2k<double2,double>(const Layout, const Triangle, const Transpose,
+                                          const size_t, const size_t, const double2,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t, const double,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// TRMM
+template <typename T>
+StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtrmm<T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
+                        Buffer(a_buffer), a_offset, a_ld,
+                        Buffer(b_buffer), b_offset, b_ld);
+}
+template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
+                                const Transpose, const Diagonal,
+                                const size_t, const size_t, const float,
+                                const cl_mem, const size_t, const size_t,
+                                cl_mem, const size_t, const size_t,
+                                cl_command_queue*, cl_event*);
+template StatusCode Trmm<double>(const Layout, const Side, const Triangle,
+                                 const Transpose, const Diagonal,
+                                 const size_t, const size_t, const double,
+                                 const cl_mem, const size_t, const size_t,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Trmm<float2>(const Layout, const Side, const Triangle,
+                                 const Transpose, const Diagonal,
+                                 const size_t, const size_t, const float2,
+                                 const cl_mem, const size_t, const size_t,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Trmm<double2>(const Layout, const Side, const Triangle,
+                                  const Transpose, const Diagonal,
+                                  const size_t, const size_t, const double2,
+                                  const cl_mem, const size_t, const size_t,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// TRSM (currently disabled: the implementation below is commented out)
+/*
+template <typename T>
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtrsm<T>(queue_cpp, event_cpp);
+
+  // Compiles the routine's device kernels
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
+                        Buffer(a_buffer), a_offset, a_ld,
+                        Buffer(b_buffer), b_offset, b_ld);
+}
+template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
+                                const Transpose, const Diagonal,
+                                const size_t, const size_t, const float,
+                                const cl_mem, const size_t, const size_t,
+                                cl_mem, const size_t, const size_t,
+                                cl_command_queue*, cl_event*);
+template StatusCode Trsm<double>(const Layout, const Side, const Triangle,
+                                 const Transpose, const Diagonal,
+                                 const size_t, const size_t, const double,
+                                 const cl_mem, const size_t, const size_t,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Trsm<float2>(const Layout, const Side, const Triangle,
+                                 const Transpose, const Diagonal,
+                                 const size_t, const size_t, const float2,
+                                 const cl_mem, const size_t, const size_t,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Trsm<double2>(const Layout, const Side, const Triangle,
+                                  const Transpose, const Diagonal,
+                                  const size_t, const size_t, const double2,
+                                  const cl_mem, const size_t, const size_t,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+*/
+// =================================================================================================
 } // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 818c725f..12d63b99 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -39,6 +39,7 @@ R"(
   typedef float8 real8;
   typedef float16 real16;
   #define ZERO 0.0f
+  #define ONE 1.0f
 
 // Double-precision 
 #elif PRECISION == 64
@@ -48,6 +49,7 @@ R"(
   typedef double8 real8;
   typedef double16 real16;
   #define ZERO 0.0
+  #define ONE 1.0
 
 // Complex single-precision
 #elif PRECISION == 3232
@@ -61,6 +63,7 @@ R"(
                            real s8; real s9; real sA; real sB;
                            real sC; real sD; real sE; real sF;} real16;
   #define ZERO 0.0f
+  #define ONE 1.0f
 
 // Complex Double-precision
 #elif PRECISION == 6464
@@ -74,12 +77,16 @@ R"(
                             real s8; real s9; real sA; real sB;
                             real sC; real sD; real sE; real sF;} real16;
   #define ZERO 0.0
+  #define ONE 1.0
 #endif
 
 // =================================================================================================
 
-// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
-#define USE_CL_MAD 0
+// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
+// devices it is enabled by defining USE_CL_MAD beforehand (see src/routine.cc).
+#ifndef USE_CL_MAD
+  #define USE_CL_MAD 0
+#endif
 
 // Sets a variable to zero
 #if PRECISION == 3232 || PRECISION == 6464
@@ -88,6 +95,20 @@ R"(
   #define SetToZero(a) a = ZERO
 #endif
 
+// Sets only the imaginary part of a variable to zero (expands to nothing for real precisions)
+#if PRECISION == 3232 || PRECISION == 6464
+  #define ImagToZero(a) a.y = ZERO
+#else
+  #define ImagToZero(a) 
+#endif
+
+// Sets a variable to one
+#if PRECISION == 3232 || PRECISION == 6464
+  #define SetToOne(a) a.x = ONE; a.y = ZERO
+#else
+  #define SetToOne(a) a = ONE
+#endif
+
 // Multiply two complex variables (used in the define below)
 #if PRECISION == 3232 || PRECISION == 6464
   #define MulReal(a, b) a.x*b.x - a.y*b.y
@@ -122,6 +143,6 @@ R"(
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl
index f95b476b..7dde688b 100644
--- a/src/kernels/copy.opencl
+++ b/src/kernels/copy.opencl
@@ -68,6 +68,6 @@ __kernel void CopyMatrix(const int ld,
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
index 45eaef91..69324f20 100644
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@@ -86,7 +86,9 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
                           __global const real* restrict src,
                           const int dest_one, const int dest_two,
                           const int dest_ld, const int dest_offset,
-                          __global real* dest) {
+                          __global real* dest,
+                          const int upper, const int lower,
+                          const int diagonal_imag_zero) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -95,11 +97,20 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
     #pragma unroll
     for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
       const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_two >= id_one); }
+      else if (lower == 1) { condition = (id_two <= id_one); }
+      if (condition) {
 
         // Copies the value into the destination matrix. This is always within bounds of the source
         // matrix, as we know that the destination matrix is smaller than the source.
-        dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        if (id_two < dest_two && id_one < dest_one) {
+          real value = src[id_two*src_ld + id_one + src_offset];
+          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
+          dest[id_two*dest_ld + id_one + dest_offset] = value;
+        }
       }
     }
   }
@@ -127,15 +138,15 @@ __kernel void SymmLowerToSquared(const int src_dim,
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the lower-symmetric matrix
-        real value;
-        SetToZero(value);
+        real result;
+        SetToZero(result);
         if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { value = src[id_one*src_ld + id_two + src_offset]; }
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
         }
 
-        // Stores the value in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
       }
     }
   }
@@ -160,15 +171,171 @@ __kernel void SymmUpperToSquared(const int src_dim,
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the upper-symmetric matrix
-        real value;
-        SetToZero(value);
+        real result;
+        SetToZero(result);
         if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { value = src[id_one*src_ld + id_two + src_offset]; }
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
         }
 
-        // Stores the value in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+#if PRECISION == 3232 || PRECISION == 6464
+
+// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix' data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TrmmLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix' data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TrmmUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
       }
     }
   }
@@ -177,6 +344,6 @@ __kernel void SymmUpperToSquared(const int src_dim,
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
index 2f2aabd6..a6b70f0b 100644
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@@ -100,7 +100,9 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                    __global const real* restrict src,
                                    const int dest_one, const int dest_two,
                                    const int dest_ld, const int dest_offset,
-                                   __global real* dest) {
+                                   __global real* dest,
+                                   const int upper, const int lower,
+                                   const int diagonal_imag_zero) {
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -137,10 +139,18 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
       const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
       const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
 
-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
       }
     }
   }
@@ -149,6 +159,6 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
index 79ab1688..2aa53bb8 100644
--- a/src/kernels/transpose.opencl
+++ b/src/kernels/transpose.opencl
@@ -20,13 +20,16 @@ R"(
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef TRA_DIM
-  #define TRA_DIM 8    // Number of local threads in the two dimensions (x,y)
+  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
 #endif
 #ifndef TRA_WPT
-  #define TRA_WPT 1    // Work per thread in one dimension and vector-width in the other
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
 #endif
 #ifndef TRA_PAD
-  #define TRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
+#endif
+#ifndef TRA_SHUFFLE
+  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
 #endif
 
 // =================================================================================================
@@ -53,116 +56,94 @@ __kernel void TransposeMatrix(const int ld,
                               __global const realT* restrict src,
                               __global realT* dest) {
 
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
+  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
+  // way over workgroups, breaking memory-bank dependencies.
+  const int gid0 = get_group_id(0);
+  #if TRA_SHUFFLE == 1
+    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+  #else
+    const int gid1 = get_group_id(1);
+  #endif
 
-  // Loop over the work per thread
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
+
+  // Loops over the work per thread
   #pragma unroll
   for (int w_one=0; w_one<TRA_WPT; ++w_one) {
 
     // Computes the identifiers for the source matrix. Note that the local and global dimensions
     // do not correspond to each other!
-    const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
-    const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+    const int id_one = gid1 * TRA_DIM + get_local_id(0);
+    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
 
     // Loads data into the local memory
     realT value = src[id_two*(ld/TRA_WPT) + id_one];
-    #if TRA_WPT == 1
-      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
-    #elif TRA_WPT == 2
-      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
-      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
-    #elif TRA_WPT == 4
-      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
-      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
-      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
-      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
-    #elif TRA_WPT == 8
-      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
-      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
-      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
-      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
-      tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
-      tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
-      tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
-      tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
-    #elif TRA_WPT == 16
-      tile[get_local_id(1)*TRA_WPT +  0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
-      tile[get_local_id(1)*TRA_WPT +  1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
-      tile[get_local_id(1)*TRA_WPT +  2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
-      tile[get_local_id(1)*TRA_WPT +  3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
-      tile[get_local_id(1)*TRA_WPT +  4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
-      tile[get_local_id(1)*TRA_WPT +  5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
-      tile[get_local_id(1)*TRA_WPT +  6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
-      tile[get_local_id(1)*TRA_WPT +  7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
-      tile[get_local_id(1)*TRA_WPT +  8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
-      tile[get_local_id(1)*TRA_WPT +  9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
-      tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
-      tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
-      tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
-      tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
-      tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
-      tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
-    #endif
+    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
   }
 
   // Synchronizes all threads in a workgroup
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  // Loop over the work per thread
+  // Loads transposed data from the local memory
+  realT v[TRA_WPT];
+  #pragma unroll
+  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
+  }
+
+  // Performs the register-level transpose of the vectorized data
+  realT results[TRA_WPT];
+  #if TRA_WPT == 1
+    results[0] = v[0];
+  #elif TRA_WPT == 2
+    results[0] = (realT) (v[0].x, v[1].x);
+    results[1] = (realT) (v[0].y, v[1].y);
+  #elif TRA_WPT == 4
+    results[0] = (realT) (v[0].x, v[1].x, v[2].x, v[3].x);
+    results[1] = (realT) (v[0].y, v[1].y, v[2].y, v[3].y);
+    results[2] = (realT) (v[0].z, v[1].z, v[2].z, v[3].z);
+    results[3] = (realT) (v[0].w, v[1].w, v[2].w, v[3].w);
+  #elif TRA_WPT == 8
+    results[0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0);
+    results[1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1);
+    results[2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2);
+    results[3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3);
+    results[4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4);
+    results[5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5);
+    results[6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6);
+    results[7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7);
+  #elif TRA_WPT == 16
+    results[ 0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0);
+    results[ 1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1);
+    results[ 2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2);
+    results[ 3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3);
+    results[ 4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4);
+    results[ 5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5);
+    results[ 6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6);
+    results[ 7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7);
+    results[ 8] = (realT) (v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8);
+    results[ 9] = (realT) (v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9);
+    results[10] = (realT) (v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA);
+    results[11] = (realT) (v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB);
+    results[12] = (realT) (v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC);
+    results[13] = (realT) (v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD);
+    results[14] = (realT) (v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE);
+    results[15] = (realT) (v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF);
+  #endif
+
+  // Stores the results into the destination matrix
   #pragma unroll
   for (int w_two=0; w_two<TRA_WPT; ++w_two) {
-
-    // Computes the identifiers for the destination matrix
-    const int id_one = get_global_id(0);
-    const int id_two = get_global_id(1)*TRA_WPT + w_two;
-
-    // Stores the transposed value in the destination matrix
-    realT value;
-    #if TRA_WPT == 1
-      value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
-    #elif TRA_WPT == 2
-      value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
-      value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
-    #elif TRA_WPT == 4
-      value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
-      value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
-      value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
-      value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
-    #elif TRA_WPT == 8
-      value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
-      value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
-      value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
-      value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
-      value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
-      value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
-      value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
-      value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
-    #elif TRA_WPT == 16
-      value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  0];
-      value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  1];
-      value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  2];
-      value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  3];
-      value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  4];
-      value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  5];
-      value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  6];
-      value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  7];
-      value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  8];
-      value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT +  9];
-      value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
-      value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
-      value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
-      value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
-      value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
-      value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
-    #endif
-    dest[id_two*(ld/TRA_WPT) + id_one] = value;
+    const int id_one = gid0*TRA_DIM + get_local_id(0);
+    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
+    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
   }
 }
 
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/xaxpy.opencl
index 40c6c3bd..b7ffe9ff 100644
--- a/src/kernels/xaxpy.opencl
+++ b/src/kernels/xaxpy.opencl
@@ -123,6 +123,6 @@ __kernel void XaxpyFast(const int n, const real alpha,
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
index a4f45e90..8db0f557 100644
--- a/src/kernels/xgemm.opencl
+++ b/src/kernels/xgemm.opencl
@@ -127,6 +127,55 @@ R"(
 
 // =================================================================================================
 
+// Initializes all NWI x (MWI/VWM) accumulation registers in cpm to zero, component by component
+inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWI; ++ni) {  // NOTE(review): SetToZero presumably takes one 'real'; confirm
+      #if VWM == 1
+        SetToZero(cpm[ni][mi]);  // the register itself is passed, no component access needed
+      #elif VWM == 2
+        SetToZero(cpm[ni][mi].x);  // wider registers are zeroed one vector component at a time
+        SetToZero(cpm[ni][mi].y);
+      #elif VWM == 4
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+        SetToZero(cpm[ni][mi].z);
+        SetToZero(cpm[ni][mi].w);
+      #elif VWM == 8
+        SetToZero(cpm[ni][mi].s0);  // from 8 components onwards the numeric names s0..s7 are used
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+      #elif VWM == 16
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+        SetToZero(cpm[ni][mi].s8);
+        SetToZero(cpm[ni][mi].s9);
+        SetToZero(cpm[ni][mi].sA);  // components 10..15 use the hexadecimal names sA..sF
+        SetToZero(cpm[ni][mi].sB);
+        SetToZero(cpm[ni][mi].sC);
+        SetToZero(cpm[ni][mi].sD);
+        SetToZero(cpm[ni][mi].sE);
+        SetToZero(cpm[ni][mi].sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
@@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
 
 // =================================================================================================
 
-// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
-// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
-                         const real alpha, const real beta) {
-  #pragma unroll
-  for (int ni=0; ni<NWI; ++ni) {
-    #pragma unroll
-    for (int mi=0; mi<MWI/VWM; ++mi) {
-      #if STRM == 0
-        int mg = mi + get_local_id(0)*(MWI/VWM);
-      #elif STRM == 1
-        int mg = get_local_id(0) + mi*MDIMC;
-      #endif
-      #if STRN == 0
-        int ng = ni + get_local_id(1)*NWI;
-      #elif STRN == 1
-        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
-      #endif
-      int idm = mg + get_group_id(0)*(MWG/VWM);
-      int idn = ng + get_group_id(1)*NWG;
-      int index = idn*(kSizeM/VWM) + idm;
-      realM cval = cgm[index];
-      #if VWM == 1
-        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
-      #elif VWM == 2
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-      #elif VWM == 4
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
-        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
-      #elif VWM == 8
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-      #elif VWM == 16
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
-        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
-        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
-        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
-        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
-        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
-        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
-        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
-      #endif
-    }
-  }
-}
-
-// =================================================================================================
-
 // The vectorised multiply-add function
 inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
   #if USE_VECTOR_MAD == 1
@@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
 
 // =================================================================================================
 
-// Main entry of the kernel. This function contains the basic skeleton, the functionality is
-// provided by the inlined functions above
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real alpha, const real beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+                         const real alpha, const real beta) {
+  #pragma unroll
+  for (int ni=0; ni<NWI; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWI/VWM; ++mi) {
+      #if STRM == 0
+        int mg = mi + get_local_id(0)*(MWI/VWM);  // each thread owns MWI/VWM consecutive entries
+      #elif STRM == 1
+        int mg = get_local_id(0) + mi*MDIMC;  // a thread's entries are MDIMC apart
+      #endif
+      #if STRN == 0
+        int ng = ni + get_local_id(1)*NWI;  // each thread owns NWI consecutive entries
+      #elif STRN == 1
+        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;  // groups of VWN, strided
+      #endif
+      int idm = mg + get_group_id(0)*(MWG/VWM);  // adds the workgroup offset, in realM elements
+      int idn = ng + get_group_id(1)*NWG;  // adds the workgroup offset in the second dimension
 
-  // Combined thread identifier
+      // The final multiplication with alpha and the addition with beta*C
+      int index = idn*(kSizeM/VWM) + idm;  // cgm is indexed in realM elements: kSizeM/VWM per idn
+      realM cval = cgm[index];  // current value of C, passed to AXPBY as the beta operand
+      #if VWM == 1
+        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
+      #elif VWM == 2
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);  // applied per vector component
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+      #elif VWM == 4
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
+        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
+      #elif VWM == 8
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+      #elif VWM == 16
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
+        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
+        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
+        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
+        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
+        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
+        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
+        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+                      const __global realM* restrict agm, const __global realN* restrict bgm,
+                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
+                      #if SA == 1 && SB == 1
+                        , __local realM* alm, __local realN* blm
+                      #elif SA == 1
+                        , __local realM* alm
+                      #elif SB == 1
+                        , __local realN* blm
+                      #endif
+                      ) {
+
+  // Allocates workitem-private memory (registers)
+  realM apm[MWI/VWM];
+  realN bpm[NWI/VWN];
+
+  // Combined thread identifier (volatile to disable caching)
   #if SA == 1 || SB == 1
     volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
   #endif
 
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-  
-  // Allocates workitem-private memory (registers)
-  realM apm[MWI/VWM];
-  realN bpm[NWI/VWN];
-  realM cpm[NWI][MWI/VWM];
-
   // Initializes the accumulation registers
-  #pragma unroll
-  for (int mi=0; mi<MWI/VWM; ++mi) {
-    #pragma unroll
-    for (int ni=0; ni<NWI; ++ni) {
-      #if VWM == 1
-        SetToZero(cpm[ni][mi]);
-      #elif VWM == 2
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-      #elif VWM == 4
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-        SetToZero(cpm[ni][mi].z);
-        SetToZero(cpm[ni][mi].w);
-      #elif VWM == 8
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-      #elif VWM == 16
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-        SetToZero(cpm[ni][mi].s8);
-        SetToZero(cpm[ni][mi].s9);
-        SetToZero(cpm[ni][mi].sA);
-        SetToZero(cpm[ni][mi].sB);
-        SetToZero(cpm[ni][mi].sC);
-        SetToZero(cpm[ni][mi].sD);
-        SetToZero(cpm[ni][mi].sE);
-        SetToZero(cpm[ni][mi].sF);
-      #endif
-    }
-  }
+  InitAccRegisters(cpm);
 
   // Loops over all workgroup tiles
   for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
     #if SB == 1
       GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
     #endif
-
-    // Synchronizes all threads in a workgroup
     #if SA == 1 || SB == 1
       barrier(CLK_LOCAL_MEM_FENCE);
     #endif
@@ -552,20 +554,130 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
         MultiplyAccumulate(cpm, apm, bpm);
       }
     }
-
-    // Synchronizes all threads in a workgroup
     #if SA == 1 || SB == 1
       barrier(CLK_LOCAL_MEM_FENCE);
     #endif
   }
-
-  // Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }
 
 // =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
 
-// End of the C++11 raw string literal
-)";
+// Main entry point of the kernel. This is the upper-triangular version.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+  // Skips workgroups whose MWG * NWG tile lies entirely below the diagonal. The tile's last column
+  // is (group_id(1)+1)*NWG-1 and its first row is group_id(0)*MWG, also correct for MWG != NWG.
+  if ((get_group_id(1) + 1)*NWG <= get_group_id(0)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication into cpm; kSizeN is passed as both the M and N dimension
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Main entry point of the kernel. This is the lower-triangular version.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmLower(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+  // Skips workgroups whose MWG * NWG tile lies entirely above the diagonal. The tile's first column
+  // is group_id(1)*NWG and its last row is (group_id(0)+1)*MWG-1, also correct for MWG != NWG.
+  if (get_group_id(1)*NWG >= (get_group_id(0) + 1)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication into cpm; kSizeN is passed as both the M and N dimension
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Main entry point of the kernel. This is the regular full version (no workgroups are skipped).
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                    const real alpha, const real beta,
+                    const __global realM* restrict agm,
+                    const __global realN* restrict bgm,
+                    __global realM* cgm) {
+
+  // Allocates workgroup-private memory (local memory), only for the inputs enabled via SA/SB
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication into cpm; alm/blm are passed only if SA/SB enable them
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
 
 // =================================================================================================
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
index 4bb69090..65061717 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/xgemv.opencl
@@ -368,6 +368,6 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
 // =================================================================================================
 
 // End of the C++11 raw string literal
-)";
+)"
 
 // =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
index a4e0bb37..aded1a31 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -22,9 +22,10 @@ namespace clblast {
 std::vector<Routine::ProgramCache> Routine::program_cache_;
 
 // Constructor: not much here, because no status codes can be returned
-Routine::Routine(CommandQueue &queue, Event &event,
+Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
                  const std::vector<std::string> &routines, const Precision precision):
     precision_(precision),
+    routine_name_(name),
     queue_(queue),
     event_(event),
     context_(queue_.GetContext()),
@@ -33,14 +34,13 @@ Routine::Routine(CommandQueue &queue, Event &event,
     max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
     max_work_item_sizes_(device_.MaxWorkItemSizes()),
     max_work_group_size_(device_.MaxWorkGroupSize()),
-    db_(queue_, routines, precision_),
-    routines_(routines) {
+    db_(queue_, routines, precision_) {
 }
 
 // =================================================================================================
 
 // Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp(const std::string &routine_source) {
+StatusCode Routine::SetUp() {
 
   // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
   // be built and added to the cache.
@@ -63,12 +63,24 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
 
     // Loads the common header (typedefs and defines and such)
     std::string common_header =
-    #include "kernels/common.opencl"
+      #include "kernels/common.opencl"
+    ;
 
     // Collects the parameters for this device in the form of defines, and adds the precision
     auto defines = db_.GetDefines();
     defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-    auto source_string = defines + common_header + routine_source;
+
+    // Adds the name of the routine as a define
+    defines += "#define ROUTINE_"+routine_name_+"\n";
+
+    // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can
+    // improve performance, but might result in a reduced accuracy.
+    if (device_.Vendor() == "AMD") {
+      defines += "#define USE_CL_MAD 1\n";
+    }
+
+    // Combines everything together into a single source string
+    auto source_string = defines + common_header + source_string_;
 
     // Compiles the kernel
     try {
@@ -85,7 +97,7 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
       if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
 
       // Store the compiled program in the cache
-      program_cache_.push_back({program, device_name_, precision_, routines_});
+      program_cache_.push_back({program, device_name_, precision_, routine_name_});
     } catch (...) { return StatusCode::kBuildProgramFailure; }
   }
 
@@ -202,19 +214,22 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
 
 // =================================================================================================
 
-// Copies a matrix and pads it with zeros
+// Copies or transposes a matrix and pads/unpads it with zeros
 StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
                                            const size_t src_ld, const size_t src_offset,
                                            const Buffer &src,
                                            const size_t dest_one, const size_t dest_two,
                                            const size_t dest_ld, const size_t dest_offset,
                                            const Buffer &dest,
+                                           const Program &program, const bool do_pad,
                                            const bool do_transpose, const bool do_conjugate,
-                                           const bool pad, const Program &program) {
+                                           const bool upper, const bool lower,
+                                           const bool diagonal_imag_zero) {
 
   // Determines whether or not the fast-version could potentially be used
   auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
 
   // Determines the right kernel
   auto kernel_name = std::string{};
@@ -227,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
     }
     else {
       use_fast_kernel = false;
-      kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
+      kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
     }
   }
   else {
@@ -239,7 +254,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
     }
     else {
       use_fast_kernel = false;
-      kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
+      kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
     }
   }
 
@@ -264,9 +279,14 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
       kernel.SetArgument(7, static_cast<int>(dest_ld));
       kernel.SetArgument(8, static_cast<int>(dest_offset));
       kernel.SetArgument(9, dest());
-      if (pad) {
+      if (do_pad) {
         kernel.SetArgument(10, static_cast<int>(do_conjugate));
       }
+      else {
+        kernel.SetArgument(10, static_cast<int>(upper));
+        kernel.SetArgument(11, static_cast<int>(lower));
+        kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
+      }
     }
 
     // Launches the kernel and returns the error code. Uses global and local thread sizes based on
@@ -310,7 +330,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
 // otherwise.
 const Program& Routine::GetProgramFromCache() const {
   for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
+    if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
       return cached_program.program;
     }
   }
@@ -320,7 +340,7 @@ const Program& Routine::GetProgramFromCache() const {
 // Queries the cache to see whether or not the compiled kernel is already there
 bool Routine::ProgramIsInCache() const {
   for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
+    if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
   }
   return false;
 }
diff --git a/src/routines/xaxpy.cc b/src/routines/level1/xaxpy.cc
similarity index 96%
rename from src/routines/xaxpy.cc
rename to src/routines/level1/xaxpy.cc
index b68458da..e6b320d9 100644
--- a/src/routines/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/xaxpy.h"
+#include "internal/routines/level1/xaxpy.h"
 
 #include <string>
 #include <vector>
@@ -30,7 +30,10 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xaxpy"}, precision_) {
+    Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
+  source_string_ =
+    #include "../../kernels/xaxpy.opencl"
+  ;
 }
 
 // =================================================================================================
diff --git a/src/routines/xgemv.cc b/src/routines/level2/xgemv.cc
similarity index 97%
rename from src/routines/xgemv.cc
rename to src/routines/level2/xgemv.cc
index 1868dec4..a7052af8 100644
--- a/src/routines/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/xgemv.h"
+#include "internal/routines/level2/xgemv.h"
 
 #include <string>
 #include <vector>
@@ -30,7 +30,10 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xgemv"}, precision_) {
+    Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
+  source_string_ =
+    #include "../../kernels/xgemv.opencl"
+  ;
 }
 
 // =================================================================================================
diff --git a/src/routines/xgemm.cc b/src/routines/level3/xgemm.cc
similarity index 68%
rename from src/routines/xgemm.cc
rename to src/routines/level3/xgemm.cc
index 20cd2675..85524891 100644
--- a/src/routines/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/xgemm.h"
+#include "internal/routines/level3/xgemm.h"
 
 #include <string>
 #include <vector>
@@ -30,7 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+    Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+  source_string_ =
+    #include "../../kernels/copy.opencl"
+    #include "../../kernels/pad.opencl"
+    #include "../../kernels/transpose.opencl"
+    #include "../../kernels/padtranspose.opencl"
+    #include "../../kernels/xgemm.opencl"
+  ;
 }
 
 // =================================================================================================
@@ -95,31 +102,48 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   auto n_ceiled = Ceil(n, db_["NWG"]);
   auto k_ceiled = Ceil(k, db_["KWG"]);
 
-  // Allocates space on the device for padded and/or transposed input and output matrices.
+  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
   try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
 
     // Loads the program from the database
     auto& program = GetProgramFromCache();
 
-    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
-    // them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, program);
-    if (ErrorIn(status)) { return status; }
+    // Determines whether or not temporary matrices are needed
+    auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
+                     a_do_transpose == false && a_conjugate == false;
+    auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                     b_do_transpose == false && b_conjugate == false;
+    auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
+                     c_do_transpose == false;
 
-    // Only necessary for matrix C if it used both as input and output
-    if (beta != static_cast<T>(0)) {
+    // Creates the temporary matrices
+    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
+    auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+
+    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+    // case nothing has to be done, these kernels can be skipped.
+    if (!a_no_temp) {
+      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                      m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
+                                      program, true, a_do_transpose, a_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // As above, but now for matrix B
+    if (!b_no_temp) {
+      status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                                      program, true, b_do_transpose, b_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // As above, but now for matrix C. This is only necessary if C is used both as input and output.
+    if (!c_no_temp && beta != static_cast<T>(0)) {
       status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
-                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, program);
+                                      m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+                                      program, true, c_do_transpose, false);
       if (ErrorIn(status)) { return status; }
     }
 
@@ -133,9 +157,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       kernel.SetArgument(2, static_cast<int>(k_ceiled));
       kernel.SetArgument(3, alpha);
       kernel.SetArgument(4, beta);
-      kernel.SetArgument(5, temp_a());
-      kernel.SetArgument(6, temp_b());
-      kernel.SetArgument(7, temp_c());
+      kernel.SetArgument(5, a_temp());
+      kernel.SetArgument(6, b_temp());
+      kernel.SetArgument(7, c_temp());
 
       // Computes the global and local thread sizes
       auto global = std::vector<size_t>{
@@ -148,11 +172,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       status = RunKernel(kernel, global, local);
       if (ErrorIn(status)) { return status; }
 
-      // Runs the post-processing kernel
-      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_do_transpose, false, false, program);
-      if (ErrorIn(status)) { return status; }
+      // Runs the post-processing kernel if needed
+      if (!c_no_temp) {
+        status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+                                        c_one, c_two, c_ld, c_offset, c_buffer,
+                                        program, false, c_do_transpose, false);
+        if (ErrorIn(status)) { return status; }
+      }
 
       // Successfully finished the computation
       return StatusCode::kSuccess;
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
new file mode 100644
index 00000000..bc257c44
--- /dev/null
+++ b/src/routines/level3/xhemm.cc
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xhemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards the queue and event to the Xgemm<T> base class; adds no state of its own
+template <typename T>
+Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine: expands the hermitian matrix A to a full k-by-k matrix and then calls DoGemm
+template <typename T>
+StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the square (k by k) A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
+
+  // Temporary buffer (k*k elements) for the expanded copy of the hermitian matrix
+  try {
+    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the hermitian-to-squared kernel: first A, then the k-by-k temp_herm
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_herm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // hermitian-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_herm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_herm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine (all other status codes are passed on unchanged)
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhemm<float2>;
+template class Xhemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
new file mode 100644
index 00000000..fa42733f
--- /dev/null
+++ b/src/routines/level3/xher2k.cc
@@ -0,0 +1,207 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xher2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specializations of the static precision_ member: selects the Precision for each template type
+template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to the base class constructor and assembles the OpenCL kernel source
+template <typename T, typename U>
+Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+  source_string_ =
+    #include "../../kernels/copy.opencl"
+    #include "../../kernels/pad.opencl"
+    #include "../../kernels/transpose.opencl"
+    #include "../../kernels/padtranspose.opencl"
+    #include "../../kernels/xgemm.opencl"
+  ;
+}
+
+// =================================================================================================
+
+// The main routine: pads/transposes A, B and C, then runs the triangular Xgemm kernel twice
+template <typename T, typename U>
+StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                                const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const U beta,
+                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose)
+  auto ab_conjugate = (ab_transpose != Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
+                    (layout == Layout::kRowMajor && !ab_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (n must be a multiple of both MWG and NWG)
+  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // The padded/transposed input/output matrices: a failed allocation is caught as an exception
+  try {
+
+    // Loads the program from the cache
+    auto& program = GetProgramFromCache();
+
+    // Determines whether temporaries are needed (a1/b1: conjugate as requested, a2/b2: opposite)
+    auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                      ab_rotated == false && ab_conjugate == false;
+    auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                      ab_rotated == false && ab_conjugate == true;
+    auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                      ab_rotated == false && ab_conjugate == false;
+    auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                      ab_rotated == false && ab_conjugate == true;
+
+    // Creates the temporary matrices
+    auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
+    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+    // case nothing has to be done, these kernels can be skipped.
+    if (!a1_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
+                                      program, true, ab_rotated, ab_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+    if (!a2_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
+                                      program, true, ab_rotated, !ab_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+    if (!b1_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
+                                      program, true, ab_rotated, ab_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+    if (!b2_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
+                                      program, true, ab_rotated, !ab_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                    program, true, c_rotated, false);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, a1_temp());
+      kernel.SetArgument(5, b2_temp());
+      kernel.SetArgument(6, c_temp());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+      kernel.SetArgument(2, conjugate_alpha);
+      kernel.SetArgument(3, complex_one);
+      kernel.SetArgument(4, b1_temp());
+      kernel.SetArgument(5, a2_temp());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      program, false, c_rotated, false, upper, lower, true);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Explicitly instantiates the templated class for the two complex precisions
+template class Xher2k<float2,float>;
+template class Xher2k<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
new file mode 100644
index 00000000..ae350050
--- /dev/null
+++ b/src/routines/level3/xherk.cc
@@ -0,0 +1,175 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xherk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specializations of the static precision_ member: selects the Precision for each template type
+template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to the base class constructor and assembles the OpenCL kernel source
+template <typename T, typename U>
+Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
+    Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+  source_string_ =
+    #include "../../kernels/copy.opencl"
+    #include "../../kernels/pad.opencl"
+    #include "../../kernels/transpose.opencl"
+    #include "../../kernels/padtranspose.opencl"
+    #include "../../kernels/xgemm.opencl"
+  ;
+}
+
+// =================================================================================================
+
+// The main routine: pads/transposes two copies of A and C, then runs the triangular Xgemm kernel
+template <typename T, typename U>
+StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                              const size_t n, const size_t k,
+                              const U alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const U beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose)
+  auto a_conjugate = (a_transpose != Transpose::kNo);
+  auto b_conjugate = (a_transpose == Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
+                   (layout == Layout::kRowMajor && !a_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (n must be a multiple of both MWG and NWG)
+  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // The padded/transposed input/output matrices: a failed allocation is caught as an exception
+  try {
+
+    // Loads the program from the cache
+    auto& program = GetProgramFromCache();
+
+    // Determines whether or not temporary matrices are needed
+    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                     a_rotated == false && a_conjugate == false;
+    auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                     a_rotated == false && b_conjugate == false;
+
+    // Creates the temporary matrices
+    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+    // case nothing has to be done, these kernels can be skipped. Two copies are created.
+    if (!a_no_temp) {
+      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                                      program, true, a_rotated, a_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+    if (!b_no_temp) {
+      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                                      program, true, a_rotated, b_conjugate);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                    program, true, c_rotated, false);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, complex_alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, a_temp());
+      kernel.SetArgument(5, b_temp());
+      kernel.SetArgument(6, c_temp());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      program, false, c_rotated, false, upper, lower, true);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Explicitly instantiates the templated class for the two complex precisions
+template class Xherk<float2,float>;
+template class Xherk<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/level3/xsymm.cc
similarity index 96%
rename from src/routines/xsymm.cc
rename to src/routines/level3/xsymm.cc
index 97f35be8..1d17f0eb 100644
--- a/src/routines/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/xsymm.h"
+#include "internal/routines/level3/xsymm.h"
 
 #include <string>
 #include <vector>
@@ -42,14 +42,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
 
   // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
   // left) or B (on the right) in the Xgemm routine.
-  size_t k = (side == Side::kLeft) ? m : n;
+  auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the squared A matrix
   auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
   if (ErrorIn(status)) { return status; }
 
   // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
   bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                    (triangle == Triangle::kLower && layout == Layout::kRowMajor));
   auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
@@ -75,7 +75,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
       kernel.SetArgument(7, temp_symm());
 
       // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetry-to-squared kernel uses the same parameters.
+      // symmetric-to-squared kernel uses the same parameters.
       auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                         Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
       auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..7ab3430a
--- /dev/null
+++ b/src/routines/level3/xsyr2k.cc
@@ -0,0 +1,186 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyr2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specializations of the static precision_ member: selects the Precision for each template type
+template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to the base class constructor and assembles the OpenCL kernel source
+template <typename T>
+Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+  source_string_ =
+    #include "../../kernels/copy.opencl"
+    #include "../../kernels/pad.opencl"
+    #include "../../kernels/transpose.opencl"
+    #include "../../kernels/padtranspose.opencl"
+    #include "../../kernels/xgemm.opencl"
+  ;
+}
+
+// =================================================================================================
+
+// The main routine: pads/transposes A, B and C, then runs the triangular Xgemm kernel twice
+template <typename T>
+StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                              const size_t n, const size_t k,
+                              const T alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                              const T beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
+                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (n must be a multiple of both MWG and NWG)
+  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // The padded/transposed input/output matrices: a failed allocation is caught as an exception
+  try {
+
+    // Loads the program from the cache
+    auto& program = GetProgramFromCache();
+
+    // Determines whether or not temporary matrices are needed
+    auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                     ab_rotated == false;
+    auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                     ab_rotated == false;
+
+    // Creates the temporary matrices
+    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
+    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+    // case nothing has to be done, these kernels can be skipped.
+    if (!a_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                                      program, true, ab_rotated, false);
+      if (ErrorIn(status)) { return status; }
+    }
+    if (!b_no_temp) {
+      status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                                      program, true, ab_rotated, false);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                    program, true, c_rotated, false);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, a_temp());
+      kernel.SetArgument(5, b_temp());
+      kernel.SetArgument(6, c_temp());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+      auto one = static_cast<T>(1);
+      kernel.SetArgument(3, one);
+      kernel.SetArgument(4, b_temp());
+      kernel.SetArgument(5, a_temp());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      program, false, c_rotated, false, upper, lower, false);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Explicitly instantiates the templated class for the four supported precisions
+template class Xsyr2k<float>;
+template class Xsyr2k<double>;
+template class Xsyr2k<float2>;
+template class Xsyr2k<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..c6feb5e6
--- /dev/null
+++ b/src/routines/level3/xsyrk.cc
@@ -0,0 +1,163 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyrk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specializes the static 'precision_' member: one Precision value for each template argument T
+template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to the Routine base class and assigns the kernel sources to source_string_
+template <typename T>
+Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
+    Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+  source_string_ = // NOTE(review): the included files presumably expand to string literals - confirm
+    #include "../../kernels/copy.opencl"
+    #include "../../kernels/pad.opencl"
+    #include "../../kernels/transpose.opencl"
+    #include "../../kernels/padtranspose.opencl"
+    #include "../../kernels/xgemm.opencl"
+  ; // Terminates the assignment to 'source_string_' started above
+}
+
+// =================================================================================================
+
+// The main routine: stages A and C in padded buffers, runs XgemmUpper/XgemmLower, copies C back
+template <typename T>
+StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Also tests that the
+  // leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k. Both MWG and NWG divide n_ceiled when computing the
+  // global thread sizes below, so n is rounded up with respect to both of them.
+  auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
+  try {
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Determines whether or not temporary matrices are needed
+    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                     a_rotated == false;
+
+    // Creates the temporary matrices
+    auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+    // case nothing has to be done, these kernels can be skipped.
+    if (!a_no_temp) {
+      status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                                      program, true, a_rotated, false);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                    program, true, c_rotated, false);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, a_temp());
+      kernel.SetArgument(5, a_temp()); // The (staged) A matrix is passed as both input matrices
+      kernel.SetArgument(6, c_temp());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      program, false, c_rotated, false, upper, lower, false);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Explicitly instantiates the templated class for the float, double, float2 and double2 types
+template class Xsyrk<float>;
+template class Xsyrk<double>;
+template class Xsyrk<float2>;
+template class Xsyrk<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..52f272e3
--- /dev/null
+++ b/src/routines/level3/xtrmm.cc
@@ -0,0 +1,135 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xtrmm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards the queue and event to the Xgemm<T> base class; does nothing else
+template <typename T>
+Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine: expands the triangular A into a full k-by-k matrix and then calls DoGemm
+template <typename T>
+StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension: the triangular matrix is A (left) or B (right) in the Xgemm routine
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the triangular A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
+
+  // Determines whether or not the triangular matrix is unit-diagonal
+  auto unit_diagonal = (diagonal == Diagonal::kUnit);
+
+  // Temporary buffer for a copy of the triangular matrix
+  try {
+    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the triangular one, so that the regular Xgemm routine can run
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the triangular-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_triangular());
+      kernel.SetArgument(8, static_cast<int>(unit_diagonal));
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // triangular-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // NOTE(review): B is both an input and the output of DoGemm; verify this aliasing is safe
+      // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, a_transpose, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_triangular, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+      }
+
+      // ... with "B := alpha*B*A". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, a_transpose,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_triangular, 0, k,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+          default: break; // All other status codes do not refer to A or B: passed on unchanged
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmm<float>;
+template class Xtrmm<double>;
+template class Xtrmm<float2>;
+template class Xtrmm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
index da223bf0..125b076e 100644
--- a/src/tuning/copy.cc
+++ b/src/tuning/copy.cc
@@ -30,11 +30,10 @@ void CopyTune(const Arguments<T> &args,
   // This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
   // of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
   // chosen as a representative.
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/copy.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/copy.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
   tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
 
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
index b6254cd5..584415c7 100644
--- a/src/tuning/pad.cc
+++ b/src/tuning/pad.cc
@@ -30,11 +30,10 @@ void PadTune(const Arguments<T> &args,
   // This points to the PadMatrix kernel as found in the CLBlast library. This is just one
   // example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
   // to be chosen as a representative.
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/pad.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/pad.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
   tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
 
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
index c84e5950..25044556 100644
--- a/src/tuning/padtranspose.cc
+++ b/src/tuning/padtranspose.cc
@@ -30,11 +30,10 @@ void PadTransposeTune(const Arguments<T> &args,
   // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
   // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
   // to be chosen as a representative.
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/padtranspose.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/padtranspose.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
   tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
 
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
index 90392866..8963a688 100644
--- a/src/tuning/transpose.cc
+++ b/src/tuning/transpose.cc
@@ -30,11 +30,10 @@ void TransposeTune(const Arguments<T> &args,
   // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
   // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
   // to be chosen as a representative.
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/transpose.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/transpose.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
   tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
 
@@ -42,6 +41,7 @@ void TransposeTune(const Arguments<T> &args,
   tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
   tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
   tuner.AddParameter(id, "TRA_PAD", {0, 1});
+  tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
 
   // Tests for a specific precision
   tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc
index 0439ed05..20b5978e 100644
--- a/src/tuning/xaxpy.cc
+++ b/src/tuning/xaxpy.cc
@@ -34,11 +34,10 @@ void XaxpyTune(const Arguments<T> &args,
   }
 
   // This points to the XaxpyFast kernel as found in the CLBlast library
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/xaxpy.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/xaxpy.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
   tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
 
diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc
index aba56810..3fe58ed5 100644
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/xgemm.cc
@@ -30,11 +30,10 @@ void XgemmTune(const Arguments<T> &args,
                cltune::Tuner &tuner) {
 
   // This points to the Xgemm kernel as found in the CLBlast library and its golden reference
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/xgemm.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/xgemm.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
   tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});
 
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 48df6f25..a9d88e4b 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -36,11 +36,10 @@ void XgemvTune(const Arguments<T> &args, const size_t variation,
   auto a_rotated = (variation == 3) ? 1 : 0;
 
   // This points to the Xgemv kernel as found in the CLBlast library
-  std::string common_source =
-  #include "../src/kernels/common.opencl"
-  std::string kernel_source =
-  #include "../src/kernels/xgemv.opencl"
-  auto sources = common_source + kernel_source;
+  std::string sources =
+    #include "../src/kernels/common.opencl"
+    #include "../src/kernels/xgemv.opencl"
+  ;
   auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
   tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
 
diff --git a/src/utilities.cc b/src/utilities.cc
index 98570088..62abbb91 100644
--- a/src/utilities.cc
+++ b/src/utilities.cc
@@ -79,6 +79,13 @@ std::string ToString(Triangle value) {
   }
 }
 template <>
+std::string ToString(Diagonal value) { // Formats as "<number> (<name>)"
+  const auto number = ToString(static_cast<int>(value));
+  if (value == Diagonal::kUnit) { return number+" (unit)"; }
+  if (value == Diagonal::kNonUnit) { return number+" (non-unit)"; }
+  return number; // Not a known Diagonal value (e.g. cast from an integer): the number only
+}
+template <>
 std::string ToString(Precision value) {
   switch(value) {
     case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
@@ -143,6 +150,7 @@ template Layout GetArgument<Layout>(const int, char **, std::string&, const std:
 template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
 template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
 template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
+template Diagonal GetArgument<Diagonal>(const int, char **, std::string&, const std::string&, const Diagonal);
 template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..ac44caec
--- /dev/null
+++ b/test/correctness/routines/level1/xaxpy.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xaxpy routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level1/xaxpy.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Correctness tester for a single data-type: assembles the regular and invalid-buffer test-cases
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+  using Test = TestXaxpy<T>;
+
+  // Sets up the tester with the callbacks of the routine under test
+  TestBlas<T,T> tester{argc, argv, silent, name, Test::GetOptions(),
+                       Test::RunRoutine, Test::RunReference,
+                       Test::DownloadResult, Test::GetResultIndex,
+                       Test::ResultID1, Test::ResultID2};
+
+  // The arguments of the routine: updated in-place for every test-case
+  Arguments<T> args{};
+
+  // Regular tests: all combinations of sizes, increments, offsets and alpha values
+  std::vector<Arguments<T>> regular_cases;
+  for (const auto &size: tester.kVectorDims) { args.n = size;
+    for (const auto &inc_x: tester.kIncrements) { args.x_inc = inc_x;
+      for (const auto &off_x: tester.kOffsets) { args.x_offset = off_x;
+        for (const auto &inc_y: tester.kIncrements) { args.y_inc = inc_y;
+          for (const auto &off_y: tester.kOffsets) { args.y_offset = off_y;
+            for (const auto &scalar: tester.kAlphaValues) { args.alpha = scalar;
+              args.x_size = Test::GetSizeX(args);
+              args.y_size = Test::GetSizeY(args);
+              if (!(args.x_size<1 || args.y_size<1)) { regular_cases.push_back(args); }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Invalid-buffer tests: fixed dimensions combined with every pair of vector buffer sizes
+  std::vector<Arguments<T>> invalid_cases;
+  args.n = tester.kBufferSize;
+  args.y_inc = 1; args.x_inc = 1;
+  args.y_offset = 0; args.x_offset = 0;
+  for (const auto &size_x: tester.kVecSizes) { args.x_size = size_x;
+    for (const auto &size_y: tester.kVecSizes) { args.y_size = size_y;
+      invalid_cases.push_back(args);
+    }
+  }
+
+  // Executes both sets of test-cases under the same label
+  const auto label = "default";
+  tester.TestRegular(regular_cases, label);
+  tester.TestInvalid(invalid_cases, label);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Program entry (outside of the clblast namespace): runs the tester once for each data-type
+int main(int argc, char *argv[]) {
+  const bool quiet = true;
+  clblast::RunTest<float>(argc, argv, !quiet, "SAXPY");
+  clblast::RunTest<double>(argc, argv, quiet, "DAXPY");
+  clblast::RunTest<clblast::float2>(argc, argv, quiet, "CAXPY");
+  clblast::RunTest<clblast::double2>(argc, argv, quiet, "ZAXPY");
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc
new file mode 100644
index 00000000..4e6942cc
--- /dev/null
+++ b/test/correctness/routines/level2/xgemv.cc
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xgemv routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Correctness tester for a single data-type: runs regular and invalid-buffer tests per layout
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+  using Test = TestXgemv<T>;
+
+  // Sets up the tester with the callbacks of the routine under test
+  TestBlas<T,T> tester{argc, argv, silent, name, Test::GetOptions(),
+                       Test::RunRoutine, Test::RunReference,
+                       Test::DownloadResult, Test::GetResultIndex,
+                       Test::ResultID1, Test::ResultID2};
+
+  // The arguments of the routine: updated in-place for every test-case
+  Arguments<T> args{};
+
+  // One batch of tests for every combination of data-layout and transpose option
+  for (const auto &data_layout: tester.kLayouts) { args.layout = data_layout;
+    for (const auto &a_trans: tester.kTransposes) { args.a_transpose = a_trans;
+
+      // Regular tests: all combinations of dimensions, offsets, increments, alpha and beta
+      std::vector<Arguments<T>> regular_cases;
+      for (const auto &rows: tester.kMatrixVectorDims) { args.m = rows;
+        for (const auto &cols: tester.kMatrixVectorDims) { args.n = cols;
+          for (const auto &lead_a: tester.kMatrixVectorDims) { args.a_ld = lead_a;
+            for (const auto &off_a: tester.kOffsets) { args.a_offset = off_a;
+              for (const auto &inc_x: tester.kIncrements) { args.x_inc = inc_x;
+                for (const auto &off_x: tester.kOffsets) { args.x_offset = off_x;
+                  for (const auto &inc_y: tester.kIncrements) { args.y_inc = inc_y;
+                    for (const auto &off_y: tester.kOffsets) { args.y_offset = off_y;
+                      for (const auto &scalar_a: tester.kAlphaValues) { args.alpha = scalar_a;
+                        for (const auto &scalar_b: tester.kBetaValues) { args.beta = scalar_b;
+                          args.a_size = Test::GetSizeA(args);
+                          args.x_size = Test::GetSizeX(args);
+                          args.y_size = Test::GetSizeY(args);
+                          const auto empty = (args.a_size<1 || args.x_size<1 || args.y_size<1);
+                          if (!empty) { regular_cases.push_back(args); }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Invalid-buffer tests: fixed dimensions combined with all matrix/vector buffer sizes
+      std::vector<Arguments<T>> invalid_cases;
+      args.n = tester.kBufferSize; args.m = args.n;
+      args.a_ld = tester.kBufferSize;
+      args.y_inc = 1; args.x_inc = 1;
+      args.y_offset = 0; args.x_offset = 0; args.a_offset = 0;
+      for (const auto &size_a: tester.kMatSizes) { args.a_size = size_a;
+        for (const auto &size_x: tester.kVecSizes) { args.x_size = size_x;
+          for (const auto &size_y: tester.kVecSizes) { args.y_size = size_y;
+            invalid_cases.push_back(args);
+          }
+        }
+      }
+
+      // Executes both sets of test-cases, labelled with the layout and transpose option
+      const auto label = ToString(data_layout)+" "+ToString(a_trans);
+      tester.TestRegular(regular_cases, label);
+      tester.TestInvalid(invalid_cases, label);
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Program entry (outside of the clblast namespace): runs the tester once for each data-type
+int main(int argc, char *argv[]) {
+  const bool quiet = true;
+  clblast::RunTest<float>(argc, argv, !quiet, "SGEMV");
+  clblast::RunTest<double>(argc, argv, quiet, "DGEMV");
+  clblast::RunTest<clblast::float2>(argc, argv, quiet, "CGEMV");
+  clblast::RunTest<clblast::double2>(argc, argv, quiet, "ZGEMV");
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc
new file mode 100644
index 00000000..c1ce8fe2
--- /dev/null
+++ b/test/correctness/routines/level3/xgemm.cc
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xgemm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xgemm: builds regular and invalid-buffer cases and runs them for T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXgemm<T>
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
+                       TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
+                       TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
+                       TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Outer loops: one named test case per (layout, A-transpose, B-transpose) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+      for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &k: tester.kMatrixDims) { args.k = k;
+              for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+                for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                  for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                    for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                      for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                        for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                          for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                            for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                              args.a_size = TestXgemm<T>::GetSizeA(args);
+                              args.b_size = TestXgemm<T>::GetSizeB(args);
+                              args.c_size = TestXgemm<T>::GetSizeC(args);
+                              if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                              regular_test_vector.push_back(args);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the S/D/C/ZGEMM tests; only the first passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SGEMM");
+  clblast::RunTest<double>(argc, argv, true, "DGEMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc
new file mode 100644
index 00000000..4d66a57f
--- /dev/null
+++ b/test/correctness/routines/level3/xhemm.cc
@@ -0,0 +1,98 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xhemm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xhemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xhemm: builds regular and invalid-buffer cases and runs them for T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXhemm<T>
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
+                       TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
+                       TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
+                       TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Outer loops: one named test case per (layout, side, triangle) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXhemm<T>::GetSizeA(args);
+                            args.b_size = TestXhemm<T>::GetSizeB(args);
+                            args.c_size = TestXhemm<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the CHEMM and ZHEMM tests; only the first is non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2>(argc, argv, false, "CHEMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc
new file mode 100644
index 00000000..ba5260fb
--- /dev/null
+++ b/test/correctness/routines/level3/xher2k.cc
@@ -0,0 +1,100 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xher2k routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xher2k.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xher2k: T and U parameterize TestBlas<T,U>; args are Arguments<U>
+template <typename T, typename U>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXher2k<T,U>
+  TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
+                       TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
+                       TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
+                       TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<U>{};
+
+  // Outer loops: one named test case per (layout, triangle, shared A/B transpose) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
+        args.a_transpose = ab_transpose;                                  // valid BLAS option
+        args.b_transpose = ab_transpose;
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<U>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXher2k<T,U>::GetSizeA(args);
+                            args.b_size = TestXher2k<T,U>::GetSizeB(args);
+                            args.c_size = TestXher2k<T,U>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<U>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the CHER2K and ZHER2K tests; only the first is non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
+  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc
new file mode 100644
index 00000000..7a4a7278
--- /dev/null
+++ b/test/correctness/routines/level3/xherk.cc
@@ -0,0 +1,92 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xherk routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xherk.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xherk: T and U parameterize TestBlas<T,U>; args are Arguments<U>
+template <typename T, typename U>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXherk<T,U>
+  TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
+                       TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
+                       TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
+                       TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<U>{};
+
+  // Outer loops: one named test case per (layout, triangle, A-transpose) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
+        args.a_transpose = a_transpose;                                  // valid BLAS option
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<U>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                        args.a_size = TestXherk<T,U>::GetSizeA(args);
+                        args.c_size = TestXherk<T,U>::GetSizeC(args);
+                        if (args.a_size<1 || args.c_size<1) { continue; }
+                        regular_test_vector.push_back(args);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<U>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the CHERK and ZHERK tests; only the first is non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
+  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc
new file mode 100644
index 00000000..851efff2
--- /dev/null
+++ b/test/correctness/routines/level3/xsymm.cc
@@ -0,0 +1,100 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsymm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsymm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xsymm: builds regular and invalid-buffer cases and runs them for T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXsymm<T>
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
+                       TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
+                       TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
+                       TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Outer loops: one named test case per (layout, side, triangle) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsymm<T>::GetSizeA(args);
+                            args.b_size = TestXsymm<T>::GetSizeB(args);
+                            args.c_size = TestXsymm<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the S/D/C/ZSYMM tests; only the first passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYMM");
+  clblast::RunTest<double>(argc, argv, true, "DSYMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..61ea59a3
--- /dev/null
+++ b/test/correctness/routines/level3/xsyr2k.cc
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyr2k routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsyr2k.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xsyr2k: builds regular and invalid-buffer cases and runs them for T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXsyr2k<T>
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
+                       TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
+                       TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
+                       TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Outer loops: one named test case per (layout, triangle, shared A/B transpose) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = ab_transpose;                            // is not supported by clBLAS
+        args.b_transpose = ab_transpose;
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsyr2k<T>::GetSizeA(args);
+                            args.b_size = TestXsyr2k<T>::GetSizeB(args);
+                            args.c_size = TestXsyr2k<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the S/D/C/ZSYR2K tests; only the first is non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYR2K");
+  clblast::RunTest<double>(argc, argv, true, "DSYR2K");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..126e201b
--- /dev/null
+++ b/test/correctness/routines/level3/xsyrk.cc
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyrk routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsyrk.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester for Xsyrk: builds regular and invalid-buffer cases and runs them for T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester that is driven by the options and static callbacks of TestXsyrk<T>
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
+                       TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
+                       TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
+                       TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
+
+  // Working argument set: updated in place by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Outer loops: one named test case per (layout, triangle, A-transpose) combination
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = a_transpose;                            // is not supported by clBLAS
+
+        // Regular tests: all parameter combinations, except those with a computed size < 1
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                        args.a_size = TestXsyrk<T>::GetSizeA(args);
+                        args.c_size = TestXsyrk<T>::GetSizeC(args);
+                        if (args.a_size<1 || args.c_size<1) { continue; }
+                        regular_test_vector.push_back(args);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: dims/lds set to kBufferSize, offsets zero, sizes from kMatSizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+
+        // Runs both test sets under a name built from the current outer-loop values
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main (outside namespace clblast): runs the S/D/C/ZSYRK tests; only the first passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYRK");
+  clblast::RunTest<double>(argc, argv, true, "DSYRK");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..5f04bb18
--- /dev/null
+++ b/test/correctness/routines/level3/xtrmm.cc
@@ -0,0 +1,96 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xtrmm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xtrmm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: builds and runs the regular and invalid-buffer Xtrmm tests for type T
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester from the options and the other TestXtrmm<T> members listed below
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
+                       TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
+                       TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
+                       TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
+
+  // Holds the routine arguments: modified in-place by the loops below, copied for each test-case
+  auto args = Arguments<T>{};
+
+  // Loops over all layout/side/triangle/transpose/diagonal combinations (one named case each)
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+        for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+          for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
+
+            // Regular tests: one entry per m/n/ld/offset/alpha combination with non-empty buffers
+            auto regular_test_vector = std::vector<Arguments<T>>{};
+            for (auto &m: tester.kMatrixDims) { args.m = m;
+              for (auto &n: tester.kMatrixDims) { args.n = n;
+                for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+                  for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                    for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                      for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          args.a_size = TestXtrmm<T>::GetSizeA(args);
+                          args.b_size = TestXtrmm<T>::GetSizeB(args);
+                          if (args.a_size<1 || args.b_size<1) { continue; }
+                          regular_test_vector.push_back(args);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+
+            // Invalid-buffer tests: fixed dimensions, buffer sizes taken from tester.kMatSizes
+            auto invalid_test_vector = std::vector<Arguments<T>>{};
+            args.m = args.n = tester.kBufferSize;
+            args.a_ld = args.b_ld = tester.kBufferSize;
+            args.a_offset = args.b_offset = 0;
+            for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+              for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+                invalid_test_vector.push_back(args);
+              }
+            }
+
+            // Runs both test-sets, labelled with the current combination of options
+            const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
+                                   ToString(a_transpose)+" "+ToString(diagonal);
+            tester.TestRegular(regular_test_vector, case_name);
+            tester.TestInvalid(invalid_test_vector, case_name);
+          }
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): runs the TRMM tests once per data-type
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "STRMM"); // the only call with silent == false
+  clblast::RunTest<double>(argc, argv, true, "DTRMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
+  return 0; // NOTE(review): exit code is always 0; no RunTest result is checked
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/xaxpy.cc b/test/correctness/routines/xaxpy.cc
deleted file mode 100644
index 45dcf6bb..00000000
--- a/test/correctness/routines/xaxpy.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
-//
-// =================================================================================================
-
-#include "wrapper_clblas.h"
-#include "correctness/testxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
-template <typename T>
-void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &x_vec, const Buffer &y_vec,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Axpy(args.n, args.alpha,
-                x_vec(), args.x_offset, args.x_inc,
-                y_vec(), args.y_offset, args.y_inc,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &x_vec, const Buffer &y_vec,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXaxpy(args.n, args.alpha,
-                              x_vec(), args.x_offset, args.x_inc,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
-                                                kArgXOffset, kArgYOffset, kArgAlpha};
-
-  // Creates a tester
-  TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
-
-  // Runs the tests
-  const auto case_name = "default";
-  tester.TestRegular(args, case_name);
-  tester.TestInvalidBufferSizes(args, case_name);
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
-  clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
-  clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
-  clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc
deleted file mode 100644
index 4129e17c..00000000
--- a/test/correctness/routines/xgemm.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
-//
-// =================================================================================================
-
-#include "wrapper_clblas.h"
-#include "correctness/testabc.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
-template <typename T>
-void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Gemm(args.layout, args.a_transpose, args.b_transpose,
-                args.m, args.n, args.k,
-                args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                b_mat(), args.b_offset, args.b_ld,
-                args.beta,
-                c_mat(), args.c_offset, args.c_ld,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasTranspose>(args.b_transpose),
-                              args.m, args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
-                                                kArgATransp, kArgBTransp,
-                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgBOffset, kArgCOffset};
-
-  // Creates a tester
-  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) {
-      args.a_transpose = a_transpose;
-      for (auto &b_transpose: tester.kTransposes) {
-        args.b_transpose = b_transpose;
-        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
-
-        // Runs the tests
-        tester.TestRegular(args, case_name);
-        tester.TestInvalidBufferSizes(args, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
-  clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
-  clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
-  clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xgemv.cc b/test/correctness/routines/xgemv.cc
deleted file mode 100644
index 1f484eb4..00000000
--- a/test/correctness/routines/xgemv.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
-//
-// =================================================================================================
-
-#include "wrapper_clblas.h"
-#include "correctness/testaxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
-template <typename T>
-void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                x_vec(), args.x_offset, args.x_inc, args.beta,
-                y_vec(), args.y_offset, args.y_inc,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.m, args.n, args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              x_vec(), args.x_offset, args.x_inc, args.beta,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
-                                                kArgALeadDim, kArgXInc, kArgYInc,
-                                                kArgAOffset, kArgXOffset, kArgYOffset};
-
-  // Creates a tester
-  TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) {
-      args.a_transpose = a_transpose;
-      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
-
-      // Runs the tests
-      tester.TestRegular(args, case_name);
-      tester.TestInvalidBufferSizes(args, case_name);
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
-  clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
-  clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
-  clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc
deleted file mode 100644
index d769177f..00000000
--- a/test/correctness/routines/xsymm.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
-//
-// =================================================================================================
-
-#include "wrapper_clblas.h"
-#include "correctness/testabc.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
-template <typename T>
-void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Symm(args.layout, args.side, args.triangle,
-                args.m, args.n,
-                args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                b_mat(), args.b_offset, args.b_ld,
-                args.beta,
-                c_mat(), args.c_offset, args.c_ld,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
-                                                kArgSide, kArgTriangle,
-                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgBOffset, kArgCOffset};
-
-  // Creates a tester
-  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &side: {Side::kLeft, Side::kRight}) {
-      args.side = side;
-      for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
-        args.triangle = triangle;
-        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
-
-        // Runs the tests
-        tester.TestRegular(args, case_name);
-        tester.TestInvalidBufferSizes(args, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
-  clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
-  clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
-  clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/testabc.cc b/test/correctness/testabc.cc
deleted file mode 100644
index eed17560..00000000
--- a/test/correctness/testabc.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestABC class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testabc.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor, initializes the base class tester and input data
-template <typename T>
-TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
-                    const std::string &name, const std::vector<std::string> &options,
-                    const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-
-  // Creates test input data
-  a_source_.resize(max_dim*max_ld + max_offset);
-  b_source_.resize(max_dim*max_ld + max_offset);
-  c_source_.resize(max_dim*max_ld + max_offset);
-  PopulateVector(a_source_);
-  PopulateVector(b_source_);
-  PopulateVector(c_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters
-template <typename T>
-void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-  auto c_rotated = (args.layout == Layout::kRowMajor);
-
-  // Iterates over the matrix dimensions
-  for (auto &m: kMatrixDims) {
-    args.m = m;
-    for (auto &n: kMatrixDims) {
-      args.n = n;
-      for (auto &k: kMatrixDims) {
-        args.k = k;
-
-        // Computes the second dimensions of the matrices taking the rotation into account
-        auto a_two = (a_rotated) ? m : k;
-        auto b_two = (b_rotated) ? k : n;
-        auto c_two = (c_rotated) ? m : n;
-
-        // Iterates over the leading-dimension values and the offsets
-        for (auto &a_ld: kMatrixDims) {
-          args.a_ld = a_ld;
-          for (auto &a_offset: kOffsets) {
-            args.a_offset = a_offset;
-            for (auto &b_ld: kMatrixDims) {
-              args.b_ld = b_ld;
-              for (auto &b_offset: kOffsets) {
-                args.b_offset = b_offset;
-                for (auto &c_ld: kMatrixDims) {
-                  args.c_ld = c_ld;
-                  for (auto &c_offset: kOffsets) {
-                    args.c_offset = c_offset;
-
-                    // Computes the buffer sizes
-                    auto a_size = a_two * a_ld + a_offset;
-                    auto b_size = b_two * b_ld + b_offset;
-                    auto c_size = c_two * c_ld + c_offset;
-                    if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
-
-                    // Creates the OpenCL buffers
-                    auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-                    auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
-                    auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-                    auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-
-                    // Iterates over the values for alpha and beta
-                    for (auto &alpha: kAlphaValues) {
-                      args.alpha = alpha;
-                      for (auto &beta: kBetaValues) {
-                        args.beta = beta;
-
-                        // Runs the reference clBLAS code
-                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                        r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                        auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
-
-                        // Runs the CLBlast code
-                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                        s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                        auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
-
-                        // Tests for equality of the two status codes
-                        if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                          TestErrorCodes(status1, status2, args);
-                          continue;
-                        }
-
-                        // Downloads the results
-                        std::vector<T> r_result(c_size, static_cast<T>(0));
-                        std::vector<T> s_result(c_size, static_cast<T>(0));
-                        r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
-                        s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
-
-                        // Checks for differences in the output
-                        auto errors = size_t{0};
-                        for (auto idm=size_t{0}; idm<m; ++idm) {
-                          for (auto idn=size_t{0}; idn<n; ++idn) {
-                            auto index = (args.layout == Layout::kRowMajor) ?
-                                          idm*args.c_ld + idn + args.c_offset:
-                                          idn*args.c_ld + idm + args.c_offset;
-                            if (!TestSimilarity(r_result[index], s_result[index])) {
-                              errors++;
-                            }
-                          }
-                        }
-
-                        // Tests the error count (should be zero)
-                        TestErrorCount(errors, m*n, args);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
-// does not test for results (if any).
-template <typename T>
-void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Sets example test parameters
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.k = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.b_ld = kBufferSize;
-  args.c_ld = kBufferSize;
-  args.a_offset = 0;
-  args.b_offset = 0;
-  args.c_offset = 0;
-
-  // Iterates over test buffer sizes
-  const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
-  for (auto &a_size: kBufferSizes) {
-    for (auto &b_size: kBufferSizes) {
-      for (auto &c_size: kBufferSizes) {
-
-        // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
-        // want to be able to create invalid buffers (no error checking here).
-        auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
-        auto a_mat = Buffer(a);
-        auto b = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
-        auto b_mat = Buffer(b);
-        auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
-        auto r_mat = Buffer(r);
-        auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
-        auto s_mat = Buffer(s);
-
-        // Runs the two routines
-        auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
-        auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
-
-        // Tests for equality of the two status codes
-        TestErrorCodes(status1, status2, args);
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class TestABC<float>;
-template class TestABC<double>;
-template class TestABC<float2>;
-template class TestABC<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/test/correctness/testabc.h b/test/correctness/testabc.h
deleted file mode 100644
index 2c44d532..00000000
--- a/test/correctness/testabc.h
+++ /dev/null
@@ -1,86 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestABC: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kMatrixDims = { 7, 64 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestABC(int argc, char *argv[], const bool silent,
-          const std::string &name, const std::vector<std::string> &options,
-          const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> a_source_;
-  std::vector<T> b_source_;
-  std::vector<T> c_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-#endif
diff --git a/test/correctness/testaxy.cc b/test/correctness/testaxy.cc
deleted file mode 100644
index cb5e9923..00000000
--- a/test/correctness/testaxy.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestAXY class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testaxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor, initializes the base class tester and input data
-template <typename T>
-TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
-                    const std::string &name, const std::vector<std::string> &options,
-                    const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
-  auto max_ld = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
-  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-
-  // Creates test input data
-  a_source_.resize(max_dim*max_ld + max_offset);
-  x_source_.resize(max_dim*max_inc + max_offset);
-  y_source_.resize(max_dim*max_inc + max_offset);
-  PopulateVector(a_source_);
-  PopulateVector(x_source_);
-  PopulateVector(y_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters
-template <typename T>
-void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Iterates over the dimension for the matrix and vectors
-  for (auto &m: kMatrixVectorDims) {
-    args.m = m;
-    for (auto &n: kMatrixVectorDims) {
-      args.n = n;
-
-      // Computes the second dimension of the matrix taking the rotation into account
-      auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
-
-      // Computes the vector sizes in case the matrix is transposed
-      auto a_transposed = (args.a_transpose == Transpose::kYes);
-      auto m_real = (a_transposed) ? n : m;
-      auto n_real = (a_transposed) ? m : n;
-
-      // Iterates over the leading-dimension values and the offsets of the matrix
-      for (auto &a_ld: kMatrixVectorDims) {
-        args.a_ld = a_ld;
-        for (auto &a_offset: kOffsets) {
-          args.a_offset = a_offset;
-
-          // Iterates over the increment-values and the offsets of the vectors
-          for (auto &x_inc: kIncrements) {
-            args.x_inc = x_inc;
-            for (auto &x_offset: kOffsets) {
-              args.x_offset = x_offset;
-              for (auto &y_inc: kIncrements) {
-                args.y_inc = y_inc;
-                for (auto &y_offset: kOffsets) {
-                  args.y_offset = y_offset;
-
-                  // Computes the buffer sizes
-                  auto a_size = a_two * a_ld + a_offset;
-                  auto x_size = n_real * x_inc + x_offset;
-                  auto y_size = m_real * y_inc + y_offset;
-                  if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
-
-                  // Creates the OpenCL buffers
-                  auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-                  auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
-                  auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-                  auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-
-                  // Iterates over the values for alpha and beta
-                  for (auto &alpha: kAlphaValues) {
-                    args.alpha = alpha;
-                    for (auto &beta: kBetaValues) {
-                      args.beta = beta;
-
-                      // Runs the reference clBLAS code
-                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-                      r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-                      auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
-
-                      // Runs the CLBlast code
-                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-                      s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-                      auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
-
-                      // Tests for equality of the two status codes
-                      if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                        TestErrorCodes(status1, status2, args);
-                        continue;
-                      }
-
-                      // Downloads the results
-                      std::vector<T> r_result(y_size, static_cast<T>(0));
-                      std::vector<T> s_result(y_size, static_cast<T>(0));
-                      r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
-                      s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
-
-                      // Checks for differences in the output
-                      auto errors = size_t{0};
-                      for (auto idm=size_t{0}; idm<m_real; ++idm) {
-                        auto index = idm*y_inc + y_offset;
-                        if (!TestSimilarity(r_result[index], s_result[index])) {
-                          errors++;
-                        }
-                      }
-
-                      // Tests the error count (should be zero)
-                      TestErrorCount(errors, m_real, args);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
-// does not test for results (if any).
-template <typename T>
-void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Sets example test parameters
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.a_offset = 0;
-  args.x_offset = 0;
-  args.y_offset = 0;
-
-  // Iterates over test buffer sizes
-  const std::vector<size_t> kMatrixSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
-  const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};
-  for (auto &a_size: kMatrixSizes) {
-    for (auto &x_size: kVectorSizes) {
-      for (auto &y_size: kVectorSizes) {
-
-        // Iterates over test increments
-        for (auto &x_inc: kInvalidIncrements) {
-          args.x_inc = x_inc;
-          for (auto &y_inc: kInvalidIncrements) {
-            args.y_inc = y_inc;
-
-            // Creates the OpenCL buffers. Note: we are not using the C++ version since we
-            // explicitly want to be able to create invalid buffers (no error checking here).
-            auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
-            auto a_mat = Buffer(a);
-            auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
-            auto x_vec = Buffer(x);
-            auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-            auto r_vec = Buffer(r);
-            auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-            auto s_vec = Buffer(s);
-
-            // Runs the two routines
-            auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
-            auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
-
-            // Tests for equality of the two status codes
-            TestErrorCodes(status1, status2, args);
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class TestAXY<float>;
-template class TestAXY<double>;
-template class TestAXY<float2>;
-template class TestAXY<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/test/correctness/testaxy.h b/test/correctness/testaxy.h
deleted file mode 100644
index fa2c4a98..00000000
--- a/test/correctness/testaxy.h
+++ /dev/null
@@ -1,88 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestAXY: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<size_t> kIncrements = { 1, 2 };
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestAXY(int argc, char *argv[], const bool silent,
-          const std::string &name, const std::vector<std::string> &options,
-          const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> a_source_;
-  std::vector<T> x_source_;
-  std::vector<T> y_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-#endif
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
new file mode 100644
index 00000000..5951b177
--- /dev/null
+++ b/test/correctness/testblas.cc
@@ -0,0 +1,189 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestBlas class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose-options to test with, specialized per <buffer type T, scalar type U> combination
+template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
+template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
+template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
+template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
+template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
+template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
+
+// =================================================================================================
+
+// Constructor: initializes the base-class tester, stores the callbacks and fills the source data
+template <typename T, typename U>
+TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
+                        const std::string &name, const std::vector<std::string> &options,
+                        const Routine run_routine, const Routine run_reference,
+                        const ResultGet get_result, const ResultIndex get_index,
+                        const ResultIterator get_id1, const ResultIterator get_id2):
+    Tester<T,U>{argc, argv, silent, name, options},
+    run_routine_(run_routine),
+    run_reference_(run_reference),
+    get_result_(get_result),
+    get_index_(get_index),
+    get_id1_(get_id1),
+    get_id2_(get_id2) {
+
+  // Computes the maximum sizes (max_ld also uses kMatrixDims), allowing one set of source data.
+  auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
+  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
+  auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
+  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+  // Sizes each source vector as (largest dimension * largest increment or ld) + largest offset
+  x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
+  y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
+  a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  PopulateVector(x_source_);
+  PopulateVector(y_source_);
+  PopulateVector(a_source_);
+  PopulateVector(b_source_);
+  PopulateVector(c_source_);
+}
+
+// ===============================================================================================
+
+// Runs the reference and CLBlast on every entry of test_vector, comparing statuses and results
+template <typename T, typename U>
+void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
+  if (!PrecisionSupported<T>(device_)) { return; }
+  TestStart("regular behaviour", name);
+
+  // Iterates over all the to-be-tested combinations of arguments
+  for (auto &args: test_vector) {
+
+    // Runs the reference clBLAS code on its own newly created and filled set of OpenCL buffers
+    auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
+    y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
+    a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
+    b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
+    c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
+    auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1};
+    auto status1 = run_reference_(args, buffers1, queue_);
+
+    // Runs the CLBlast code on a second set of buffers filled from the same source data
+    auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
+    y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
+    a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
+    b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
+    c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
+    auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2};
+    auto status2 = run_routine_(args, buffers2, queue_);
+
+    // If either run did not succeed: only compares the status codes and skips the result check
+    if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+      TestErrorCodes(status1, status2, args);
+      continue;
+    }
+
+    // Downloads the results of both runs through the routine-specific get_result_ callback
+    auto result1 = get_result_(args, buffers1, queue_);
+    auto result2 = get_result_(args, buffers2, queue_);
+
+    // Counts mismatches over the id1 x id2 index space given by the routine-specific callbacks
+    auto errors = size_t{0};
+    for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
+      for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
+        auto index = get_index_(args, id1, id2);
+        if (!TestSimilarity(result1[index], result2[index])) {
+          errors++;
+        }
+      }
+    }
+
+    // Tests the error count (should be zero)
+    TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Compares only the status
+// codes returned by the reference and by CLBlast; does not test for results (if any).
+template <typename T, typename U>
+void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
+  if (!PrecisionSupported<T>(device_)) { return; }
+  TestStart("invalid buffer sizes", name);
+
+  // Iterates over all the to-be-tested combinations of arguments
+  for (auto &args: test_vector) {
+
+    // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+    // want to be able to create invalid buffers (the error-code argument is a nullptr here).
+    auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+    auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+    auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+    auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+    auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto x_vec1 = Buffer(x1);
+    auto y_vec1 = Buffer(y1);
+    auto a_mat1 = Buffer(a1);
+    auto b_mat1 = Buffer(b1);
+    auto c_mat1 = Buffer(c1);
+    auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+    auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+    auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+    auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+    auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto x_vec2 = Buffer(x2);
+    auto y_vec2 = Buffer(y2);
+    auto a_mat2 = Buffer(a2);
+    auto b_mat2 = Buffer(b2);
+    auto c_mat2 = Buffer(c2);
+
+    // Runs the two routines, each on its own set of buffers
+    auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_);
+    auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_);
+
+    // Tests for equality of the two status codes (no results are downloaded or compared)
+    TestErrorCodes(status1, status2, args);
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class: one explicit instantiation per tested <T,U> combination
+template class TestBlas<float, float>;
+template class TestBlas<double, double>;
+template class TestBlas<float2, float2>;
+template class TestBlas<double2, double2>;
+template class TestBlas<float2, float>;
+template class TestBlas<double2, double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
new file mode 100644
index 00000000..96c140c1
--- /dev/null
+++ b/test/correctness/testblas.h
@@ -0,0 +1,106 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
+// input combinations, and one deliberately testing with invalid values.
+// Typename T: the data-type of the routine's memory buffers (==precision)
+// Typename U: the data-type of the alpha and beta arguments
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class TestBlas: public Tester<T,U> {
+ public:
+
+  // Uses several variables from the Tester class
+  using Tester<T,U>::context_;
+  using Tester<T,U>::queue_;
+  using Tester<T,U>::full_test_;
+  using Tester<T,U>::device_;
+
+  // Uses several helper functions from the Tester class
+  using Tester<T,U>::TestStart;
+  using Tester<T,U>::TestEnd;
+  using Tester<T,U>::TestErrorCount;
+  using Tester<T,U>::TestErrorCodes;
+  using Tester<T,U>::GetOffsets;
+
+  // Test settings for the regular test. Append to these lists in case more tests are required.
+  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
+  const std::vector<size_t> kIncrements = { 1, 2, 7 };
+  const std::vector<size_t> kMatrixDims = { 7, 64 };
+  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+  const std::vector<size_t> kOffsets = GetOffsets();
+  const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
+  const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
+
+  // Test settings for the invalid tests
+  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
+  const size_t kBufferSize = 64;
+  const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
+  const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
+
+  // The layout/triangle/side/diagonal/transpose options to test with
+  const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
+  const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
+  const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
+  const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
+  static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
+
+  // Shorthand for the routine-specific functions passed to the tester
+  using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
+  using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers&, CommandQueue&)>;
+  using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
+  using ResultIterator = std::function<size_t(const Arguments<U>&)>;
+
+  // Constructor, initializes the base class tester and input data
+  TestBlas(int argc, char *argv[], const bool silent,
+           const std::string &name, const std::vector<std::string> &options,
+           const Routine run_routine, const Routine run_reference, const ResultGet get_result,
+           const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
+
+  // The test functions, taking the argument combinations to test and the test-configuration name
+  void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
+  void TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name);
+
+ private:
+
+  // Source data to test with
+  std::vector<T> x_source_;
+  std::vector<T> y_source_;
+  std::vector<T> a_source_;
+  std::vector<T> b_source_;
+  std::vector<T> c_source_;
+  
+  // The routine-specific functions passed to the tester
+  Routine run_routine_;
+  Routine run_reference_;
+  ResultGet get_result_;
+  ResultIndex get_index_;
+  ResultIterator get_id1_;
+  ResultIterator get_id2_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+#endif
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index 501f1906..378968ed 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -21,21 +21,11 @@
 namespace clblast {
 // =================================================================================================
 
-// The layouts and transpose-options to test with (data-type dependent)
-template <typename T>
-const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
-template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
-template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
-template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
-template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
-
-// =================================================================================================
-
 // General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
 // the clBLAS library for reference.
-template <typename T>
-Tester<T>::Tester(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options):
+template <typename T, typename U>
+Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
+                    const std::string &name, const std::vector<std::string> &options):
     help_("Options given/available:\n"),
     platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
     device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
@@ -61,7 +51,7 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
           kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
 
   // Checks whether the precision is supported
-  if (!PrecisionSupported()) {
+  if (!PrecisionSupported<T>(device_)) {
     fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
             kPrintWarning.c_str(), kPrintEnd.c_str());
     return;
@@ -86,9 +76,9 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
 }
 
 // Destructor prints the summary of the test cases and cleans-up the clBLAS library
-template <typename T>
-Tester<T>::~Tester() {
-  if (PrecisionSupported()) {
+template <typename T, typename U>
+Tester<T,U>::~Tester() {
+  if (PrecisionSupported<T>(device_)) {
     fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
     fprintf(stdout, "   %lu test(s) passed\n", tests_passed_);
     if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
@@ -104,8 +94,8 @@ Tester<T>::~Tester() {
 
 // Function called at the start of each test. This prints a header with information about the
 // test and re-initializes all test data-structures.
-template <typename T>
-void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
+template <typename T, typename U>
+void Tester<T,U>::TestStart(const std::string &test_name, const std::string &test_configuration) {
 
   // Prints the header
   fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
@@ -123,8 +113,8 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
 
 // Function called at the end of each test. This prints errors if any occured. It also prints a
 // summary of the number of sub-tests passed/failed.
-template <typename T>
-void Tester<T>::TestEnd() {
+template <typename T, typename U>
+void Tester<T,U>::TestEnd() {
   fprintf(stdout, "\n");
   tests_passed_ += num_passed_;
   tests_failed_ += num_skipped_;
@@ -147,6 +137,7 @@ void Tester<T>::TestEnd() {
       if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
       if (o == kArgSide)     { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
       if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
+      if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
       if (o == kArgXInc)     { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
       if (o == kArgYInc)     { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
       if (o == kArgXOffset)  { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
@@ -181,45 +172,9 @@ void Tester<T>::TestEnd() {
 
 // =================================================================================================
 
-// Compares two floating point values and returns whether they are within an acceptable error
-// margin. This replaces GTest's EXPECT_NEAR().
-template <typename T>
-bool Tester<T>::TestSimilarity(const T val1, const T val2) {
-  const auto difference = std::fabs(val1 - val2);
-
-  // Shortcut, handles infinities
-  if (val1 == val2) {
-    return true;
-  }
-  // The values are zero or very small: the relative error is less meaningful
-  else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
-    return (difference < static_cast<T>(kErrorMarginAbsolute));
-  }
-  // Use relative error
-  else {
-    return (difference / (std::fabs(val1)+std::fabs(val2))) < static_cast<T>(kErrorMarginRelative);
-  }
-}
-
-// Specialisations for complex data-types
-template <>
-bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
-  auto real = Tester<float>::TestSimilarity(val1.real(), val2.real());
-  auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag());
-  return (real && imag);
-}
-template <>
-bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
-  auto real = Tester<double>::TestSimilarity(val1.real(), val2.real());
-  auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag());
-  return (real && imag);
-}
-
-// =================================================================================================
-
 // Handles a 'pass' or 'error' depending on whether there are any errors
-template <typename T>
-void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
+template <typename T, typename U>
+void Tester<T,U>::TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args) {
 
   // Finished successfully
   if (errors == 0) {
@@ -237,9 +192,9 @@ void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arg
 
 // Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
 // (CLBlast reported a compilation error), or an error (they are different).
-template <typename T>
-void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
-                            const Arguments<T> &args) {
+template <typename T, typename U>
+void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
+                                 const Arguments<U> &args) {
 
   // Finished successfully
   if (clblas_status == clblast_status) {
@@ -270,62 +225,26 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
 
 // =================================================================================================
 
-// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
-// routines. This function is specialised for the different data-types.
-template <>
-const std::vector<float> Tester<float>::GetExampleScalars() {
-  if (full_test_) { return {0.0f, 1.0f, 3.14f}; }
-  else { return {3.14f}; }
-}
-template <>
-const std::vector<double> Tester<double>::GetExampleScalars() {
-  if (full_test_) { return {0.0, 1.0, 3.14}; }
-  else { return {3.14}; }
-}
-template <>
-const std::vector<float2> Tester<float2>::GetExampleScalars() {
-  if (full_test_) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
-  else { return {{2.42f, 3.14f}}; }
-}
-template <>
-const std::vector<double2> Tester<double2>::GetExampleScalars() {
-  if (full_test_) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
-  else { return {{2.42, 3.14}}; }
-}
-
 // Retrieves the offset values to test with
-template <typename T>
-const std::vector<size_t> Tester<T>::GetOffsets() {
+template <typename T, typename U>
+const std::vector<size_t> Tester<T,U>::GetOffsets() const {
   if (full_test_) { return {0, 10}; }
   else { return {0}; }
 }
 
 // =================================================================================================
 
-template <> bool Tester<float>::PrecisionSupported() const { return true; }
-template <> bool Tester<float2>::PrecisionSupported() const { return true; }
-template <> bool Tester<double>::PrecisionSupported() const {
-  auto extensions = device_.Extensions();
-  return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
-}
-template <> bool Tester<double2>::PrecisionSupported() const {
-  auto extensions = device_.Extensions();
-  return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
-}
-
-// =================================================================================================
-
 // A test can either pass, be skipped, or fail
-template <typename T>
-void Tester<T>::ReportPass() {
+template <typename T, typename U>
+void Tester<T,U>::ReportPass() {
   num_passed_++;
 }
-template <typename T>
-void Tester<T>::ReportSkipped() {
+template <typename T, typename U>
+void Tester<T,U>::ReportSkipped() {
   num_skipped_++;
 }
-template <typename T>
-void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
+template <typename T, typename U>
+void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
   error_log_.push_back(error_log_entry);
   num_failed_++;
 }
@@ -334,8 +253,8 @@ void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
 
 // Prints the test-result symbol to screen. This function limits the maximum number of symbols per
 // line by printing newlines once every so many calls.
-template <typename T>
-void Tester<T>::PrintTestResult(const std::string &message) {
+template <typename T, typename U>
+void Tester<T,U>::PrintTestResult(const std::string &message) {
   if (print_count_ == kResultsPerLine) {
     print_count_ = 0;
     fprintf(stdout, "\n   ");
@@ -345,13 +264,98 @@ void Tester<T>::PrintTestResult(const std::string &message) {
   print_count_++;
 }
 
+// =================================================================================================
+// Below are the non-member functions (separated because of otherwise required partial class
+// template specialization)
+// =================================================================================================
+
+// Decides whether two floating-point values are close enough to count as equal: exact matches
+// pass outright, pairs with a zero operand or a minuscule difference are held to an absolute
+// margin, and all other pairs to a relative margin. This replaces GTest's EXPECT_NEAR().
+template <typename T>
+bool TestSimilarity(const T val1, const T val2) {
+
+  // The allowed error margins, converted once to the precision under test
+  constexpr auto kErrorMarginRelative = 1.0e-2;
+  constexpr auto kErrorMarginAbsolute = 1.0e-10;
+  const auto relative_margin = static_cast<T>(kErrorMarginRelative);
+  const auto absolute_margin = static_cast<T>(kErrorMarginAbsolute);
+
+  // Identical values always match; this also covers equal infinities
+  const auto delta = std::fabs(val1 - val2);
+  if (val1 == val2) { return true; }
+
+  // A zero operand or a tiny difference: the relative error is less meaningful here
+  const auto within_absolute = (delta < absolute_margin);
+  if (val1 == 0 || val2 == 0 || within_absolute) { return within_absolute; }
+
+  // Otherwise relate the difference to the combined magnitude of both values
+  const auto magnitude = std::fabs(val1) + std::fabs(val2);
+  return (delta / magnitude) < relative_margin;
+}
+
+// Explicitly instantiates the generic implementation above for the non-complex data-types
+template bool TestSimilarity<float>(const float, const float);
+template bool TestSimilarity<double>(const double, const double);
+
+// Complex data-types: both the real and the imaginary components have to be similar
+template <>
+bool TestSimilarity(const float2 val1, const float2 val2) {
+  const auto real_matches = TestSimilarity(val1.real(), val2.real());
+  const auto imag_matches = TestSimilarity(val1.imag(), val2.imag());
+  return real_matches && imag_matches;
+}
+template <>
+bool TestSimilarity(const double2 val1, const double2 val2) {
+  const auto real_matches = TestSimilarity(val1.real(), val2.real());
+  const auto imag_matches = TestSimilarity(val1.imag(), val2.imag());
+  return real_matches && imag_matches;
+}
+
+// =================================================================================================
+
+// Supplies example scalar values for the alpha and beta arguments of the various routines: one
+// value unless a full test is requested, several otherwise. Specialised per data-type.
+template <> const std::vector<float> GetExampleScalars(const bool full_test) {
+  if (!full_test) { return {3.14f}; }
+  return {0.0f, 1.0f, 3.14f};
+}
+template <> const std::vector<double> GetExampleScalars(const bool full_test) {
+  if (!full_test) { return {3.14}; }
+  return {0.0, 1.0, 3.14};
+}
+template <> const std::vector<float2> GetExampleScalars(const bool full_test) {
+  if (!full_test) { return {{2.42f, 3.14f}}; }
+  return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}};
+}
+template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
+  if (!full_test) { return {{2.42, 3.14}}; }
+  return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}};
+}
+
+// =================================================================================================
+
+// Tells whether the device supports this precision (double types need the Khronos extension)
+template <> bool PrecisionSupported<float>(const Device &) { return true; }
+template <> bool PrecisionSupported<float2>(const Device &) { return true; }
+template <> bool PrecisionSupported<double>(const Device &device) {
+  const auto supported_extensions = device.Extensions();
+  return supported_extensions.find(kKhronosDoublePrecision) != std::string::npos;
+}
+template <> bool PrecisionSupported<double2>(const Device &device) {
+  const auto supported_extensions = device.Extensions();
+  return supported_extensions.find(kKhronosDoublePrecision) != std::string::npos;
+}
+
 // =================================================================================================
 
 // Compiles the templated class
-template class Tester<float>;
-template class Tester<double>;
-template class Tester<float2>;
-template class Tester<double2>;
+template class Tester<float, float>;
+template class Tester<double, double>;
+template class Tester<float2, float2>;
+template class Tester<double2, double2>;
+template class Tester<float2, float>;    // complex buffers (T) with real alpha/beta (U)
+template class Tester<double2, double>;  // complex buffers (T) with real alpha/beta (U)
 
 // =================================================================================================
 } // namespace clblast
diff --git a/test/correctness/tester.h b/test/correctness/tester.h
index 3b6fa059..93515138 100644
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.h
@@ -10,6 +10,8 @@
 // This file implements the Tester class, providing a test-framework. GTest was used before, but
 // was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
 // custom replacement.
+// Typename T: the data-type of the routine's memory buffers (==precision)
+// Typename U: the data-type of the alpha and beta arguments
 //
 // =================================================================================================
 
@@ -30,7 +32,7 @@ namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
-template <typename T>
+template <typename T, typename U>
 class Tester {
  public:
 
@@ -43,10 +45,6 @@ class Tester {
   // Error percentage is not applicable: error was caused by an incorrect status
   static constexpr auto kStatusError = -1.0f;
 
-  // Set the allowed error margin for floating-point comparisons
-  static constexpr auto kErrorMarginRelative = 1.0e-2;
-  static constexpr auto kErrorMarginAbsolute = 1.0e-10;
-
   // Constants holding start and end strings for terminal-output in colour
   const std::string kPrintError{"\x1b[31m"};
   const std::string kPrintSuccess{"\x1b[32m"};
@@ -62,16 +60,12 @@ class Tester {
   const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
   const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
 
-  // The layouts and transpose-options to test with
-  static const std::vector<Layout> kLayouts;
-  static const std::vector<Transpose> kTransposes;
-
   // This structure combines the above log-entry with a status code an error percentage
   struct ErrorLogEntry {
     StatusCode status_expect;
     StatusCode status_found;
     float error_percentage;
-    Arguments<T> args;
+    Arguments<U> args;
   };
 
   // Creates an instance of the tester, running on a particular OpenCL platform and device. It
@@ -84,25 +78,13 @@ class Tester {
   void TestStart(const std::string &test_name, const std::string &test_configuration);
   void TestEnd();
 
-  // Compares two floating point values for similarity. Allows for a certain relative error margin.
-  static bool TestSimilarity(const T val1, const T val2);
-
   // Tests either an error count (should be zero) or two error codes (must match)
-  void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
+  void TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args);
   void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
-                      const Arguments<T> &args);
+                      const Arguments<U> &args);
 
  protected:
 
-  // Retrieves a list of example scalars of the right type
-  const std::vector<T> GetExampleScalars();
-
-  // Retrieves a list of offset values to test
-  const std::vector<size_t> GetOffsets();
-
-  // Returns false is this precision is not supported by the device
-  bool PrecisionSupported() const;
-
   // The help-message
   std::string help_;
 
@@ -112,6 +94,12 @@ class Tester {
   Context context_;
   CommandQueue queue_;
 
+  // Whether or not to run the full test-suite or just a smoke test
+  bool full_test_;
+
+  // Retrieves the offset values to test with
+  const std::vector<size_t> GetOffsets() const;
+
  private:
 
   // Internal methods to report a passed, skipped, or failed test
@@ -122,9 +110,6 @@ class Tester {
   // Prints the error or success symbol to screen
   void PrintTestResult(const std::string &message);
 
-  // Whether or not to run the full test-suite or just a smoke test
-  bool full_test_;
-
   // Logging and counting occurrences of errors
   std::vector<ErrorLogEntry> error_log_;
   size_t num_passed_;
@@ -143,6 +128,25 @@ class Tester {
   std::vector<std::string> options_;
 };
 
+// =================================================================================================
+// Below are the non-member functions (separated because of otherwise required partial class
+// template specialization)
+// =================================================================================================
+
+// Compares two floating point values and returns whether they are within an acceptable error
+// margin; complex values are compared per component. This replaces GTest's EXPECT_NEAR().
+template <typename T>
+bool TestSimilarity(const T val1, const T val2);
+
+// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
+// routines. It is specialised per data-type; 'full_test' selects the longer list of values.
+template <typename T>
+const std::vector<T> GetExampleScalars(const bool full_test);
+
+// Returns false if this precision is not supported by the device
+template <typename T>
+bool PrecisionSupported(const Device &device);
+
 // =================================================================================================
 } // namespace clblast
 
diff --git a/test/correctness/testxy.cc b/test/correctness/testxy.cc
deleted file mode 100644
index 5c86182c..00000000
--- a/test/correctness/testxy.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestXY class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor, initializes the base class tester and input data
-template <typename T>
-TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options,
-                  const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
-  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-
-  // Creates test input data
-  x_source_.resize(max_dim*max_inc + max_offset);
-  y_source_.resize(max_dim*max_inc + max_offset);
-  PopulateVector(x_source_);
-  PopulateVector(y_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters
-template <typename T>
-void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Iterates over the vector dimension
-  for (auto &n: kVectorDims) {
-    args.n = n;
-
-    // Iterates over the increment-values and the offsets
-    for (auto &x_inc: kIncrements) {
-      args.x_inc = x_inc;
-      for (auto &x_offset: kOffsets) {
-        args.x_offset = x_offset;
-        for (auto &y_inc: kIncrements) {
-          args.y_inc = y_inc;
-          for (auto &y_offset: kOffsets) {
-            args.y_offset = y_offset;
-
-            // Computes the buffer sizes
-            auto x_size = n * x_inc + x_offset;
-            auto y_size = n * y_inc + y_offset;
-            if (x_size < 1 || y_size < 1) { continue; }
-
-            // Creates the OpenCL buffers
-            auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
-            auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-            auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-
-            // Iterates over the values for alpha
-            for (auto &alpha: kAlphaValues) {
-              args.alpha = alpha;
-
-              // Runs the reference clBLAS code
-              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-              r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-              auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
-
-              // Runs the CLBlast code
-              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-              s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-              auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
-
-              // Tests for equality of the two status codes
-              if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                TestErrorCodes(status1, status2, args);
-                continue;
-              }
-
-              // Downloads the results
-              std::vector<T> r_result(y_size, static_cast<T>(0));
-              std::vector<T> s_result(y_size, static_cast<T>(0));
-              r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
-              s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
-
-              // Checks for differences in the output
-              auto errors = size_t{0};
-              for (auto idn=size_t{0}; idn<n; ++idn) {
-                auto index = idn*y_inc + y_offset;
-                if (!TestSimilarity(r_result[index], s_result[index])) {
-                  errors++;
-                }
-              }
-
-              // Tests the error count (should be zero)
-              TestErrorCount(errors, n, args);
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
-// does not test for results (if any).
-template <typename T>
-void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Sets example test parameters
-  args.n = kBufferSize;
-  args.x_offset = 0;
-  args.y_offset = 0;
-
-  // Iterates over test buffer sizes
-  const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
-  for (auto &x_size: kBufferSizes) {
-    for (auto &y_size: kBufferSizes) {
-
-      // Iterates over test increments
-      for (auto &x_inc: kInvalidIncrements) {
-        args.x_inc = x_inc;
-        for (auto &y_inc: kInvalidIncrements) {
-          args.y_inc = y_inc;
-
-          // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
-          // want to be able to create invalid buffers (no error checking here).
-          auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
-          auto x_vec = Buffer(x);
-          auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-          auto r_vec = Buffer(r);
-          auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-          auto s_vec = Buffer(s);
-
-          // Runs the two routines
-          auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
-          auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
-
-          // Tests for equality of the two status codes
-          TestErrorCodes(status1, status2, args);
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class TestXY<float>;
-template class TestXY<double>;
-template class TestXY<float2>;
-template class TestXY<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/test/correctness/testxy.h b/test/correctness/testxy.h
deleted file mode 100644
index ec2cbcc7..00000000
--- a/test/correctness/testxy.h
+++ /dev/null
@@ -1,84 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXY: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<size_t> kIncrements = { 1, 2, 7 };
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
-  const size_t kBufferSize = 512;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestXY(int argc, char *argv[], const bool silent,
-         const std::string &name, const std::vector<std::string> &options,
-         const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> x_source_;
-  std::vector<T> y_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-#endif
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 3b07970c..676e88e4 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -21,249 +21,36 @@
 namespace clblast {
 // =================================================================================================
 
-// This is the vector-vector variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A.
-  // Note: this is not relevant for this client but given anyway.
-  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the data sizes
-    auto x_size = args.n*args.x_inc + args.x_offset;
-    auto y_size = args.n*args.y_inc + args.y_offset;
-
-    // Populates input host vectors with random data
-    std::vector<T> x_source(x_size);
-    std::vector<T> y_source(y_size);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-
-    // Creates the vectors on the device
-    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
-    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
-    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
-    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
-
-    // Runs the routine-specific code
-    client_routine(args, x_buffer, y_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.n += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
+// Constructor: stores the routine-specific functions and the list of supported options
+template <typename T, typename U>
+Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
+                    const std::vector<std::string> &options,
+                    const GetMetric get_flops, const GetMetric get_bytes):
+  run_routine_(run_routine),
+  run_reference_(run_reference),
+  options_(options),
+  get_flops_(get_flops),
+  get_bytes_(get_bytes) {
 }
 
-// Compiles the above function
-template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
-template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
-template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
-template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the second dimension of the matrix taking the rotation into account
-    auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
-
-    // Computes the vector sizes in case the matrix is transposed
-    auto a_transposed = (args.a_transpose == Transpose::kYes);
-    auto m_real = (a_transposed) ? args.n : args.m;
-    auto n_real = (a_transposed) ? args.m : args.n;
-
-    // Computes the data sizes
-    auto a_size = a_two * args.a_ld + args.a_offset;
-    auto x_size = n_real*args.x_inc + args.x_offset;
-    auto y_size = m_real*args.y_inc + args.y_offset;
-
-    // Populates input host vectors with random data
-    std::vector<T> a_source(a_size);
-    std::vector<T> x_source(x_size);
-    std::vector<T> y_source(y_size);
-    PopulateVector(a_source);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-
-    // Creates the vectors on the device
-    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
-    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
-    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
-    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
-    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
-    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
-
-    // Runs the routine-specific code
-    client_routine(args, a_buffer, x_buffer, y_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.m += args.step;
-    args.n += args.step;
-    args.a_ld += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
-}
-
-// Compiles the above function
-template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
-template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
-template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
-template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-                     const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [](const Arguments<T> args) { return args.m; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
-                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-  auto c_rotated = (args.layout == Layout::kRowMajor);
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the data sizes
-    auto a_two = (a_rotated) ? args.m : args.k;
-    auto b_two = (b_rotated) ? args.k : args.n;
-    auto c_two = (c_rotated) ? args.m : args.n;
-    auto a_size = a_two * args.a_ld + args.a_offset;
-    auto b_size = b_two * args.b_ld + args.b_offset;
-    auto c_size = c_two * args.c_ld + args.c_offset;
-
-    // Populates input host matrices with random data
-    std::vector<T> a_source(a_size);
-    std::vector<T> b_source(b_size);
-    std::vector<T> c_source(c_size);
-    PopulateVector(a_source);
-    PopulateVector(b_source);
-    PopulateVector(c_source);
-
-    // Creates the matrices on the device
-    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
-    auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
-    auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
-    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
-    b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
-    c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
-
-    // Runs the routine-specific code
-    client_routine(args, a_buffer, b_buffer, c_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.m += args.step;
-    args.n += args.step;
-    args.k += args.step;
-    args.a_ld += args.step;
-    args.b_ld += args.step;
-    args.c_ld += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
-}
-
-// Compiles the above function
-template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
-template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
-template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
-template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
-
 // =================================================================================================
 
 // Parses all arguments available for the CLBlast client testers. Some arguments might not be
 // applicable, but are searched for anyway to be able to create one common argument parser. All
 // arguments have a default value in case they are not found.
-template <typename T>
-Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
-                            const std::function<size_t(const Arguments<T>)> default_ld_a) {
-  auto args = Arguments<T>{};
+template <typename T, typename U>
+Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
+                                         const GetMetric default_b_ld, const GetMetric default_c_ld) {
+  auto args = Arguments<U>{};
   auto help = std::string{"Options given/available:\n"};
 
   // These are the options which are not for every client: they are optional
-  for (auto &o: options) {
+  for (auto &o: options_) {
 
     // Data-sizes
-    if (o == kArgM) { args.m = args.k  = GetArgument(argc, argv, help, kArgM, 512UL); }
-    if (o == kArgN) { args.n           = GetArgument(argc, argv, help, kArgN, 512UL); }
-    if (o == kArgK) { args.k           = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgM) { args.m  = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgN) { args.n  = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgK) { args.k  = GetArgument(argc, argv, help, kArgK, 512UL); }
 
     // Data-layouts
     if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@@ -271,6 +58,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
     if (o == kArgBTransp)  { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
     if (o == kArgSide)     { args.side        = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
     if (o == kArgTriangle) { args.triangle    = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
+    if (o == kArgDiagonal) { args.diagonal    = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); }
 
     // Vector arguments
     if (o == kArgXInc)    { args.x_inc    = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
@@ -279,16 +67,16 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
     if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
 
     // Matrix arguments
-    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
-    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
-    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
+    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
+    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
+    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
     if (o == kArgAOffset)  { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
     if (o == kArgBOffset)  { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
     if (o == kArgCOffset)  { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
 
     // Scalar values 
-    if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
-    if (o == kArgBeta)  { args.beta  = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
+    if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
+    if (o == kArgBeta)  { args.beta  = GetArgument(argc, argv, help, kArgBeta, GetScalar<U>()); }
   }
 
   // These are the options common to all routines
@@ -313,16 +101,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
 
 // =================================================================================================
 
+// This is the main performance tester. For every step it calls 'set_sizes' to obtain the buffer
+// sizes, uploads random input data, times the routine(s) and prints a row of results. The clBLAS
+// reference is only timed if 'args.compare_clblas' is set, as clBLAS is only set up in that case.
+template <typename T, typename U>
+void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
+  // Prints the header of the output table
+  PrintTableHeader(args.silent, options_);
+
+  // Initializes OpenCL and the libraries
+  auto platform = Platform(args.platform_id);
+  auto device = Device(platform, kDeviceType, args.device_id);
+  auto context = Context(device);
+  auto queue = CommandQueue(context, device);
+  if (args.compare_clblas) { clblasSetup(); }
+
+  // Iterates over all "num_step" values jumping by "step" each time
+  auto s = size_t{0};
+  while(true) {
+    // Sets the buffer sizes (routine-specific)
+    set_sizes(args);
+
+    // Populates the input host vectors and matrices with random data
+    std::vector<T> x_source(args.x_size);
+    std::vector<T> y_source(args.y_size);
+    std::vector<T> a_source(args.a_size);
+    std::vector<T> b_source(args.b_size);
+    std::vector<T> c_source(args.c_size);
+    PopulateVector(x_source);
+    PopulateVector(y_source);
+    PopulateVector(a_source);
+    PopulateVector(b_source);
+    PopulateVector(c_source);
+
+    // Creates the vectors and matrices on the device and uploads the host data
+    auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
+    y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
+    a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
+    b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
+    c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
+    auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
+
+    // Runs the routines and collects the timings (0.0 for clBLAS when it is not compared against)
+    auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
+    auto ms_clblas = (args.compare_clblas) ?
+      TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS") : 0.0;
+
+    // Prints the performance of both libraries
+    PrintTableRow(args, ms_clblast, ms_clblas);
+
+    // Makes the jump to the next step
+    if (++s >= args.num_steps) { break; }
+    args.m += args.step;
+    args.n += args.step;
+    args.k += args.step;
+    args.a_ld += args.step;
+    args.b_ld += args.step;
+    args.c_ld += args.step;
+  }
+
+  // Cleans-up and returns
+  if (args.compare_clblas) { clblasTeardown(); }
+}
+
+// =================================================================================================
+
 // Creates a vector of timing results, filled with execution times of the 'main computation'. The
 // timing is performed using the milliseconds chrono functions. The function returns the minimum
 // value found in the vector of timing results. The return value is in milliseconds.
-double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
+template <typename T, typename U>
+double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
+                                   const Buffers &buffers, CommandQueue &queue,
+                                   Routine run_blas, const std::string &library_name) {
   auto timings = std::vector<double>(num_runs);
   for (auto &timing: timings) {
     auto start_time = std::chrono::steady_clock::now();
 
     // Executes the main computation
-    main_computation();
+    auto status = run_blas(args, buffers, queue);
+    if (status != StatusCode::kSuccess) {
+      throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
+    }
 
     // Records and stores the end-time
     auto elapsed_time = std::chrono::steady_clock::now() - start_time;
@@ -334,7 +198,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
 // =================================================================================================
 
 // Prints the header of the performance table
-void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
+template <typename T, typename U>
+void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
   if (!silent) {
     for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
     fprintf(stdout, " | <--       CLBlast       --> | <--      clBLAS      --> |\n");
@@ -345,29 +210,60 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
 }
 
 // Print a performance-result row
-void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
-                   const bool no_abbrv, const double ms_clblast, const double ms_clblas,
-                   const unsigned long long flops, const unsigned long long bytes) {
+template <typename T, typename U>
+void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
+                                const double ms_clblas) {
+
+  // Creates a vector of relevant variables
+  auto integers = std::vector<size_t>{};
+  for (auto &o: options_) {
+    if      (o == kArgM) {        integers.push_back(args.m); }
+    if      (o == kArgN) {        integers.push_back(args.n); }
+    else if (o == kArgK) {        integers.push_back(args.k); }
+    else if (o == kArgLayout) {   integers.push_back(static_cast<size_t>(args.layout)); }
+    else if (o == kArgSide) {     integers.push_back(static_cast<size_t>(args.side)); }
+    else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
+    else if (o == kArgATransp) {  integers.push_back(static_cast<size_t>(args.a_transpose)); }
+    else if (o == kArgBTransp) {  integers.push_back(static_cast<size_t>(args.b_transpose)); }
+    else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
+    else if (o == kArgXInc) {     integers.push_back(args.x_inc); }
+    else if (o == kArgYInc) {     integers.push_back(args.y_inc); }
+    else if (o == kArgXOffset) {  integers.push_back(args.x_offset); }
+    else if (o == kArgYOffset) {  integers.push_back(args.y_offset); }
+    else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
+    else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
+    else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
+    else if (o == kArgAOffset) {  integers.push_back(args.a_offset); }
+    else if (o == kArgBOffset) {  integers.push_back(args.b_offset); }
+    else if (o == kArgCOffset) {  integers.push_back(args.c_offset); }
+  }
+  auto strings = std::vector<std::string>{};
+  for (auto &o: options_) {
+    if      (o == kArgAlpha) {    strings.push_back(ToString(args.alpha)); }
+    else if (o == kArgBeta) {     strings.push_back(ToString(args.beta)); }
+  }
 
   // Computes the GFLOPS and GB/s metrics
+  auto flops = get_flops_(args);
+  auto bytes = get_bytes_(args);
   auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
   auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
   auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
   auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
 
   // Outputs the argument values
-  for (auto &argument: args_int) {
-    if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
+  for (auto &argument: integers) {
+    if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
       fprintf(stdout, "%8luM;", argument/(1024*1024));
     }
-    else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
+    else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
       fprintf(stdout, "%8luK;", argument/1024);
     }
     else {
       fprintf(stdout, "%9lu;", argument);
     }
   }
-  for (auto &argument: args_string) {
+  for (auto &argument: strings) {
     fprintf(stdout, "%9s;", argument.c_str());
   }
 
@@ -377,5 +273,15 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
           ms_clblas, gflops_clblas, gbs_clblas);
 }
 
+// =================================================================================================
+
+// Compiles the templated class
+template class Client<float,float>;
+template class Client<double,double>;
+template class Client<float2,float2>;
+template class Client<double2,double2>;
+template class Client<float2,float>;
+template class Client<double2,double>;
+
 // =================================================================================================
 } // namespace clblast
diff --git a/test/performance/client.h b/test/performance/client.h
index 5125844a..c9095967 100644
--- a/test/performance/client.h
+++ b/test/performance/client.h
@@ -7,7 +7,14 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file provides common function declarations to be used with the test clients.
+// This class implements the performance-test client. It is generic for all CLBlast routines by
+// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
+// or how to get the FLOPS count.
+// Typename T: the data-type of the routine's memory buffers (==precision)
+// Typename U: the data-type of the alpha and beta arguments
+//
+// This file also provides the common interface to the performance client (see the 'RunClient'
+// function for details).
 //
 // =================================================================================================
 
@@ -26,61 +33,71 @@
 namespace clblast {
 // =================================================================================================
 
-// Types of devices to consider
-const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+// See the comment at the top of this file for a description of the class and of 'T' and 'U'
+template <typename T, typename U>
+class Client {
+ public:
+
+  // Types of devices to consider when creating the OpenCL device in 'PerformanceTest'
+  const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+
+  // Shorthand for the routine-specific functions passed to the tester
+  using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
+  using SetMetric = std::function<void(Arguments<U>&)>;
+  using GetMetric = std::function<size_t(const Arguments<U>&)>;
+
+  // The constructor, storing the routines to time, the supported options and the metric functions
+  Client(const Routine run_routine, const Routine run_reference,
+         const std::vector<std::string> &options,
+         const GetMetric get_flops, const GetMetric get_bytes);
+
+  // Parses all command-line arguments, filling in the arguments structure. If no command-line
+  // argument is given for a particular argument, it is filled in with a default value.
+  Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
+                              const GetMetric default_b_ld, const GetMetric default_c_ld);
+
+  // The main client function: sets up OpenCL, the host data and the device buffers (using
+  // 'set_sizes'), then times the routines and prints a row of results for every step.
+  void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);
+
+ private:
+
+  // Times 'num_runs' runs of a routine and returns the shortest (in ms); throws on a failing run
+  double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers &buffers,
+                        CommandQueue &queue, Routine run_blas, const std::string &library_name);
+
+  // Prints the header of a performance-data table
+  void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
+
+  // Prints a row of performance data, including results of two libraries
+  void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
+
+  // The routine-specific functions and the options passed to the constructor
+  const Routine run_routine_;
+  const Routine run_reference_;
+  const std::vector<std::string> options_;
+  const GetMetric get_flops_;
+  const GetMetric get_bytes_;
+};
 
 // =================================================================================================
 
-// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
-template <typename T>
-using Routine2 = std::function<void(const Arguments<T>&,
-                                    const Buffer&, const Buffer&,
-                                    CommandQueue&)>;
-template <typename T>
-using Routine3 = std::function<void(const Arguments<T>&,
-                                    const Buffer&, const Buffer&, const Buffer&,
-                                    CommandQueue&)>;
+// The interface to the performance client. This is a separate function in the header such that it
+// is automatically compiled for each routine, templated by the parameter "C".
+template <typename C, typename T, typename U>
+void RunClient(int argc, char *argv[]) {
 
-// =================================================================================================
+  // Creates a new client
+  auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
+                            C::GetFlops, C::GetBytes);
 
-// These are the main client functions, setting-up arguments, matrices, OpenCL buffers, etc. After
-// set-up, they call the client routine, passed as argument to this function.
-template <typename T>
-void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options);
-template <typename T>
-void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options);
-template <typename T>
-void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options);
+  // Simple command line argument parser with defaults
+  auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
+  if (args.print_help) { return; }
 
-// =================================================================================================
-
-// Parses all command-line arguments, filling in the arguments structure. If no command-line
-// argument is given for a particular argument, it is filled in with a default value.
-template <typename T>
-Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
-                            const std::function<size_t(const Arguments<T>)> default_ld_a);
-
-// Retrieves only the precision command-line argument, since the above function is templated based
-// on the precision
-Precision GetPrecision(int argc, char *argv[]);
-
-// =================================================================================================
-
-// Runs a function a given number of times and returns the execution time of the shortest instance
-double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
-
-// =================================================================================================
-
-// Prints the header of a performance-data table
-void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
-
-// Prints a row of performance data, including results of two libraries
-void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
-                   const bool abbreviations, const double ms_clblast, const double ms_clblas,
-                   const unsigned long long flops, const unsigned long long bytes);
+  // Runs the client
+  client.PerformanceTest(args, C::SetSizes);
+}
 
 // =================================================================================================
 } // namespace clblast
diff --git a/test/performance/graphs/common.r b/test/performance/graphs/common.r
index e310b811..34a59c43 100644
--- a/test/performance/graphs/common.r
+++ b/test/performance/graphs/common.r
@@ -83,7 +83,16 @@ main <- function(routine_name, precision, test_names, test_values,
       params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
       arguments <- paste(devices_string, params_string, options_string, sep=" ")
       print(paste("Running", executable, arguments, sep=" "))
-      result_string <- system2(command=executable, args=arguments, stdout=TRUE)
+      raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
+
+      # Filter the string: only lines containing a ";" can be valid lines
+      result_string <- c()
+      for (line in raw_result_string) {
+        if (grepl(";",line)) {
+          result_string <-
+           c(result_string, line)
+        }
+      }
 
       # Reads the result into a dataframe
       command_db <- read.csv(text=result_string, sep=";")
diff --git a/test/performance/graphs/xgemm.r b/test/performance/graphs/xgemm.r
index 22f63b77..6533b44b 100755
--- a/test/performance/graphs/xgemm.r
+++ b/test/performance/graphs/xgemm.r
@@ -35,10 +35,10 @@ test_names <- list(
 
 # Defines the test-cases
 test_values <- list(
-  list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
-  list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
-  list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c( 128,  128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 512,  512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
   list(
     c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
     c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@@ -50,17 +50,17 @@ test_values <- list(
     c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
   ),
   list(
-    c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
-    c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
-    c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
-    c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
-    c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
-    c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
-    c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
-    c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
-    c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
-    c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
-    c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+    c(   8,    8,    8, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16,   16, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32,   32, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64,   64, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128,  128, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256,  256, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512,  512, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
   )
 )
 
diff --git a/test/performance/graphs/xsymm.r b/test/performance/graphs/xsymm.r
index 6493f52a..c27de904 100644
--- a/test/performance/graphs/xsymm.r
+++ b/test/performance/graphs/xsymm.r
@@ -19,7 +19,7 @@ source(file.path(dirname(thisfile), "common.r"))
 
 # Settings
 routine_name <- "xsymm"
-parameters <- c("-m","-n","-layout","-triangle","-side",
+parameters <- c("-m","-n","-layout","-side","-triangle",
                 "-num_steps","-step","-runs","-precision")
 precision <- 32
 
@@ -29,16 +29,16 @@ test_names <- list(
   "multiples of 128 (+1)",
   "around m=n=512",
   "around m=n=2048",
-  "layouts and triangle/side (m=n=1024)",
+  "layouts and side/triangle (m=n=1024)",
   "powers of 2"
 )
 
 # Defines the test-cases
 test_values <- list(
-  list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
-  list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
-  list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c( 128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
   list(
     c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
     c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@@ -50,17 +50,17 @@ test_values <- list(
     c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
   ),
   list(
-    c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
-    c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
-    c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
-    c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
-    c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
-    c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
-    c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
-    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
-    c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
-    c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
-    c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+    c(   8,    8, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
   )
 )
 
@@ -70,7 +70,7 @@ test_xlabels <- list(
   "matrix sizes (m=n)",
   "matrix sizes (m=n)",
   "matrix sizes (m=n)",
-  "layout (row/col), triangle (up/lo), side (l/r)",
+  "layout (row/col), side (l/r), triangle (up/lo)",
   "matrix sizes (m=n)"
 )
 
@@ -80,8 +80,9 @@ test_xaxis <- list(
   c("m", ""),
   c("m", ""),
   c("m", ""),
-  list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
-              "col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
+  # Labels follow the order of the test-values: layout, then side, then triangle (varies fastest)
+  list(1:8, c("row,l,up", "row,l,lo", "row,r,up", "row,r,lo",
+              "col,l,up", "col,l,lo", "col,r,up", "col,r,lo")),
   c("m", "x")
 )
 )
 
diff --git a/test/performance/graphs/xsyr2k.r b/test/performance/graphs/xsyr2k.r
new file mode 100644
index 00000000..eb761e4c
--- /dev/null
+++ b/test/performance/graphs/xsyr2k.r
@@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xsyr2k routine
+#
+# ==================================================================================================
+
+# Includes the common functions from the "common.r" file in the same directory as this script
+args <- commandArgs(trailingOnly = FALSE)  # all arguments, needed to find the "--file=" entry
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xsyr2k"
+parameters <- c("-n","-k","-layout","-triangle","-transA",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "multiples of 128",
+  "multiples of 128 (+1)",
+  "around n=k=512",
+  "around n=k=1536",
+  "layouts and transposing (n=k=1024)",
+  "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c( 128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+  ),
+  list(
+    c(   8,    8, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
+  )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "layout (row/col), triangle (u/l), transA (n/y)",
+  "matrix sizes (n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
+              "col,u,n", "col,u,y", "col,l,n", "col,l,y")),
+  c("n", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
\ No newline at end of file
diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r
new file mode 100644
index 00000000..04f7b515
--- /dev/null
+++ b/test/performance/graphs/xsyrk.r
@@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xsyrk routine
+#
+# ==================================================================================================
+
+# Includes the common functions from the "common.r" file in the same directory as this script
+args <- commandArgs(trailingOnly = FALSE)  # all arguments, needed to find the "--file=" entry
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xsyrk"
+parameters <- c("-n","-k","-layout","-triangle","-transA",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "multiples of 128",
+  "multiples of 128 (+1)",
+  "around n=k=512",
+  "around n=k=2048",
+  "layouts and transposing (n=k=1024)",
+  "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c( 128,  128, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 1, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
+  list(
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+  ),
+  list(
+    c(   8,    8, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32, 1, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256, 1, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
+  )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "layout (row/col), triangle (u/l), transA (n/y)",
+  "matrix sizes (n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
+              "col,u,n", "col,u,y", "col,l,n", "col,l,y")),
+  c("n", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
\ No newline at end of file
diff --git a/test/performance/graphs/xtrmm.r b/test/performance/graphs/xtrmm.r
new file mode 100644
index 00000000..3b35f7c0
--- /dev/null
+++ b/test/performance/graphs/xtrmm.r
@@ -0,0 +1,127 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xtrmm routine
+#
+# ==================================================================================================
+
+# Includes the common functions from the "common.r" file in the same directory as this script
+args <- commandArgs(trailingOnly = FALSE)  # all arguments, needed to find the "--file=" entry
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xtrmm"
+parameters <- c("-m","-n","-layout","-side","-triangle","-transA","-diagonal",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "multiples of 128",
+  "multiples of 128 (+1)",
+  "around m=n=512",
+  "around m=n=2048",
+  "layouts and side/triangle (m=n=1024)",
+  "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c( 128,  128, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(
+    c(1024, 1024, 0, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 1, 1, 0, num_runs, precision),
+
+    c(1024, 1024, 0, 1, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 1, 1, 0, num_runs, precision),
+
+    c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 1, 1, 0, num_runs, precision),
+
+    c(1024, 1024, 1, 1, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 1, 1, 0, num_runs, precision)
+  ),
+  list(
+    c(   8,    8, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  16,   16, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  32,   32, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(  64,   64, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 128,  128, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 256,  256, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c( 512,  512, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 1, 0, 0, 0, 0, 1, 0, num_runs, precision)
+  )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (m=n)",
+  "matrix sizes (m=n)",
+  "matrix sizes (m=n)",
+  "matrix sizes (m=n)",
+  "layout (row/col), side (l/r), triangle (up/lo), transA (n/y), diag (u/nu)",
+  "matrix sizes (m=n)"
+)
+
+# Defines the x-axis of the test-cases (labels are listed in the order of the test-values)
+test_xaxis <- list(
+  c("m", ""),
+  c("m", ""),
+  c("m", ""),
+  c("m", ""),
+  list(1:32, c("row,l,up,n,u", "row,l,up,n,nu", "row,l,up,y,u", "row,l,up,y,nu",
+               "row,l,lo,n,u", "row,l,lo,n,nu", "row,l,lo,y,u", "row,l,lo,y,nu",
+               "row,r,up,n,u", "row,r,up,n,nu", "row,r,up,y,u", "row,r,up,y,nu",
+               "row,r,lo,n,u", "row,r,lo,n,nu", "row,r,lo,y,u", "row,r,lo,y,nu",
+               "col,l,up,n,u", "col,l,up,n,nu", "col,l,up,y,u", "col,l,up,y,nu",
+               "col,l,lo,n,u", "col,l,lo,n,nu", "col,l,lo,y,u", "col,l,lo,y,nu",
+               "col,r,up,n,u", "col,r,up,n,nu", "col,r,up,y,u", "col,r,up,y,nu",
+               "col,r,lo,n,u", "col,r,lo,n,nu", "col,r,lo,y,u", "col,r,lo,y,nu")),
+  c("m", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
\ No newline at end of file
diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..fe90c697
--- /dev/null
+++ b/test/performance/routines/level1/xaxpy.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level1/xaxpy.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): selects the Xaxpy client by precision
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // no half-precision client is instantiated below
+      throw std::runtime_error("Unsupported precision mode");  // not caught within main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXaxpy<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXaxpy<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXaxpy<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc
new file mode 100644
index 00000000..376c6c33
--- /dev/null
+++ b/test/performance/routines/level2/xgemv.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level2/xgemv.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): selects the Xgemv client by precision
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // no half-precision client is instantiated below
+      throw std::runtime_error("Unsupported precision mode");  // not caught within main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemv<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemv<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemv<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc
new file mode 100644
index 00000000..c45c238f
--- /dev/null
+++ b/test/performance/routines/level3/xgemm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xgemm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): selects the Xgemm client by precision
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // no half-precision client is instantiated below
+      throw std::runtime_error("Unsupported precision mode");  // not caught within main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc
new file mode 100644
index 00000000..d215653b
--- /dev/null
+++ b/test/performance/routines/level3/xhemm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xhemm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): only complex precisions run the Xhemm client
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // half, single and double all throw (not caught within main)
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXhemm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXhemm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc
new file mode 100644
index 00000000..2e1f248a
--- /dev/null
+++ b/test/performance/routines/level3/xher2k.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xher2k.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): only complex precisions run the Xher2k client
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // half, single and double all throw (not caught within main)
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXher2k<float2,float>, float2, float>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXher2k<double2,double>, double2, double>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc
new file mode 100644
index 00000000..4386f78c
--- /dev/null
+++ b/test/performance/routines/level3/xherk.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xherk.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): only complex precisions run the Xherk client
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // half, single and double all throw (not caught within main)
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXherk<float2,float>, float2, float>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXherk<double2,double>, double2, double>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc
new file mode 100644
index 00000000..bd014cee
--- /dev/null
+++ b/test/performance/routines/level3/xsymm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsymm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): selects the Xsymm client by precision
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:  // no half-precision client is instantiated below
+      throw std::runtime_error("Unsupported precision mode");  // not caught within main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsymm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsymm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsymm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // reached once the selected client has returned
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..1261be88
--- /dev/null
+++ b/test/performance/routines/level3/xsyr2k.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsyr2k.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): dispatches on the precision from GetPrecision()
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode"); // not caught anywhere in main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyr2k<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..5799130f
--- /dev/null
+++ b/test/performance/routines/level3/xsyrk.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsyrk.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): dispatches on the precision from GetPrecision()
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode"); // not caught anywhere in main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyrk<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyrk<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyrk<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..c30866e9
--- /dev/null
+++ b/test/performance/routines/level3/xtrmm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xtrmm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): dispatches on the precision from GetPrecision()
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode"); // not caught anywhere in main
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXtrmm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXtrmm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXtrmm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/xaxpy.cc b/test/performance/routines/xaxpy.cc
deleted file mode 100644
index 23d76099..00000000
--- a/test/performance/routines/xaxpy.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy command-line interface tester.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
-#include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXaxpy(const Arguments<T> &args,
-                      const Buffer &x_vec, const Buffer &y_vec,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Axpy(args.n, args.alpha,
-                       x_vec(), args.x_offset, args.x_inc,
-                       y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXaxpy(args.n, args.alpha,
-                              x_vec(), args.x_offset, args.x_inc,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.n;
-  const auto bytes = (3 * args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.n, args.x_inc, args.y_inc,
-                                               args.x_offset, args.y_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
-
-// =================================================================================================
-
-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXaxpy(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
-                                          kArgXOffset, kArgYOffset, kArgAlpha};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientXY<float>(argc, argv, PerformanceXaxpy<float>, o); break;
-    case Precision::kDouble: ClientXY<double>(argc, argv, PerformanceXaxpy<double>, o); break;
-    case Precision::kComplexSingle: ClientXY<float2>(argc, argv, PerformanceXaxpy<float2>, o); break;
-    case Precision::kComplexDouble: ClientXY<double2>(argc, argv, PerformanceXaxpy<double2>, o); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::ClientXaxpy(argc, argv);
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc
deleted file mode 100644
index 234e9fdb..00000000
--- a/test/performance/routines/xgemm.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm command-line interface tester.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
-#include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXgemm(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
-                       args.m, args.n, args.k,
-                       args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       b_mat(), args.b_offset, args.b_ld,
-                       args.beta,
-                       c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasTranspose>(args.b_transpose),
-                              args.m, args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n * args.k;
-  const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.a_transpose),
-                                               static_cast<size_t>(args.b_transpose),
-                                               args.a_ld, args.b_ld, args.c_ld,
-                                               args.a_offset, args.b_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
-
-// =================================================================================================
-
-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXgemm(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
-                                          kArgATransp, kArgBTransp,
-                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgBOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o); break;
-    case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::ClientXgemm(argc, argv);
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xgemv.cc b/test/performance/routines/xgemv.cc
deleted file mode 100644
index 43222396..00000000
--- a/test/performance/routines/xgemv.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv command-line interface tester.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
-#include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXgemv(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       x_vec(), args.x_offset, args.x_inc, args.beta,
-                       y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.m, args.n, args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              x_vec(), args.x_offset, args.x_inc, args.beta,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n;
-  const auto bytes = (args.m*args.n + 2*args.m + args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.a_transpose),
-                                               args.a_ld, args.x_inc, args.y_inc,
-                                               args.a_offset, args.x_offset, args.y_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
-
-// =================================================================================================
-
-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXgemv(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
-                                          kArgALeadDim, kArgXInc, kArgYInc,
-                                          kArgAOffset, kArgXOffset, kArgYOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientAXY<float>(argc, argv, PerformanceXgemv<float>, o); break;
-    case Precision::kDouble: ClientAXY<double>(argc, argv, PerformanceXgemv<double>, o); break;
-    case Precision::kComplexSingle: ClientAXY<float2>(argc, argv, PerformanceXgemv<float2>, o); break;
-    case Precision::kComplexDouble: ClientAXY<double2>(argc, argv, PerformanceXgemv<double2>, o); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::ClientXgemv(argc, argv);
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc
deleted file mode 100644
index 13ad434a..00000000
--- a/test/performance/routines/xsymm.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm command-line interface tester.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
-#include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXsymm(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Symm(args.layout, args.side, args.triangle,
-                       args.m, args.n,
-                       args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       b_mat(), args.b_offset, args.b_ld,
-                       args.beta,
-                       c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n * args.m;
-  const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.triangle),
-                                               static_cast<size_t>(args.side),
-                                               args.a_ld, args.b_ld, args.c_ld,
-                                               args.a_offset, args.b_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
-
-// =================================================================================================
-
-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXsymm(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgLayout,
-                                          kArgTriangle, kArgSide,
-                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgBOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o); break;
-    case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::ClientXsymm(argc, argv);
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h
new file mode 100644
index 00000000..6ce5d7e2
--- /dev/null
+++ b/test/routines/level1/xaxpy.h
@@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xaxpy routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_
+#define CLBLAST_TEST_ROUTINES_XAXPY_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static description of the Xaxpy routine, shared by the correctness and performance testers
+template <typename T>
+class TestXaxpy {
+ public:
+
+  // The list of command-line/test arguments that are relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgXInc, kArgYInc,
+            kArgXOffset, kArgYOffset,
+            kArgAlpha};
+  }
+
+  // Buffer sizes in elements: n elements spaced by the increment, plus the starting offset
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+
+  // Stores the X and Y buffer sizes (as computed above) in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Default leading dimensions of the matrices: placeholders, always 1 for this routine
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Runs the CLBlast Axpy routine on the X and Y buffers and returns its status code
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Axpy(args.n, args.alpha,
+                       buffers.x_vec(), args.x_offset, args.x_inc,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event); // NOTE(review): waits regardless of status - confirm intended
+    return status;
+  }
+
+  // Runs the clBLAS reference routine; its status is returned cast to a CLBlast StatusCode
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXaxpy(args.n, args.alpha,
+                              buffers.x_vec(), args.x_offset, args.x_inc,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event); // NOTE(review): waits regardless of status - confirm intended
+    return static_cast<StatusCode>(status);
+  }
+
+  // Downloads the result of the computation: the whole Y buffer (args.y_size elements)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result);
+    return result;
+  }
+
+  // Result indices: ResultID1 is n, ResultID2 is a dummy; id1 maps to id1*y_inc + y_offset
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Performance metrics: 2*n operations and 3*n elements of sizeof(T) bytes each
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (3 * args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPY_H_
+#endif
diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h
new file mode 100644
index 00000000..73f7d76e
--- /dev/null
+++ b/test/routines/level2/xgemv.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_
+#define CLBLAST_TEST_ROUTINES_XGEMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgemv {
+ public:
+
+  // Names of the arguments that are relevant for Xgemv
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgLayout,
+            kArgATransp, kArgALeadDim,
+            kArgXInc, kArgYInc, kArgAOffset,
+            kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Computes the number of elements of each buffer (the offset is included in the count)
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto row_major = (args.layout == Layout::kRowMajor);
+    const auto num_slices = row_major ? args.m : args.n;  // slices of 'a_ld' elements each
+    return args.a_offset + num_slices * args.a_ld;
+  }
+  static size_t GetSizeX(const Arguments<T> &args) {
+    // 'x' gets n entries for a non-transposed A and m entries otherwise
+    const auto x_length = (args.a_transpose == Transpose::kNo) ? args.n : args.m;
+    return args.x_offset + x_length * args.x_inc;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    // 'y' gets m entries for a non-transposed A and n entries otherwise
+    const auto y_length = (args.a_transpose == Transpose::kNo) ? args.m : args.n;
+    return args.y_offset + y_length * args.y_inc;
+  }
+
+  // Stores the computed buffer sizes in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.y_size = GetSizeY(args);
+    args.x_size = GetSizeX(args);
+    args.a_size = GetSizeA(args);
+  }
+
+  // Default leading dimensions; only matrix A is used by this routine
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // placeholder: no matrix B here
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // placeholder: no matrix C here
+
+  // Runs the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto routine_status = Gemv(args.layout, args.a_transpose,
+                                     args.m, args.n, args.alpha,
+                                     buffers.a_mat(), args.a_offset, args.a_ld,
+                                     buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                                     buffers.y_vec(), args.y_offset, args.y_inc,
+                                     &raw_queue, &finished);
+    clWaitForEvents(1, &finished);
+    return routine_status;
+  }
+
+  // Runs the clBLAS counterpart (the reference for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto transposed = static_cast<clblasTranspose>(args.a_transpose);
+    const auto reference_status = clblasXgemv(order, transposed, args.m, args.n, args.alpha,
+        buffers.a_mat(), args.a_offset, args.a_ld,
+        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+        buffers.y_vec(), args.y_offset, args.y_inc,
+        1, &raw_queue, 0, nullptr, &finished);
+    clWaitForEvents(1, &finished);
+    return static_cast<StatusCode>(reference_status);
+  }
+
+  // Reads back the output of the computation, which for this routine is the 'y' vector
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, sizeof(T) * args.y_size, host_y);
+    return host_y;
+  }
+
+  // Extents of the two result indices and the mapping from an index pair to a buffer position
+  static size_t ResultID1(const Arguments<T> &args) {
+    // m values for a non-transposed A, n values otherwise
+    return (args.a_transpose == Transpose::kNo) ? args.m : args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // fixed at 1: second index unused
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return args.y_offset + id1 * args.y_inc;
+  }
+
+  // Operation and byte counts used to derive the performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.m * args.n * 2;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return sizeof(T) * (args.m*args.n + 2*args.m + args.n);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMV_H_
+#endif
diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h
new file mode 100644
index 00000000..86a304d1
--- /dev/null
+++ b/test/routines/level3/xgemm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_
+#define CLBLAST_TEST_ROUTINES_XGEMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgemm {
+ public:
+
+  // The list of arguments relevant for this routine (sizes, layout, transposes, strides, scalars)
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK,
+            kArgLayout, kArgATransp, kArgBTransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers: (number of slices) * (leading dim) + offset
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.m : args.k; // slice count: m when 'rotated', k otherwise
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+    auto b_two = (b_rotated) ? args.k : args.n; // slice count: k when 'rotated', n otherwise
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n; // C has no transpose option: layout decides
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers (stored back into the arguments)
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine; waits on the event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+                       args.m, args.n, args.k, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              static_cast<clblasTranspose>(args.b_transpose),
+                              args.m, args.n, args.k, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event); // 1 queue, no wait list
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status); // the clBLAS status is cast to a StatusCode
+  }
+
+  // Describes how to download the results of the computation (here: the full C buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer: id1 spans m, id2 spans n
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics (k multiply-adds for each of the m*n results)
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); // A + B + 2x C
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMM_H_
+#endif
diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h
new file mode 100644
index 00000000..75878b06
--- /dev/null
+++ b/test/routines/level3/xhemm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhemm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_
+#define CLBLAST_TEST_ROUTINES_XHEMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhemm {
+ public:
+
+  // Names of the arguments that are relevant for Xhemm
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgLayout,
+            kArgSide, kArgTriangle, kArgALeadDim,
+            kArgBLeadDim, kArgCLeadDim, kArgAOffset,
+            kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Computes the number of elements of each buffer (the offset is included in the count)
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const size_t side_dim = (args.side == Side::kLeft) ? args.m : args.n;
+    const auto row_major = (args.layout == Layout::kRowMajor);
+    const auto num_slices = row_major ? args.m : side_dim;
+    return args.a_offset + num_slices * args.a_ld;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const size_t side_dim = (args.side == Side::kLeft) ? args.m : args.n;
+    const auto row_major = (args.layout == Layout::kRowMajor);
+    const auto num_slices = row_major ? side_dim : args.n;
+    return args.b_offset + num_slices * args.b_ld;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    const auto row_major = (args.layout == Layout::kRowMajor);
+    const auto num_slices = row_major ? args.m : args.n;
+    return args.c_offset + num_slices * args.c_ld;
+  }
+
+  // Stores the computed buffer sizes in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.c_size = GetSizeC(args);
+    args.b_size = GetSizeB(args);
+    args.a_size = GetSizeA(args);
+  }
+
+  // Default leading dimensions for A, B and C
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; } // A defaults to m
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; } // B defaults to n
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; } // C defaults to n
+
+  // Runs the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto routine_status = Hemm(args.layout, args.side, args.triangle,
+                                     args.m, args.n, args.alpha,
+                                     buffers.a_mat(), args.a_offset, args.a_ld,
+                                     buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                                     buffers.c_mat(), args.c_offset, args.c_ld,
+                                     &raw_queue, &finished);
+    clWaitForEvents(1, &finished);
+    return routine_status;
+  }
+
+  // Runs the clBLAS counterpart (the reference for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto side = static_cast<clblasSide>(args.side);
+    const auto uplo = static_cast<clblasUplo>(args.triangle);
+    const auto reference_status = clblasXhemm(order, side, uplo, args.m, args.n, args.alpha,
+        buffers.a_mat(), args.a_offset, args.a_ld,
+        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+        buffers.c_mat(), args.c_offset, args.c_ld,
+        1, &raw_queue, 0, nullptr, &finished);
+    clWaitForEvents(1, &finished);
+    return static_cast<StatusCode>(reference_status);
+  }
+
+  // Reads back the output of the computation, which for this routine is the C matrix
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, sizeof(T) * args.c_size, host_c);
+    return host_c;
+  }
+
+  // Extents of the two result indices and the mapping from an index pair to a buffer position
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; } // id1 spans m
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; } // id2 spans n
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    const auto row_major = (args.layout == Layout::kRowMajor);
+    const auto linear = row_major ? (id1*args.c_ld + id2) : (id2*args.c_ld + id1);
+    return linear + args.c_offset;
+  }
+
+  // Operation and byte counts used to derive the performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.m * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return sizeof(T) * (args.m*args.m + args.m*args.n + 2*args.m*args.n);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHEMM_H_
+#endif
diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h
new file mode 100644
index 00000000..f13e8a62
--- /dev/null
+++ b/test/routines/level3/xher2k.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xher2k routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_
+#define CLBLAST_TEST_ROUTINES_XHER2K_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class TestXher2k {
+ public:
+
+  // The list of arguments relevant for this routine (note: a single transpose option, for A)
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (A and B both follow 'a_transpose')
+  static size_t GetSizeA(const Arguments<U> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k; // slice count: n when 'rotated', k otherwise
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<U> &args) {
+    auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto b_two = (b_rotated) ? args.n : args.k; // the routine is run with one transpose only
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<U> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers (stored back into the arguments)
+  static void SetSizes(Arguments<U> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine; waits on the event before returning the status
+  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto alpha2 = T{args.alpha, args.alpha}; // T-typed alpha made from the U-typed argument
+    auto status = Her2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k, alpha2,
+                        buffers.a_mat(), args.a_offset, args.a_ld,
+                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto alpha2 = T{args.alpha, args.alpha}; // same alpha as passed to the CLBlast routine
+    auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k, alpha2,
+                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                               buffers.c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event); // 1 queue, no wait list
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status); // the clBLAS status is cast to a StatusCode
+  }
+
+  // Describes how to download the results of the computation (here: the full C buffer)
+  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer: both indices span n
+  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset; // same formula for either layout
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<U> &args) {
+    return 2 * args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<U> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHER2K_H_
+#endif
diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h
new file mode 100644
index 00000000..780b9b52
--- /dev/null
+++ b/test/routines/level3/xherk.h
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xherk routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHERK_H_
+#define CLBLAST_TEST_ROUTINES_XHERK_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class TestXherk {
+ public:
+
+  // Names of the arguments that are relevant for Xherk
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK, kArgLayout,
+            kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgCLeadDim,
+            kArgAOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Computes the number of elements of each buffer (the offset is included in the count)
+  static size_t GetSizeA(const Arguments<U> &args) {
+    const auto transposed = (args.a_transpose != Transpose::kNo);
+    const auto rotated = (args.layout == Layout::kColMajor) ? transposed :
+                         (args.layout == Layout::kRowMajor) ? !transposed : false;
+    return (rotated ? args.n : args.k) * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeC(const Arguments<U> &args) {
+    return args.c_offset + args.n * args.c_ld;
+  }
+
+  // Stores the computed buffer sizes in the arguments structure
+  static void SetSizes(Arguments<U> &args) {
+    args.c_size = GetSizeC(args);
+    args.a_size = GetSizeA(args);
+  }
+
+  // Default leading dimensions; this routine uses the matrices A and C only
+  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<U> &) { return 1; } // placeholder: no matrix B here
+  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
+
+  // Runs the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto routine_status = Herk(args.layout, args.triangle, args.a_transpose,
+                                     args.n, args.k, args.alpha,
+                                     buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                                     buffers.c_mat(), args.c_offset, args.c_ld,
+                                     &raw_queue, &finished);
+    clWaitForEvents(1, &finished);
+    return routine_status;
+  }
+
+  // Runs the clBLAS counterpart (the reference for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event finished{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto uplo = static_cast<clblasUplo>(args.triangle);
+    const auto trans = static_cast<clblasTranspose>(args.a_transpose);
+    const auto reference_status = clblasXherk(order, uplo, trans, args.n, args.k, args.alpha,
+        buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+        buffers.c_mat(), args.c_offset, args.c_ld,
+        1, &raw_queue, 0, nullptr, &finished);
+    clWaitForEvents(1, &finished);
+    return static_cast<StatusCode>(reference_status);
+  }
+
+  // Reads back the output of the computation, which for this routine is the C matrix
+  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, sizeof(T) * args.c_size, host_c);
+    return host_c;
+  }
+
+  // Extents of the two result indices and the mapping from an index pair to a buffer position
+  static size_t ResultID1(const Arguments<U> &args) { return args.n; } // id1 spans n
+  static size_t ResultID2(const Arguments<U> &args) { return args.n; } // id2 spans n as well
+  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
+    return args.c_offset + id2 + id1*args.c_ld;
+  }
+
+  // Operation and byte counts used to derive the performance metrics
+  static size_t GetFlops(const Arguments<U> &args) {
+    return args.k * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<U> &args) {
+    return sizeof(T) * (args.n*args.k + args.n*args.n);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHERK_H_
+#endif
diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h
new file mode 100644
index 00000000..10476349
--- /dev/null
+++ b/test/routines/level3/xsymm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsymm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_
+#define CLBLAST_TEST_ROUTINES_XSYMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xsymm routine; the file header explains how it is used
+template <typename T>
+class TestXsymm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements, including the offset)
+  static size_t GetSizeA(const Arguments<T> &args) {
+    // A is the square k-by-k symmetric matrix (k = m for a left-side product, n otherwise), so
+    // it spans k lines of a_ld elements in either layout
+    auto k_value = (args.side == Side::kLeft) ? args.m : args.n;
+    return k_value * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    // B is m-by-n just like C, whichever side A is on: m lines in row-major, n otherwise
+    auto b_rotated = (args.layout == Layout::kRowMajor);
+    auto b_two = (b_rotated) ? args.m : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Stores the sizes computed by GetSizeA/GetSizeB/GetSizeC in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Runs the CLBlast routine, waits for its event and returns the status it reported
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Symm(args.layout, args.side, args.triangle,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Runs the clBLAS routine (for correctness/performance comparison) with the same arguments
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasSide>(args.side),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation: the C buffer holds the output
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (id1 < m, id2 < n)
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics (formulas written for an m-by-m matrix A)
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.m;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYMM_H_
+#endif
diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h
new file mode 100644
index 00000000..f3b1b542
--- /dev/null
+++ b/test/routines/level3/xsyr2k.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyr2k routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#define CLBLAST_TEST_ROUTINES_XSYR2K_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xsyr2k routine; the file header explains how it is used
+template <typename T>
+class TestXsyr2k {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements, including the offset)
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    // B is laid out like A: the routine takes a single transpose option (args.a_transpose)
+    auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    return ((b_rotated) ? args.n : args.k) * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Stores the sizes computed by GetSizeA/GetSizeB/GetSizeC in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Runs the CLBlast routine, waits for its event and returns the status it reported
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k, args.alpha,
+                        buffers.a_mat(), args.a_offset, args.a_ld,
+                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Runs the clBLAS routine (for correctness/performance comparison) with the same arguments
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k, args.alpha,
+                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                               buffers.c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation: the C buffer holds the output
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (id1 < n, id2 < n)
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (2*args.n*args.k + args.n*args.n) * sizeof(T); // A and B: n*k each, C: n*n
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h
new file mode 100644
index 00000000..2ec9fb65
--- /dev/null
+++ b/test/routines/level3/xsyrk.h
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyrk routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_
+#define CLBLAST_TEST_ROUTINES_XSYRK_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Bundles, as static methods, everything the testers need to know about the Xsyrk routine
+template <typename T>
+class TestXsyrk {
+ public:
+
+  // Names of the command-line arguments this routine makes use of
+  static std::vector<std::string> GetOptions() {
+    return {
+      kArgN, kArgK, kArgLayout, kArgTriangle, kArgATransp,
+      kArgALeadDim, kArgCLeadDim, kArgAOffset, kArgCOffset,
+      kArgAlpha, kArgBeta
+    };
+  }
+
+  // Buffer sizes in elements: the offset plus one leading dimension per line of the matrix
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto plain = (args.a_transpose == Transpose::kNo);
+    const auto n_lines = (plain) ? (args.layout == Layout::kRowMajor) :
+                                   (args.layout == Layout::kColMajor);
+    return args.a_offset + args.a_ld * ((n_lines) ? args.n : args.k);
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    return args.c_offset + args.c_ld * args.n;
+  }
+
+  // Fills in the size fields of the arguments for every buffer this routine touches
+  static void SetSizes(Arguments<T> &args) {
+    args.c_size = GetSizeC(args);
+    args.a_size = GetSizeA(args);
+  }
+
+  // Default values for the leading dimensions of the matrices
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // unused: there is no B matrix
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Invokes CLBlast's Syrk and waits for its event before handing back the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event completion{};
+    const auto result = Syrk(args.layout, args.triangle, args.a_transpose, args.n, args.k,
+                             args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             args.beta,
+                             buffers.c_mat(), args.c_offset, args.c_ld, &raw_queue, &completion);
+    clWaitForEvents(1, &completion);
+    return result;
+  }
+
+  // Invokes the clBLAS counterpart (the reference for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event completion{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto uplo = static_cast<clblasUplo>(args.triangle);
+    const auto trans = static_cast<clblasTranspose>(args.a_transpose);
+    const auto result = clblasXsyrk(order, uplo, trans, args.n, args.k, args.alpha,
+                                    buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                                    buffers.c_mat(), args.c_offset, args.c_ld,
+                                    1, &raw_queue, 0, nullptr, &completion);
+    clWaitForEvents(1, &completion);
+    return static_cast<StatusCode>(result);
+  }
+
+  // Reads back the buffer that holds the outcome of the computation, which is C here
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, sizeof(T)*args.c_size, host_c);
+    return host_c;
+  }
+
+  // Extent of both result indices and their mapping onto a position in the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return args.c_offset + id2 + id1*args.c_ld;
+  }
+
+  // Operation and memory-traffic counts used to derive the performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.k * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return sizeof(T) * (args.n*args.n + args.n*args.k);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYRK_H_
+#endif
diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h
new file mode 100644
index 00000000..7b7e7af1
--- /dev/null
+++ b/test/routines/level3/xtrmm.h
@@ -0,0 +1,127 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_
+#define CLBLAST_TEST_ROUTINES_XTRMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Bundles, as static methods, everything the testers need to know about the Xtrmm routine
+template <typename T>
+class TestXtrmm {
+ public:
+
+  // Names of the command-line arguments this routine makes use of
+  static std::vector<std::string> GetOptions() {
+    return {
+      kArgM, kArgN, kArgLayout, kArgSide, kArgTriangle, kArgATransp,
+      kArgDiagonal, kArgALeadDim, kArgBLeadDim, kArgAOffset, kArgBOffset,
+      kArgAlpha
+    };
+  }
+
+  // Buffer sizes in elements: the offset plus one leading dimension per line of the matrix
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto a_dim = (args.side != Side::kLeft) ? args.n : args.m;
+    return args.a_offset + a_dim * args.a_ld;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    if (args.layout == Layout::kRowMajor) { return args.m * args.b_ld + args.b_offset; }
+    // Every other layout value is sized as n lines of b_ld elements
+    return args.n * args.b_ld + args.b_offset;
+  }
+
+  // Fills in the size fields of the arguments for every buffer this routine touches
+  static void SetSizes(Arguments<T> &args) {
+    args.b_size = GetSizeB(args);
+    args.a_size = GetSizeA(args);
+  }
+
+  // Default values for the leading dimensions of the matrices
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // unused: there is no C matrix
+
+  // Invokes CLBlast's Trmm and waits for its event before handing back the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event completion{};
+    const auto result = Trmm(args.layout, args.side, args.triangle,
+                             args.a_transpose, args.diagonal, args.m, args.n, args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             buffers.b_mat(), args.b_offset, args.b_ld,
+                             &raw_queue, &completion);
+    clWaitForEvents(1, &completion);
+    return result;
+  }
+
+  // Invokes the clBLAS counterpart (the reference for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event completion{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto side = static_cast<clblasSide>(args.side);
+    const auto uplo = static_cast<clblasUplo>(args.triangle);
+    const auto trans = static_cast<clblasTranspose>(args.a_transpose);
+    const auto diag = static_cast<clblasDiag>(args.diagonal);
+    const auto result = clblasXtrmm(order, side, uplo, trans, diag, args.m, args.n, args.alpha,
+                                    buffers.a_mat(), args.a_offset, args.a_ld,
+                                    buffers.b_mat(), args.b_offset, args.b_ld,
+                                    1, &raw_queue, 0, nullptr, &completion);
+    clWaitForEvents(1, &completion);
+    return static_cast<StatusCode>(result);
+  }
+
+  // Reads back the buffer that holds the outcome of the computation, which is B here
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
+    buffers.b_mat.ReadBuffer(queue, sizeof(T)*args.b_size, host_b);
+    return host_b;
+  }
+
+  // Extent of both result indices and their mapping onto a position in the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    if (args.layout == Layout::kRowMajor) { return id1*args.b_ld + id2 + args.b_offset; }
+    // For every other layout value the two indices trade places
+    return id2*args.b_ld + id1 + args.b_offset;
+  }
+
+  // Operation and memory-traffic counts used to derive the performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto a_dim = (args.side != Side::kLeft) ? args.n : args.m;
+    return a_dim * args.m * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto a_dim = (args.side != Side::kLeft) ? args.n : args.m;
+    return sizeof(T) * (2*args.m*args.n + a_dim*a_dim);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMM_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index 093a8742..4aaf3705 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -1,6 +1,6 @@
 
 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
@@ -76,33 +76,33 @@ clblasStatus clblasXaxpy(
 
 // Calls {clblasSgemv, clblasDgemv, clblasCgemv, clblasZgemv} with the arguments forwarded.
 clblasStatus clblasXgemv(
-  clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, float alpha,
+  clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, float alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta,
   const cl_mem y_vec, size_t y_offset, size_t y_inc,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-    return clblasSgemv(layout, tran_a, m, n, alpha,
+    return clblasSgemv(layout, a_transpose, m, n, alpha,
                        a_mat, a_offset, a_ld,
                        x_vec, x_offset, static_cast<int>(x_inc), beta,
                        y_vec, y_offset, static_cast<int>(y_inc),
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(
-  clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, double alpha,
+  clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta,
   const cl_mem y_vec, size_t y_offset, size_t y_inc,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-    return clblasDgemv(layout, tran_a, m, n, alpha,
+    return clblasDgemv(layout, a_transpose, m, n, alpha,
                        a_mat, a_offset, a_ld,
                        x_vec, x_offset, static_cast<int>(x_inc), beta,
                        y_vec, y_offset, static_cast<int>(y_inc),
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(
-  clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, float2 alpha,
+  clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, float2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta,
   const cl_mem y_vec, size_t y_offset, size_t y_inc,
@@ -110,14 +110,14 @@ clblasStatus clblasXgemv(
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
     auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
-    return clblasCgemv(layout, tran_a, m, n, cl_alpha,
+    return clblasCgemv(layout, a_transpose, m, n, cl_alpha,
                        a_mat, a_offset, a_ld,
                        x_vec, x_offset, static_cast<int>(x_inc), cl_beta,
                        y_vec, y_offset, static_cast<int>(y_inc),
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(
-  clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, double2 alpha,
+  clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta,
   const cl_mem y_vec, size_t y_offset, size_t y_inc,
@@ -125,7 +125,7 @@ clblasStatus clblasXgemv(
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
     auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
-    return clblasZgemv(layout, tran_a, m, n, cl_alpha,
+    return clblasZgemv(layout, a_transpose, m, n, cl_alpha,
                        a_mat, a_offset, a_ld,
                        x_vec, x_offset, static_cast<int>(x_inc), cl_beta,
                        y_vec, y_offset, static_cast<int>(y_inc),
@@ -137,14 +137,14 @@ clblasStatus clblasXgemv(
 
 // This calls {clblasSgemm, clblasDgemm, clblasCgemm, clblasZgemm} with the arguments forwarded.
 clblasStatus clblasXgemm(
-  clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+  clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose,
   size_t m, size_t n, size_t k, float alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-    return clblasSgemm(layout, tran_a, tran_b,
+    return clblasSgemm(layout, a_transpose, b_transpose,
                        m, n, k, alpha,
                        a_mat, a_offset, a_ld,
                        b_mat, b_offset, b_ld, beta,
@@ -152,14 +152,14 @@ clblasStatus clblasXgemm(
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(
-  clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+  clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose,
   size_t m, size_t n, size_t k, double alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-    return clblasDgemm(layout, tran_a, tran_b,
+    return clblasDgemm(layout, a_transpose, b_transpose,
                        m, n, k, alpha,
                        a_mat, a_offset, a_ld,
                        b_mat, b_offset, b_ld, beta,
@@ -167,16 +167,16 @@ clblasStatus clblasXgemm(
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(
-  clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+  clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose,
   size_t m, size_t n, size_t k, float2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
     auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
-    return clblasCgemm(layout, tran_a, tran_b,
+    return clblasCgemm(layout, a_transpose, b_transpose,
                        m, n, k, cl_alpha,
                        a_mat, a_offset, a_ld,
                        b_mat, b_offset, b_ld, cl_beta,
@@ -184,16 +184,16 @@ clblasStatus clblasXgemm(
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(
-  clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+  clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose,
   size_t m, size_t n, size_t k, double2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
     auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
-    return clblasZgemm(layout, tran_a, tran_b,
+    return clblasZgemm(layout, a_transpose, b_transpose,
                        m, n, k, cl_alpha,
                        a_mat, a_offset, a_ld,
                        b_mat, b_offset, b_ld, cl_beta,
@@ -201,13 +201,13 @@ clblasStatus clblasXgemm(
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 
-// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded.
+// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded.
 clblasStatus clblasXsymm(
   clblasOrder layout, clblasSide side, clblasUplo triangle,
   size_t m, size_t n, float alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     return clblasSsymm(layout, side, triangle,
@@ -222,7 +222,7 @@ clblasStatus clblasXsymm(
   size_t m, size_t n, double alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     return clblasDsymm(layout, side, triangle,
@@ -237,7 +237,7 @@ clblasStatus clblasXsymm(
   size_t m, size_t n, float2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
@@ -254,7 +254,7 @@ clblasStatus clblasXsymm(
   size_t m, size_t n, double2 alpha,
   const cl_mem a_mat, size_t a_offset, size_t a_ld,
   const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
-  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
   cl_uint num_queues, cl_command_queue *queues,
   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
     auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
@@ -267,6 +267,348 @@ clblasStatus clblasXsymm(
                        num_queues, queues, num_wait_events, wait_events, events);
 }
 
+// This calls {clblasChemm, clblasZhemm} with the arguments forwarded.
+clblasStatus clblasXhemm(  // float2 overload: forwards to clblasChemm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  size_t m, size_t n, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    auto cl_beta = cl_float2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasChemm(layout, side, triangle,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXhemm(  // double2 overload: forwards to clblasZhemm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  size_t m, size_t n, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    auto cl_beta = cl_double2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasZhemm(layout, side, triangle,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded.
+clblasStatus clblasXsyrk(  // float overload: forwards to clblasSsyrk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasSsyrk(layout, triangle, a_transpose,
+                       n, k, alpha,  // alpha and beta are passed through unconverted
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(  // double overload: forwards to clblasDsyrk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDsyrk(layout, triangle, a_transpose,
+                       n, k, alpha,  // alpha and beta are passed through unconverted
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(  // float2 overload: forwards to clblasCsyrk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    auto cl_beta = cl_float2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasCsyrk(layout, triangle, a_transpose,
+                       n, k, cl_alpha,
+                       a_mat, a_offset, a_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(  // double2 overload: forwards to clblasZsyrk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    auto cl_beta = cl_double2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasZsyrk(layout, triangle, a_transpose,
+                       n, k, cl_alpha,
+                       a_mat, a_offset, a_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasCherk, clblasZherk} with the arguments forwarded.
+clblasStatus clblasXherk(  // float-scalar overload: forwards to clblasCherk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasCherk(layout, triangle, a_transpose,
+                       n, k, alpha,  // alpha/beta are plain float here: no conversion step
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXherk(  // double-scalar overload: forwards to clblasZherk
+  clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose,
+  size_t n, size_t k, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasZherk(layout, triangle, a_transpose,
+                       n, k, alpha,  // alpha/beta are plain double here: no conversion step
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded.
+clblasStatus clblasXsyr2k(  // float overload: forwards to clblasSsyr2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasSsyr2k(layout, triangle, ab_transpose,
+                        n, k, alpha,  // alpha and beta are passed through unconverted
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(  // double overload: forwards to clblasDsyr2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDsyr2k(layout, triangle, ab_transpose,
+                        n, k, alpha,  // alpha and beta are passed through unconverted
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(  // float2 overload: forwards to clblasCsyr2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    auto cl_beta = cl_float2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasCsyr2k(layout, triangle, ab_transpose,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, cl_beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(  // double2 overload: forwards to clblasZsyr2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    auto cl_beta = cl_double2{{beta.real(), beta.imag()}};  // likewise for beta
+    return clblasZsyr2k(layout, triangle, ab_transpose,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, cl_beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasCher2k, clblasZher2k} with the arguments forwarded.
+clblasStatus clblasXher2k(  // float2 alpha / float beta overload: forwards to clblasCher2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    return clblasCher2k(layout, triangle, ab_transpose,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,  // beta is a plain float: forwarded as-is
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXher2k(  // double2 alpha / double beta overload: forwards to clblasZher2k
+  clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose,
+  size_t n, size_t k, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
+  const cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    return clblasZher2k(layout, triangle, ab_transpose,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,  // beta is a plain double: forwarded as-is
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasStrmm, clblasDtrmm, clblasCtrmm, clblasZtrmm} with the arguments forwarded.
+clblasStatus clblasXtrmm(  // float overload: forwards to clblasStrmm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasStrmm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, alpha,  // only scalar in this signature (no beta, no C matrix)
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrmm(  // double overload: forwards to clblasDtrmm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDtrmm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, alpha,  // only scalar in this signature (no beta, no C matrix)
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrmm(  // float2 overload: forwards to clblasCtrmm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    return clblasCtrmm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrmm(  // double2 overload: forwards to clblasZtrmm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    return clblasZtrmm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasStrsm, clblasDtrsm, clblasCtrsm, clblasZtrsm} with the arguments forwarded.
+clblasStatus clblasXtrsm(  // float overload: forwards to clblasStrsm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasStrsm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, alpha,  // only scalar in this signature (no beta, no C matrix)
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrsm(  // double overload: forwards to clblasDtrsm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDtrsm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, alpha,  // only scalar in this signature (no beta, no C matrix)
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrsm(  // float2 overload: forwards to clblasCtrsm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_float2
+    return clblasCtrsm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXtrsm(  // double2 overload: forwards to clblasZtrsm
+  clblasOrder layout, clblasSide side, clblasUplo triangle,
+  clblasTranspose a_transpose, clblasDiag diagonal,
+  size_t m, size_t n, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};  // real/imag -> cl_double2
+    return clblasZtrsm(layout, side, triangle, a_transpose, diagonal,
+                       m, n, cl_alpha,
+                       a_mat, a_offset, a_ld,
+                       b_mat, b_offset, b_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
 // =================================================================================================
 } // namespace clblast