Merge pull request #9 from CNugteren/level3_routines

Added SYRK and SYR2K level-3 routines
2024-08-21 04:22:27 +02:00 · 2015-06-26 20:56:21 +02:00 · 2015-06-26 20:56:21 +02:00 · 77e2157485
parent 985eeac503 e27e339ebf
commit 77e2157485
37 changed files with 2263 additions and 193 deletions
--- a/5
+++ b/5
@ -1,4 +1,9 @@

+Development version (next release)
+- Added level-3 routines:
+  * SSYRK/DSYRK/CSYRK/ZSYRK
+  * SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+
 Version 0.2.0
 - Added support for complex conjugate transpose
 - Several host-code performance improvements
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -97,8 +97,10 @@ set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
 set(SAMPLE_PROGRAMS sgemm)
 set(ROUTINES_XY xaxpy)
 set(ROUTINES_AXY xgemv)
-set(ROUTINES_ABC xgemm xsymm)
-set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
+set(ROUTINES_ABC xgemm xsymm xsyr2k)
+set(ROUTINES_AB )
+set(ROUTINES_AC xsyrk)
+set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC})

 # ==================================================================================================

@ -172,6 +174,8 @@ if(TESTS)
  add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
  add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
  add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
+  add_library(test_correctness_ab OBJECT test/correctness/testab.cc)
+  add_library(test_correctness_ac OBJECT test/correctness/testac.cc)

  # Compiles the correctness-tests
  foreach(ROUTINE ${ROUTINES_XY})
@ -179,22 +183,32 @@ if(TESTS)
                   $<TARGET_OBJECTS:test_correctness_common>
                   $<TARGET_OBJECTS:test_correctness_xy>
                   test/correctness/routines/${ROUTINE}.cc)
-    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
-    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
  foreach(ROUTINE ${ROUTINES_AXY})
    add_executable(test_${ROUTINE}
                   $<TARGET_OBJECTS:test_correctness_common>
                   $<TARGET_OBJECTS:test_correctness_axy>
                   test/correctness/routines/${ROUTINE}.cc)
-    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
-    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
  foreach(ROUTINE ${ROUTINES_ABC})
    add_executable(test_${ROUTINE}
                   $<TARGET_OBJECTS:test_correctness_common>
                   $<TARGET_OBJECTS:test_correctness_abc>
                   test/correctness/routines/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES_AB})
+    add_executable(test_${ROUTINE}
+                   $<TARGET_OBJECTS:test_correctness_common>
+                   $<TARGET_OBJECTS:test_correctness_ab>
+                   test/correctness/routines/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES_AC})
+    add_executable(test_${ROUTINE}
+                   $<TARGET_OBJECTS:test_correctness_common>
+                   $<TARGET_OBJECTS:test_correctness_ac>
+                   test/correctness/routines/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
--- a/README.md
+++ b/README.md
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------

-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
+CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:

 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
 | xSWAP    |   |   |   |   |         |
 | xSCAL    |   |   |   |   | +CS +ZD |
 | xCOPY    |   |   |   |   |         |
-| xAXPY    |`x`|`x`|`x`|`x`|         |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ |         |
 | xDOT     |   |   | - | - | +DS     |
 | xDOTU    | - | - |   |   |         |
 | xDOTC    | - | - |   |   |         |
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of

 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMV    |`x`|`x`|`x`|`x`|         |
+| xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xGBMV    |   |   |   |   |         |
 | xHEMV    | - | - |   |   |         |
 | xHBMV    | - | - |   |   |         |
@ -175,12 +175,12 @@ CLBlast is in active development and currently does not support the full set of

 | Level-3  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
-| xGEMM    |`x`|`x`|`x`|`x`|         |
-| xSYMM    |`x`|`x`|`x`|`x`|         |
+| xGEMM    | ✔ | ✔ | ✔ | ✔ |         |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ |         |
 | xHEMM    | - | - |   |   |         |
-| xSYRK    |   |   |   |   |         |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ |         |
 | xHERK    | - | - |   |   |         |
-| xSYR2K   |   |   |   |   |         |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ |         |
 | xHER2K   | - | - |   |   |         |
 | xTRMM    |   |   |   |   |         |
 | xTRSM    |   |   |   |   |         |
--- a/doc/performance/Iris/SSYR2K.pdf
+++ b/doc/performance/Iris/SSYR2K.pdf
--- a/doc/performance/Iris/SSYRK.pdf
+++ b/doc/performance/Iris/SSYRK.pdf
--- a/doc/performance/Tesla_K40m/SSYRK.pdf
+++ b/doc/performance/Tesla_K40m/SSYRK.pdf
--- a/include/clblast.h
+++ b/include/clblast.h
@ -107,7 +107,7 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines

-// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
+// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
 template <typename T>
 StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
                const size_t m, const size_t n, const size_t k,
@ -118,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event);

-// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
+// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
 template <typename T>
 StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
@ -129,6 +129,27 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event);

+// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                 const T beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event);
+
 // =================================================================================================
 } // namespace clblast

--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@ -92,7 +92,8 @@ class Routine {
                                    const size_t dest_ld, const size_t dest_offset,
                                    const Buffer &dest,
                                    const bool do_transpose, const bool do_conjugate,
-                                    const bool pad, const Program &program);
+                                    const bool pad, const bool upper, const bool lower,
+                                    const Program &program);
  
  // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
  // The first assumes that the program is available in the cache and will throw an exception
--- a/include/internal/routines/xsyr2k.h
+++ b/include/internal/routines/xsyr2k.h
@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
+// The implementation is very similar to Xsyrk (see header for details), except for the fact that
+// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2K_H_
+#define CLBLAST_ROUTINES_XSYR2K_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyr2k: public Routine {
+ public:
+  Xsyr2k(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const T beta,
+                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2K_H_
+#endif
--- a/include/internal/routines/xsyrk.h
+++ b/include/internal/routines/xsyrk.h
@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk routine. The precision is implemented using a template argument.
+// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
+// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
+// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for
+//    performance reasons, as the actual masking is done later (see the first point).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYRK_H_
+#define CLBLAST_ROUTINES_XSYRK_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyrk: public Routine {
+ public:
+  Xsyrk(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYRK_H_
+#endif
--- a/src/clblast.cc
+++ b/src/clblast.cc
@ -26,6 +26,8 @@
 // BLAS level-3 includes
 #include "internal/routines/xgemm.h"
 #include "internal/routines/xsymm.h"
+#include "internal/routines/xsyrk.h"
+#include "internal/routines/xsyr2k.h"

 namespace clblast {
 // =================================================================================================
@ -209,7 +211,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
  std::string kernel_source =
  #include "kernels/xgemm.opencl"
  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
-                            kernel_source);
+                              kernel_source);
  if (status != StatusCode::kSuccess) { return status; }

  // Runs the routine
@ -243,5 +245,119 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
                                  cl_mem, const size_t, const size_t,
                                  cl_command_queue*, cl_event*);

+// =================================================================================================
+
+// SYRK
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
+                const size_t n, const size_t k, const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsyrk<T>(queue_cpp, event_cpp);
+
+  // Loads the kernel source-code as an include (C++11 raw string literal)
+  std::string common_source1 =
+  #include "kernels/copy.opencl"
+  std::string common_source2 =
+  #include "kernels/pad.opencl"
+  std::string common_source3 =
+  #include "kernels/transpose.opencl"
+  std::string common_source4 =
+  #include "kernels/padtranspose.opencl"
+  std::string kernel_source =
+  #include "kernels/xgemm.opencl"
+  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+                              kernel_source);
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha,
+                        Buffer(a_buffer), a_offset, a_ld, beta,
+                        Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
+                                const size_t, const size_t, const float,
+                                const cl_mem, const size_t, const size_t, const float,
+                                cl_mem, const size_t, const size_t,
+                                cl_command_queue*, cl_event*);
+template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const double,
+                                 const cl_mem, const size_t, const size_t, const double,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const float2,
+                                 const cl_mem, const size_t, const size_t, const float2,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const double2,
+                                  const cl_mem, const size_t, const size_t, const double2,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+
+// =================================================================================================
+
+// SYR2K
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
+                const size_t n, const size_t k, const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = CommandQueue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
+
+  // Loads the kernel source-code as an include (C++11 raw string literal)
+  std::string common_source1 =
+  #include "kernels/copy.opencl"
+  std::string common_source2 =
+  #include "kernels/pad.opencl"
+  std::string common_source3 =
+  #include "kernels/transpose.opencl"
+  std::string common_source4 =
+  #include "kernels/padtranspose.opencl"
+  std::string kernel_source =
+  #include "kernels/xgemm.opencl"
+  auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+                              kernel_source);
+  if (status != StatusCode::kSuccess) { return status; }
+
+  // Runs the routine
+  return routine.DoSyr2k(layout, triangle, transpose_ab, n, k, alpha,
+                         Buffer(a_buffer), a_offset, a_ld,
+                         Buffer(b_buffer), b_offset, b_ld, beta,
+                         Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
+                                 const size_t, const size_t, const float,
+                                 const cl_mem, const size_t, const size_t,
+                                 const cl_mem, const size_t, const size_t, const float,
+                                 cl_mem, const size_t, const size_t,
+                                 cl_command_queue*, cl_event*);
+template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const double,
+                                  const cl_mem, const size_t, const size_t,
+                                  const cl_mem, const size_t, const size_t, const double,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
+                                  const size_t, const size_t, const float2,
+                                  const cl_mem, const size_t, const size_t,
+                                  const cl_mem, const size_t, const size_t, const float2,
+                                  cl_mem, const size_t, const size_t,
+                                  cl_command_queue*, cl_event*);
+template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
+                                   const size_t, const size_t, const double2,
+                                   const cl_mem, const size_t, const size_t,
+                                   const cl_mem, const size_t, const size_t, const double2,
+                                   cl_mem, const size_t, const size_t,
+                                   cl_command_queue*, cl_event*);
+
 // =================================================================================================
 } // namespace clblast
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
                          __global const real* restrict src,
                          const int dest_one, const int dest_two,
                          const int dest_ld, const int dest_offset,
-                          __global real* dest) {
+                          __global real* dest,
+                          const int upper, const int lower) {

  // Loops over the work per thread in both dimensions
  #pragma unroll
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
    #pragma unroll
    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_two >= id_one); }
+      else if (lower == 1) { condition = (id_two <= id_one); }
+      if (condition) {

        // Copies the value into the destination matrix. This is always within bounds of the source
        // matrix, as we know that the destination matrix is smaller than the source.
-        dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        if (id_two < dest_two && id_one < dest_one) {
+          dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        }
      }
    }
  }
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                   __global const real* restrict src,
                                   const int dest_one, const int dest_two,
                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest) {
+                                   __global real* dest,
+                                   const int upper, const int lower) {

  // Local memory to store a tile of the matrix (for coalescing)
  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);

-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
      }
    }
  }
--- a/src/kernels/xgemm.opencl
+++ b/src/kernels/xgemm.opencl
@ -127,6 +127,55 @@ R"(

 // =================================================================================================

+// Initializes the accumulation registers to zero
+inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWI; ++ni) {
+      #if VWM == 1
+        SetToZero(cpm[ni][mi]);
+      #elif VWM == 2
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+      #elif VWM == 4
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+        SetToZero(cpm[ni][mi].z);
+        SetToZero(cpm[ni][mi].w);
+      #elif VWM == 8
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+      #elif VWM == 16
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+        SetToZero(cpm[ni][mi].s8);
+        SetToZero(cpm[ni][mi].s9);
+        SetToZero(cpm[ni][mi].sA);
+        SetToZero(cpm[ni][mi].sB);
+        SetToZero(cpm[ni][mi].sC);
+        SetToZero(cpm[ni][mi].sD);
+        SetToZero(cpm[ni][mi].sE);
+        SetToZero(cpm[ni][mi].sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg

 // =================================================================================================

-// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
-// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
-                         const real alpha, const real beta) {
-  #pragma unroll
-  for (int ni=0; ni<NWI; ++ni) {
-    #pragma unroll
-    for (int mi=0; mi<MWI/VWM; ++mi) {
-      #if STRM == 0
-        int mg = mi + get_local_id(0)*(MWI/VWM);
-      #elif STRM == 1
-        int mg = get_local_id(0) + mi*MDIMC;
-      #endif
-      #if STRN == 0
-        int ng = ni + get_local_id(1)*NWI;
-      #elif STRN == 1
-        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
-      #endif
-      int idm = mg + get_group_id(0)*(MWG/VWM);
-      int idn = ng + get_group_id(1)*NWG;
-      int index = idn*(kSizeM/VWM) + idm;
-      realM cval = cgm[index];
-      #if VWM == 1
-        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
-      #elif VWM == 2
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-      #elif VWM == 4
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
-        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
-      #elif VWM == 8
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-      #elif VWM == 16
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
-        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
-        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
-        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
-        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
-        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
-        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
-        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
-      #endif
-    }
-  }
-}
-
-// =================================================================================================
-
 // The vectorised multiply-add function
 inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
  #if USE_VECTOR_MAD == 1
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real

 // =================================================================================================

-// Main entry of the kernel. This function contains the basic skeleton, the functionality is
-// provided by the inlined functions above
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real alpha, const real beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+                         const real alpha, const real beta) {
+  #pragma unroll
+  for (int ni=0; ni<NWI; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWI/VWM; ++mi) {
+      #if STRM == 0
+        int mg = mi + get_local_id(0)*(MWI/VWM);
+      #elif STRM == 1
+        int mg = get_local_id(0) + mi*MDIMC;
+      #endif
+      #if STRN == 0
+        int ng = ni + get_local_id(1)*NWI;
+      #elif STRN == 1
+        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
+      #endif
+      int idm = mg + get_group_id(0)*(MWG/VWM);
+      int idn = ng + get_group_id(1)*NWG;

-  // Combined thread identifier
+      // The final multiplication with alpha and the addition with beta*C
+      int index = idn*(kSizeM/VWM) + idm;
+      realM cval = cgm[index];
+      #if VWM == 1
+        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
+      #elif VWM == 2
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+      #elif VWM == 4
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
+        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
+      #elif VWM == 8
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+      #elif VWM == 16
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
+        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
+        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
+        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
+        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
+        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
+        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
+        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+                      const __global realM* restrict agm, const __global realN* restrict bgm,
+                      __global realM* cgm, realM cpm[NWI][MWI/VWM],
+                      #if SA == 1 && SB == 1
+                        __local realM* alm, __local realN* blm
+                      #elif SA == 1
+                        __local realM* alm
+                      #elif SB == 1
+                        __local realN* blm
+                      #endif
+                      ) {
+
+  // Allocates workitem-private memory (registers)
+  realM apm[MWI/VWM];
+  realN bpm[NWI/VWN];
+
+  // Combined thread identifier (volatile to disable caching)
  #if SA == 1 || SB == 1
    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
  #endif

-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-  
-  // Allocates workitem-private memory (registers)
-  realM apm[MWI/VWM];
-  realN bpm[NWI/VWN];
-  realM cpm[NWI][MWI/VWM];
-
  // Initializes the accumulation registers
-  #pragma unroll
-  for (int mi=0; mi<MWI/VWM; ++mi) {
-    #pragma unroll
-    for (int ni=0; ni<NWI; ++ni) {
-      #if VWM == 1
-        SetToZero(cpm[ni][mi]);
-      #elif VWM == 2
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-      #elif VWM == 4
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-        SetToZero(cpm[ni][mi].z);
-        SetToZero(cpm[ni][mi].w);
-      #elif VWM == 8
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-      #elif VWM == 16
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-        SetToZero(cpm[ni][mi].s8);
-        SetToZero(cpm[ni][mi].s9);
-        SetToZero(cpm[ni][mi].sA);
-        SetToZero(cpm[ni][mi].sB);
-        SetToZero(cpm[ni][mi].sC);
-        SetToZero(cpm[ni][mi].sD);
-        SetToZero(cpm[ni][mi].sE);
-        SetToZero(cpm[ni][mi].sF);
-      #endif
-    }
-  }
+  InitAccRegisters(cpm);

  // Loops over all workgroup tiles
  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
    #if SB == 1
      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
    #endif
-
-    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif
@ -552,19 +554,126 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
        MultiplyAccumulate(cpm, apm, bpm);
      }
    }
-
-    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif
  }
+}

-  // Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
+// =================================================================================================
+
+// Main entry point of the kernel. This is the regular full version.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                    const real alpha, const real beta,
+                    const __global realM* restrict agm,
+                    const __global realN* restrict bgm,
+                    __global realM* cgm) {
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }

 // =================================================================================================

+// Main entry point of the kernel. This is the upper-triangular version.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+
+  // Skip these threads if they do not contain threads contributing to the upper-triangle
+  if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+
+// Main entry point of the kernel. This is the lower-triangular version.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmLower(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+
+  // Skip these threads if they do not contain threads contributing to the lower-triangle
+  if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+
 // End of the C++11 raw string literal
 )";

--- a/src/routine.cc
+++ b/src/routine.cc
@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
                                           const size_t dest_ld, const size_t dest_offset,
                                           const Buffer &dest,
                                           const bool do_transpose, const bool do_conjugate,
-                                           const bool pad, const Program &program) {
+                                           const bool pad, const bool upper, const bool lower,
+                                           const Program &program) {

  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false);

  // Determines the right kernel
  auto kernel_name = std::string{};
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
      if (pad) {
        kernel.SetArgument(10, static_cast<int>(do_conjugate));
      }
+      else {
+        kernel.SetArgument(10, static_cast<int>(upper));
+        kernel.SetArgument(11, static_cast<int>(lower));
+      }
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
--- a/src/routines/xgemm.cc
+++ b/src/routines/xgemm.cc
@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
    // them up until they reach a certain multiple of size (kernel parameter dependent).
    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, program);
+                                    a_do_transpose, a_conjugate, true, false, false, program);
    if (ErrorIn(status)) { return status; }
    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, program);
+                                    b_do_transpose, b_conjugate, true, false, false, program);
    if (ErrorIn(status)) { return status; }

    // Only necessary for matrix C if it used both as input and output
    if (beta != static_cast<T>(0)) {
      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, program);
+                                      c_do_transpose, false, true, false, false, program);
      if (ErrorIn(status)) { return status; }
    }

@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
      // Runs the post-processing kernel
      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_do_transpose, false, false, program);
+                                      c_do_transpose, false, false, false, false, program);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation
--- a/src/routines/xsyr2k.cc
+++ b/src/routines/xsyr2k.cc
@ -0,0 +1,169 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xsyr2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                              const size_t n, const size_t k,
+                              const T alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                              const T beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
+                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto ab_conjugate = (ab_transpose == Transpose::kConjugate);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    ab_rotated, ab_conjugate, true, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    ab_rotated, ab_conjugate, true, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_b());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+      auto one = static_cast<T>(1);
+      kernel.SetArgument(3, one);
+      kernel.SetArgument(4, temp_b());
+      kernel.SetArgument(5, temp_a());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsyr2k<float>;
+template class Xsyr2k<double>;
+template class Xsyr2k<float2>;
+template class Xsyr2k<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/src/routines/xsyrk.cc
+++ b/src/routines/xsyrk.cc
@ -0,0 +1,150 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xsyrk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto a_conjugate = (a_transpose == Transpose::kConjugate);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    a_rotated, a_conjugate, true, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsyrk<float>;
+template class Xsyrk<double>;
+template class Xsyrk<float2>;
+template class Xsyrk<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/test/correctness/routines/xgemm.cc
+++ b/test/correctness/routines/xgemm.cc
@ -76,7 +76,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);

        // Runs the tests
-        tester.TestRegular(args, case_name);
+        tester.TestRegular(args, case_name, false);
        tester.TestInvalidBufferSizes(args, case_name);
      }
    }
--- a/test/correctness/routines/xsymm.cc
+++ b/test/correctness/routines/xsymm.cc
@ -76,7 +76,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);

        // Runs the tests
-        tester.TestRegular(args, case_name);
+        tester.TestRegular(args, case_name, true);
        tester.TestInvalidBufferSizes(args, case_name);
      }
    }
--- a/test/correctness/routines/xsyr2k.cc
+++ b/test/correctness/routines/xsyr2k.cc
@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class.
+//
+// =================================================================================================
+
+#include "wrapper_clblas.h"
+#include "correctness/testabc.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+template <typename T>
+void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates the CLBlast lambda
+  auto clblast_lambda = [](const Arguments<T> &args,
+                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+                           CommandQueue &queue) -> StatusCode {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    return Syr2k(args.layout, args.triangle, args.a_transpose,
+                 args.n, args.k,
+                 args.alpha,
+                 a_mat(), args.a_offset, args.a_ld,
+                 b_mat(), args.b_offset, args.b_ld,
+                 args.beta,
+                 c_mat(), args.c_offset, args.c_ld,
+                 &queue_plain, &event);
+  };
+
+  // Creates the clBLAS lambda (for comparison)
+  auto clblas_lambda = [](const Arguments<T> &args,
+                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+                          CommandQueue &queue) -> StatusCode {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k,
+                               args.alpha,
+                               a_mat(), args.a_offset, args.a_ld,
+                               b_mat(), args.b_offset, args.b_ld,
+                               args.beta,
+                               c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    return static_cast<StatusCode>(status);
+  };
+
+  // Initializes the arguments relevant for this routine
+  auto args = Arguments<T>{};
+  const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
+                                                kArgTriangle, kArgATransp,
+                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+                                                kArgAOffset, kArgBOffset, kArgCOffset};
+
+  // Creates a tester
+  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) {
+    args.layout = layout;
+    for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
+      args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
+        args.a_transpose = ab_transpose;                            // not supported by clBLAS
+        args.b_transpose = ab_transpose;
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+
+        // Runs the tests
+        tester.TestRegular(args, case_name, true);
+        tester.TestInvalidBufferSizes(args, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  clblast::Xsyr2kTest<float>(argc, argv, false, "SSYR2K");
+  clblast::Xsyr2kTest<double>(argc, argv, true, "DSYR2K");
+  clblast::Xsyr2kTest<clblast::float2>(argc, argv, true, "CSYR2K");
+  clblast::Xsyr2kTest<clblast::double2>(argc, argv, true, "ZSYR2K");
+  return 0;
+}
+
+// =================================================================================================
--- a/test/correctness/routines/xsyrk.cc
+++ b/test/correctness/routines/xsyrk.cc
@ -0,0 +1,96 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyrk routine. It is based on the TestAC class.
+//
+// =================================================================================================
+
+#include "wrapper_clblas.h"
+#include "correctness/testac.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+template <typename T>
+void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates the CLBlast lambda
+  auto clblast_lambda = [](const Arguments<T> &args,
+                           const Buffer &a_mat, const Buffer &c_mat,
+                           CommandQueue &queue) -> StatusCode {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    return Syrk(args.layout, args.triangle, args.a_transpose,
+                args.n, args.k,
+                args.alpha,
+                a_mat(), args.a_offset, args.a_ld,
+                args.beta,
+                c_mat(), args.c_offset, args.c_ld,
+                &queue_plain, &event);
+  };
+
+  // Creates the clBLAS lambda (for comparison)
+  auto clblas_lambda = [](const Arguments<T> &args,
+                          const Buffer &a_mat, const Buffer &c_mat,
+                          CommandQueue &queue) -> StatusCode {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.n, args.k,
+                              args.alpha,
+                              a_mat(), args.a_offset, args.a_ld,
+                              args.beta,
+                              c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    return static_cast<StatusCode>(status);
+  };
+
+  // Initializes the arguments relevant for this routine
+  auto args = Arguments<T>{};
+  const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
+                                                kArgTriangle, kArgATransp,
+                                                kArgALeadDim, kArgCLeadDim,
+                                                kArgAOffset, kArgCOffset};
+
+  // Creates a tester
+  TestAC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) {
+    args.layout = layout;
+    for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
+      args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
+        args.a_transpose = a_transpose;                            // not supported by clBLAS
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+
+        // Runs the tests
+        tester.TestRegular(args, case_name);
+        tester.TestInvalidBufferSizes(args, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  clblast::XsyrkTest<float>(argc, argv, false, "SSYRK");
+  clblast::XsyrkTest<double>(argc, argv, true, "DSYRK");
+  clblast::XsyrkTest<clblast::float2>(argc, argv, true, "CSYRK");
+  clblast::XsyrkTest<clblast::double2>(argc, argv, true, "ZSYRK");
+  return 0;
+}
+
+// =================================================================================================
--- a/test/correctness/testab.cc
+++ b/test/correctness/testab.cc
@ -0,0 +1,192 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestAB class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testab.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor, initializes the base class tester and input data
+template <typename T>
+TestAB<T>::TestAB(int argc, char *argv[], const bool silent,
+                  const std::string &name, const std::vector<std::string> &options,
+                  const Routine clblast_lambda, const Routine clblas_lambda):
+    Tester<T>{argc, argv, silent, name, options},
+    clblast_lambda_(clblast_lambda),
+    clblas_lambda_(clblas_lambda) {
+
+  // Computes the maximum sizes. This allows for a single set of input/output buffers.
+  auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+  // Creates test input data
+  a_source_.resize(max_dim*max_ld + max_offset);
+  b_source_.resize(max_dim*max_ld + max_offset);
+  PopulateVector(a_source_);
+  PopulateVector(b_source_);
+}
+
+// ===============================================================================================
+
+// Tests the routine for a wide variety of parameters
+template <typename T>
+void TestAB<T>::TestRegular(Arguments<T> &args, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("regular behaviour", name);
+
+  // Computes whether or not the matrices are transposed. Note that we assume a default of
+  // column-major and no-transpose. If one of them is different (but not both), then rotated
+  // is considered true.
+  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+  auto b_rotated = (args.layout == Layout::kRowMajor);
+
+  // Iterates over the matrix dimensions
+  for (auto &m: kMatrixDims) {
+    args.m = m;
+    for (auto &n: kMatrixDims) {
+      args.n = n;
+
+      // Computes the second dimensions of the matrices taking the rotation into account
+      auto a_two = (a_rotated) ? n : n;
+      auto b_two = (b_rotated) ? m : n;
+
+      // Iterates over the leading-dimension values and the offsets
+      for (auto &a_ld: kMatrixDims) {
+        args.a_ld = a_ld;
+        for (auto &a_offset: kOffsets) {
+          args.a_offset = a_offset;
+          for (auto &b_ld: kMatrixDims) {
+            args.b_ld = b_ld;
+            for (auto &b_offset: kOffsets) {
+              args.b_offset = b_offset;
+
+              // Computes the buffer sizes
+              auto a_size = a_two * a_ld + a_offset;
+              auto b_size = b_two * b_ld + b_offset;
+              if (a_size < 1 || b_size < 1) { continue; }
+
+              // Creates the OpenCL buffers
+              auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
+              auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
+              auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
+
+              // Iterates over the values for alpha and beta
+              for (auto &alpha: kAlphaValues) {
+                args.alpha = alpha;
+                for (auto &beta: kBetaValues) {
+                  args.beta = beta;
+
+                  // Runs the reference clBLAS code
+                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+                  r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
+                  auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
+
+                  // Runs the CLBlast code
+                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+                  s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
+                  auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
+
+                  // Tests for equality of the two status codes
+                  if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+                    TestErrorCodes(status1, status2, args);
+                    continue;
+                  }
+
+                  // Downloads the results
+                  std::vector<T> r_result(b_size, static_cast<T>(0));
+                  std::vector<T> s_result(b_size, static_cast<T>(0));
+                  r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result);
+                  s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result);
+
+                  // Checks for differences in the output
+                  auto errors = size_t{0};
+                  for (auto idm=size_t{0}; idm<m; ++idm) {
+                    for (auto idn=size_t{0}; idn<n; ++idn) {
+                      auto index = (args.layout == Layout::kRowMajor) ?
+                                    idm*args.b_ld + idn + args.b_offset:
+                                    idn*args.b_ld + idm + args.b_offset;
+                      if (!TestSimilarity(r_result[index], s_result[index])) {
+                        errors++;
+                      }
+                    }
+                  }
+
+                  // Tests the error count (should be zero)
+                  TestErrorCount(errors, m*n, args);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
+// does not test for results (if any).
+template <typename T>
+void TestAB<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("invalid buffer sizes", name);
+
+  // Sets example test parameters
+  args.m = kBufferSize;
+  args.n = kBufferSize;
+  args.a_ld = kBufferSize;
+  args.b_ld = kBufferSize;
+  args.a_offset = 0;
+  args.b_offset = 0;
+
+  // Iterates over test buffer sizes
+  const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
+  for (auto &a_size: kBufferSizes) {
+    for (auto &b_size: kBufferSizes) {
+
+      // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+      // want to be able to create invalid buffers (no error checking here).
+      auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
+      auto a_mat = Buffer(a);
+      auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
+      auto r_mat = Buffer(r);
+      auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
+      auto s_mat = Buffer(s);
+
+      // Runs the two routines
+      auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
+      auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
+
+      // Tests for equality of the two status codes
+      TestErrorCodes(status1, status2, args);
+    }
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class TestAB<float>;
+template class TestAB<double>;
+template class TestAB<float2>;
+template class TestAB<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/test/correctness/testab.h
+++ b/test/correctness/testab.h
@ -0,0 +1,85 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing
+// all sorts of input combinations, and one deliberatly testing with invalid values.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestAB: public Tester<T> {
+ public:
+
+  // Uses several variables from the Tester class
+  using Tester<T>::context_;
+  using Tester<T>::queue_;
+  using Tester<T>::kLayouts;
+  using Tester<T>::kTransposes;
+
+  // Uses several helper functions from the Tester class
+  using Tester<T>::TestStart;
+  using Tester<T>::TestEnd;
+  using Tester<T>::TestSimilarity;
+  using Tester<T>::TestErrorCount;
+  using Tester<T>::TestErrorCodes;
+  using Tester<T>::GetExampleScalars;
+  using Tester<T>::GetOffsets;
+  using Tester<T>::PrecisionSupported;
+
+  // Test settings for the regular test. Append to this list in case more tests are required.
+  const std::vector<size_t> kMatrixDims = { 7, 64 };
+  const std::vector<size_t> kOffsets = GetOffsets();
+  const std::vector<T> kAlphaValues = GetExampleScalars();
+  const std::vector<T> kBetaValues = GetExampleScalars();
+
+  // Test settings for the invalid test
+  const size_t kBufferSize = 64;
+
+  // Shorthand for a BLAS routine
+  using Routine = std::function<StatusCode(const Arguments<T>&,
+                                           const Buffer&, const Buffer&,
+                                           CommandQueue&)>;
+
+  // Constructor, initializes the base class tester and input data
+  TestAB(int argc, char *argv[], const bool silent,
+         const std::string &name, const std::vector<std::string> &options,
+         const Routine clblast_lambda, const Routine clblas_lambda);
+
+  // The test functions, taking no inputs
+  void TestRegular(Arguments<T> &args, const std::string &name);
+  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
+
+ private:
+
+  // Source data to test with
+  std::vector<T> a_source_;
+  std::vector<T> b_source_;
+  
+  // The routines to test
+  Routine clblast_lambda_;
+  Routine clblas_lambda_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTAB_H_
+#endif
--- a/test/correctness/testabc.cc
+++ b/test/correctness/testabc.cc
@ -45,7 +45,7 @@ TestABC<T>::TestABC(int argc, char *argv[], const bool silent,

 // Tests the routine for a wide variety of parameters
 template <typename T>
-void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
+void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric) {
  if (!PrecisionSupported()) { return; }
  TestStart("regular behaviour", name);

@ -63,6 +63,7 @@ void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
    args.m = m;
    for (auto &n: kMatrixDims) {
      args.n = n;
+      if (symmetric && m != n) { continue; }
      for (auto &k: kMatrixDims) {
        args.k = k;

--- a/test/correctness/testabc.h
+++ b/test/correctness/testabc.h
@ -64,7 +64,7 @@ class TestABC: public Tester<T> {
          const Routine clblast_lambda, const Routine clblas_lambda);

  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
+  void TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric);
  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);

 private:
--- a/test/correctness/testac.cc
+++ b/test/correctness/testac.cc
@ -0,0 +1,191 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestAC class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testac.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor, initializes the base class tester and input data
+template <typename T>
+TestAC<T>::TestAC(int argc, char *argv[], const bool silent,
+                  const std::string &name, const std::vector<std::string> &options,
+                  const Routine clblast_lambda, const Routine clblas_lambda):
+    Tester<T>{argc, argv, silent, name, options},
+    clblast_lambda_(clblast_lambda),
+    clblas_lambda_(clblas_lambda) {
+
+  // Computes the maximum sizes. This allows for a single set of input/output buffers.
+  auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+  // Creates test input data
+  a_source_.resize(max_dim*max_ld + max_offset);
+  c_source_.resize(max_dim*max_ld + max_offset);
+  PopulateVector(a_source_);
+  PopulateVector(c_source_);
+}
+
+// ===============================================================================================
+
+// Tests the routine for a wide variety of parameters
+template <typename T>
+void TestAC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("regular behaviour", name);
+
+  // Computes whether or not the matrices are transposed. Note that we assume a default of
+  // column-major and no-transpose. If one of them is different (but not both), then rotated
+  // is considered true.
+  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+  auto c_rotated = (args.layout == Layout::kRowMajor);
+
+  // Iterates over the matrix dimensions
+  for (auto &n: kMatrixDims) {
+    args.n = n;
+    for (auto &k: kMatrixDims) {
+      args.k = k;
+
+      // Computes the second dimensions of the matrices taking the rotation into account
+      auto a_two = (a_rotated) ? n : k;
+      auto c_two = (c_rotated) ? n : n;
+
+      // Iterates over the leading-dimension values and the offsets
+      for (auto &a_ld: kMatrixDims) {
+        args.a_ld = a_ld;
+        for (auto &a_offset: kOffsets) {
+          args.a_offset = a_offset;
+          for (auto &c_ld: kMatrixDims) {
+            args.c_ld = c_ld;
+            for (auto &c_offset: kOffsets) {
+              args.c_offset = c_offset;
+
+              // Computes the buffer sizes
+              auto a_size = a_two * a_ld + a_offset;
+              auto c_size = c_two * c_ld + c_offset;
+              if (a_size < 1 || c_size < 1) { continue; }
+
+              // Creates the OpenCL buffers
+              auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
+              auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
+              auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
+
+              // Iterates over the values for alpha and beta
+              for (auto &alpha: kAlphaValues) {
+                args.alpha = alpha;
+                for (auto &beta: kBetaValues) {
+                  args.beta = beta;
+
+                  // Runs the reference clBLAS code
+                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+                  r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
+                  auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
+
+                  // Runs the CLBlast code
+                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+                  s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
+                  auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
+
+                  // Tests for equality of the two status codes
+                  if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+                    TestErrorCodes(status1, status2, args);
+                    continue;
+                  }
+
+                  // Downloads the results
+                  std::vector<T> r_result(c_size, static_cast<T>(0));
+                  std::vector<T> s_result(c_size, static_cast<T>(0));
+                  r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
+                  s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
+
+                  // Checks for differences in the output
+                  auto errors = size_t{0};
+                  for (auto idn0=size_t{0}; idn0<n; ++idn0) {
+                    for (auto idn1=size_t{0}; idn1<n; ++idn1) {
+                      auto index = idn0*args.c_ld + idn1 + args.c_offset;
+                      if (!TestSimilarity(r_result[index], s_result[index])) {
+                        errors++;
+                      }
+                    }
+                  }
+
+                  // Tests the error count (should be zero)
+                  TestErrorCount(errors, n*n, args);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
+// does not test for results (if any).
+template <typename T>
+void TestAC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("invalid buffer sizes", name);
+
+  // Sets example test parameters
+  args.m = kBufferSize;
+  args.n = kBufferSize;
+  args.k = kBufferSize;
+  args.a_ld = kBufferSize;
+  args.c_ld = kBufferSize;
+  args.a_offset = 0;
+  args.c_offset = 0;
+
+  // Iterates over test buffer sizes
+  const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
+  for (auto &a_size: kBufferSizes) {
+    for (auto &c_size: kBufferSizes) {
+
+      // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+      // want to be able to create invalid buffers (no error checking here).
+      auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
+      auto a_mat = Buffer(a);
+      auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
+      auto r_mat = Buffer(r);
+      auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
+      auto s_mat = Buffer(s);
+
+      // Runs the two routines
+      auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
+      auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
+
+      // Tests for equality of the two status codes
+      TestErrorCodes(status1, status2, args);
+    }
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class TestAC<float>;
+template class TestAC<double>;
+template class TestAC<float2>;
+template class TestAC<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/test/correctness/testac.h
+++ b/test/correctness/testac.h
@ -0,0 +1,85 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing
+// all sorts of input combinations, and one deliberatly testing with invalid values.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestAC: public Tester<T> {
+ public:
+
+  // Uses several variables from the Tester class
+  using Tester<T>::context_;
+  using Tester<T>::queue_;
+  using Tester<T>::kLayouts;
+  using Tester<T>::kTransposes;
+
+  // Uses several helper functions from the Tester class
+  using Tester<T>::TestStart;
+  using Tester<T>::TestEnd;
+  using Tester<T>::TestSimilarity;
+  using Tester<T>::TestErrorCount;
+  using Tester<T>::TestErrorCodes;
+  using Tester<T>::GetExampleScalars;
+  using Tester<T>::GetOffsets;
+  using Tester<T>::PrecisionSupported;
+
+  // Test settings for the regular test. Append to this list in case more tests are required.
+  const std::vector<size_t> kMatrixDims = { 7, 64 };
+  const std::vector<size_t> kOffsets = GetOffsets();
+  const std::vector<T> kAlphaValues = GetExampleScalars();
+  const std::vector<T> kBetaValues = GetExampleScalars();
+
+  // Test settings for the invalid test
+  const size_t kBufferSize = 64;
+
+  // Shorthand for a BLAS routine
+  using Routine = std::function<StatusCode(const Arguments<T>&,
+                                           const Buffer&, const Buffer&,
+                                           CommandQueue&)>;
+
+  // Constructor, initializes the base class tester and input data
+  TestAC(int argc, char *argv[], const bool silent,
+         const std::string &name, const std::vector<std::string> &options,
+         const Routine clblast_lambda, const Routine clblas_lambda);
+
+  // The test functions, taking no inputs
+  void TestRegular(Arguments<T> &args, const std::string &name);
+  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
+
+ private:
+
+  // Source data to test with
+  std::vector<T> a_source_;
+  std::vector<T> c_source_;
+  
+  // The routines to test
+  Routine clblast_lambda_;
+  Routine clblas_lambda_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTAC_H_
+#endif
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@ -115,7 +115,7 @@ void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
    auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;

    // Computes the vector sizes in case the matrix is transposed
-    auto a_transposed = (args.a_transpose == Transpose::kYes);
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
    auto m_real = (a_transposed) ? args.n : args.m;
    auto n_real = (a_transposed) ? args.m : args.n;

@ -163,13 +163,13 @@ template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vec

 // =================================================================================================

-// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
+// This is the matrix-matrix variant of the set-up/tear-down client routine.
 template <typename T>
-void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-                     const std::vector<std::string> &options) {
+void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
+              const std::vector<std::string> &options) {

  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [](const Arguments<T> args) { return args.m; };
+  auto default_ld_a = [](const Arguments<T> args) { return args.k; };

  // Simple command line argument parser with defaults
  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
@ -188,9 +188,83 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
  // Computes whether or not the matrices are transposed. Note that we assume a default of
  // column-major and no-transpose. If one of them is different (but not both), then rotated
  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
+  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
+
+  // Iterates over all "num_step" values jumping by "step" each time
+  auto s = size_t{0};
+  while(true) {
+
+    // Computes the data sizes
+    auto a_two = (a_rotated) ? args.n : args.k;
+    auto a_size = a_two * args.a_ld + args.a_offset;
+    auto c_size = args.n * args.c_ld + args.c_offset;
+
+    // Populates input host matrices with random data
+    std::vector<T> a_source(a_size);
+    std::vector<T> c_source(c_size);
+    PopulateVector(a_source);
+    PopulateVector(c_source);
+
+    // Creates the matrices on the device
+    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
+    auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
+    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
+    c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
+
+    // Runs the routine-specific code
+    client_routine(args, a_buffer, c_buffer, queue);
+
+    // Makes the jump to the next step
+    ++s;
+    if (s >= args.num_steps) { break; }
+    args.n += args.step;
+    args.k += args.step;
+    args.a_ld += args.step;
+    args.c_ld += args.step;
+  }
+
+  // Cleans-up and returns
+  if (args.compare_clblas) { clblasTeardown(); }
+}
+
+// Compiles the above function
+template void ClientAC<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
+template void ClientAC<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
+template void ClientAC<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
+template void ClientAC<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
+
+// =================================================================================================
+
+// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
+template <typename T>
+void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
+               const std::vector<std::string> &options, const bool symmetric) {
+
+  // Function to determine how to find the default value of the leading dimension of matrix A
+  auto default_ld_a = [&symmetric](const Arguments<T> args) { return (symmetric) ? args.n : args.m; };
+
+  // Simple command line argument parser with defaults
+  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
+  if (args.print_help) { return; }
+  if (symmetric) { args.m = args.n; }
+
+  // Prints the header of the output table
+  PrintTableHeader(args.silent, options);
+
+  // Initializes OpenCL and the libraries
+  auto platform = Platform(args.platform_id);
+  auto device = Device(platform, kDeviceType, args.device_id);
+  auto context = Context(device);
+  auto queue = CommandQueue(context, device);
+  if (args.compare_clblas) { clblasSetup(); }
+
+  // Computes whether or not the matrices are transposed. Note that we assume a default of
+  // column-major and no-transpose. If one of them is different (but not both), then rotated
+  // is considered true.
+  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
  auto c_rotated = (args.layout == Layout::kRowMajor);

@ -241,10 +315,10 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
 }

 // Compiles the above function
-template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
-template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
-template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
-template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
+template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&, const bool);
+template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&, const bool);
+template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&, const bool);
+template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&, const bool);

 // =================================================================================================

--- a/test/performance/client.h
+++ b/test/performance/client.h
@ -52,8 +52,11 @@ template <typename T>
 void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
               const std::vector<std::string> &options);
 template <typename T>
+void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
+              const std::vector<std::string> &options);
+template <typename T>
 void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options);
+               const std::vector<std::string> &options, const bool symmetric);

 // =================================================================================================

--- a/test/performance/graphs/xsyr2k.r
+++ b/test/performance/graphs/xsyr2k.r
@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xsyr2k routine
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xsyr2k"
+parameters <- c("-n","-k","-layout","-triangle","-transA",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "multiples of 128",
+  "multiples of 128 (+1)",
+  "around n=k=512",
+  "around n=k=1536",
+  "layouts and transposing (n=k=1024)",
+  "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c(1536, 1536, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+  ),
+  list(
+    c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+    c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+    c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+    c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+    c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+    c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+    c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+  )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "layout (row/col), triangle (u/l), transA (n/y)",
+  "matrix sizes (n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
+              "col,u,n", "col,u,y", "col,l,n", "col,l,y")),
+  c("n", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
--- a/test/performance/graphs/xsyrk.r
+++ b/test/performance/graphs/xsyrk.r
@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xsyrk routine
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xsyrk"
+parameters <- c("-n","-k","-layout","-triangle","-transA",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "multiples of 128",
+  "multiples of 128 (+1)",
+  "around n=k=512",
+  "around n=k=2048",
+  "layouts and transposing (n=k=1024)",
+  "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+  list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+  list(
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+  ),
+  list(
+    c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+    c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+    c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+    c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+    c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+    c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+    c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+    c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+    c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+    c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+    c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+  )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "matrix sizes (n=k)",
+  "layout (row/col), triangle (u/l), transA (n/y)",
+  "matrix sizes (n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  c("n", ""),
+  list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
+              "col,u,n", "col,u,y", "col,l,n", "col,l,y")),
+  c("n", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
--- a/test/performance/routines/xgemm.cc
+++ b/test/performance/routines/xgemm.cc
@ -72,7 +72,7 @@ void PerformanceXgemm(const Arguments<T> &args,

  // Prints the performance of both libraries
  const auto flops = 2 * args.m * args.n * args.k;
-  const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T);
+  const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
  const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
                                               static_cast<size_t>(args.layout),
                                               static_cast<size_t>(args.a_transpose),
@ -96,10 +96,10 @@ void ClientXgemm(int argc, char *argv[]) {
                                          kArgAlpha, kArgBeta};
  switch(GetPrecision(argc, argv)) {
    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o); break;
-    case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o, false); break;
+    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o, false); break;
+    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXgemm<float2>, o, false); break;
+    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXgemm<double2>, o, false); break;
  }
 }

--- a/test/performance/routines/xsymm.cc
+++ b/test/performance/routines/xsymm.cc
@ -72,7 +72,7 @@ void PerformanceXsymm(const Arguments<T> &args,

  // Prints the performance of both libraries
  const auto flops = 2 * args.m * args.n * args.m;
-  const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T);
+  const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
  const auto output_ints = std::vector<size_t>{args.m, args.n,
                                               static_cast<size_t>(args.layout),
                                               static_cast<size_t>(args.triangle),
@ -96,10 +96,10 @@ void ClientXsymm(int argc, char *argv[]) {
                                          kArgAlpha, kArgBeta};
  switch(GetPrecision(argc, argv)) {
    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o); break;
-    case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o, false); break;
+    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o, false); break;
+    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsymm<float2>, o, false); break;
+    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsymm<double2>, o, false); break;
  }
 }

--- a/test/performance/routines/xsyr2k.cc
+++ b/test/performance/routines/xsyr2k.cc
@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k command-line interface tester.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <exception>
+
+#include "wrapper_clblas.h"
+#include "performance/client.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The client, used for performance testing. It contains the function calls to CLBlast and to other
+// libraries to compare against.
+template <typename T>
+void PerformanceXsyr2k(const Arguments<T> &args,
+                       const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+                       CommandQueue &queue) {
+
+  // Creates the CLBlast lambda
+  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k,
+                        args.alpha,
+                        a_mat(), args.a_offset, args.a_ld,
+                        b_mat(), args.b_offset, args.b_ld,
+                        args.beta,
+                        c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    if (status != StatusCode::kSuccess) {
+      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
+    }
+  };
+
+  // Creates the clBLAS lambda (for comparison)
+  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k,
+                               args.alpha,
+                               a_mat(), args.a_offset, args.a_ld,
+                               b_mat(), args.b_offset, args.b_ld,
+                               args.beta,
+                               c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    if (status != CL_SUCCESS) {
+      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
+    }
+  };
+
+  // Runs the routines and collect the timings
+  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
+  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
+
+  // Prints the performance of both libraries
+  const auto flops = 2 * args.n * args.n * args.k;
+  const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
+  const auto output_ints = std::vector<size_t>{args.n, args.k,
+                                               static_cast<size_t>(args.layout),
+                                               static_cast<size_t>(args.triangle),
+                                               static_cast<size_t>(args.a_transpose),
+                                               args.a_ld, args.b_ld, args.c_ld,
+                                               args.a_offset, args.b_offset, args.c_offset};
+  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
+                                                       ToString(args.beta)};
+  PrintTableRow(output_ints, output_strings, args.no_abbrv,
+                ms_clblast, ms_clblas, flops, bytes);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void ClientXsyr2k(int argc, char *argv[]) {
+  const auto o = std::vector<std::string>{kArgN, kArgK,
+                                          kArgLayout, kArgTriangle, kArgATransp,
+                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+                                          kArgAOffset, kArgBOffset, kArgCOffset,
+                                          kArgAlpha, kArgBeta};
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsyr2k<float>, o, true); break;
+    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsyr2k<double>, o, true); break;
+    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsyr2k<float2>, o, true); break;
+    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsyr2k<double2>, o, true); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  clblast::ClientXsyr2k(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
--- a/test/performance/routines/xsyrk.cc
+++ b/test/performance/routines/xsyrk.cc
@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk command-line interface tester.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <exception>
+
+#include "wrapper_clblas.h"
+#include "performance/client.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The client, used for performance testing. It contains the function calls to CLBlast and to other
+// libraries to compare against.
+template <typename T>
+void PerformanceXsyrk(const Arguments<T> &args,
+                      const Buffer &a_mat, const Buffer &c_mat,
+                      CommandQueue &queue) {
+
+  // Creates the CLBlast lambda
+  auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syrk(args.layout, args.triangle, args.a_transpose,
+                       args.n, args.k,
+                       args.alpha,
+                       a_mat(), args.a_offset, args.a_ld,
+                       args.beta,
+                       c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    if (status != StatusCode::kSuccess) {
+      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
+    }
+  };
+
+  // Creates the clBLAS lambda (for comparison)
+  auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.n, args.k,
+                              args.alpha,
+                              a_mat(), args.a_offset, args.a_ld,
+                              args.beta,
+                              c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    if (status != CL_SUCCESS) {
+      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
+    }
+  };
+
+  // Runs the routines and collect the timings
+  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
+  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
+
+  // Prints the performance of both libraries
+  const auto flops = args.n * args.n * args.k;
+  const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
+  const auto output_ints = std::vector<size_t>{args.n, args.k,
+                                               static_cast<size_t>(args.layout),
+                                               static_cast<size_t>(args.triangle),
+                                               static_cast<size_t>(args.a_transpose),
+                                               args.a_ld, args.c_ld,
+                                               args.a_offset, args.c_offset};
+  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
+                                                       ToString(args.beta)};
+  PrintTableRow(output_ints, output_strings, args.no_abbrv,
+                ms_clblast, ms_clblas, flops, bytes);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void ClientXsyrk(int argc, char *argv[]) {
+  const auto o = std::vector<std::string>{kArgN, kArgK,
+                                          kArgLayout, kArgTriangle, kArgATransp,
+                                          kArgALeadDim, kArgCLeadDim,
+                                          kArgAOffset, kArgCOffset,
+                                          kArgAlpha, kArgBeta};
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: ClientAC<float>(argc, argv, PerformanceXsyrk<float>, o); break;
+    case Precision::kDouble: ClientAC<double>(argc, argv, PerformanceXsyrk<double>, o); break;
+    case Precision::kComplexSingle: ClientAC<float2>(argc, argv, PerformanceXsyrk<float2>, o); break;
+    case Precision::kComplexDouble: ClientAC<double2>(argc, argv, PerformanceXsyrk<double2>, o); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  clblast::ClientXsyrk(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@ -201,7 +201,7 @@ clblasStatus clblasXgemm(
                       num_queues, queues, num_wait_events, wait_events, events);
 }

-// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded.
+// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded.
 clblasStatus clblasXsymm(
  clblasOrder layout, clblasSide side, clblasUplo triangle,
  size_t m, size_t n, float alpha,
@ -267,6 +267,130 @@ clblasStatus clblasXsymm(
                       num_queues, queues, num_wait_events, wait_events, events);
 }

+// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded.
+clblasStatus clblasXsyrk(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
+  size_t n, size_t k, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasSsyrk(layout, triangle, tran_a,
+                       n, k, alpha,
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
+  size_t n, size_t k, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDsyrk(layout, triangle, tran_a,
+                       n, k, alpha,
+                       a_mat, a_offset, a_ld, beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
+  size_t n, size_t k, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
+    auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
+    return clblasCsyrk(layout, triangle, tran_a,
+                       n, k, cl_alpha,
+                       a_mat, a_offset, a_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyrk(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
+  size_t n, size_t k, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
+    auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
+    return clblasZsyrk(layout, triangle, tran_a,
+                       n, k, cl_alpha,
+                       a_mat, a_offset, a_ld, cl_beta,
+                       c_mat, c_offset, c_ld,
+                       num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded.
+clblasStatus clblasXsyr2k(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
+  size_t n, size_t k, float alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasSsyr2k(layout, triangle, tran_ab,
+                        n, k, alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
+  size_t n, size_t k, double alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    return clblasDsyr2k(layout, triangle, tran_ab,
+                        n, k, alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
+  size_t n, size_t k, float2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
+    auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
+    return clblasCsyr2k(layout, triangle, tran_ab,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, cl_beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsyr2k(
+  clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
+  size_t n, size_t k, double2 alpha,
+  const cl_mem a_mat, size_t a_offset, size_t a_ld,
+  const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
+  cl_mem c_mat, size_t c_offset, size_t c_ld,
+  cl_uint num_queues, cl_command_queue *queues,
+  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+    auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
+    auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
+    return clblasZsyr2k(layout, triangle, tran_ab,
+                        n, k, cl_alpha,
+                        a_mat, a_offset, a_ld,
+                        b_mat, b_offset, b_ld, cl_beta,
+                        c_mat, c_offset, c_ld,
+                        num_queues, queues, num_wait_events, wait_events, events);
+}
+
 // =================================================================================================
 } // namespace clblast