Merge pull request #10 from CNugteren/test_infrastructure

Re-organized test infrastructure
2024-07-15 19:05:44 +02:00 · 2015-06-29 20:45:10 +02:00 · 2015-06-29 20:45:10 +02:00 · cbf2eef179
parent 77e2157485 3726f6a618
commit cbf2eef179
38 changed files with 1736 additions and 2716 deletions
--- a/1
+++ b/1
@@ -1,5 +1,6 @@

 Development version (next release)
+- Re-organized test/client infrastructure to avoid code duplication
 - Added level-3 routines:
  * SSYRK/DSYRK/CSYRK/ZSYRK
  * SSYR2K/DSYR2K/CSYR2K/ZSYR2K
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,12 +95,10 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
 set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
 set(SAMPLE_PROGRAMS sgemm)
-set(ROUTINES_XY xaxpy)
-set(ROUTINES_AXY xgemv)
-set(ROUTINES_ABC xgemm xsymm xsyr2k)
-set(ROUTINES_AB )
-set(ROUTINES_AC xsyrk)
-set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC})
+set(ROUTINES
+  xaxpy
+  xgemv
+  xgemm xsymm xsyrk xsyr2k)

 # ==================================================================================================

@@ -170,45 +168,14 @@ if(TESTS)
  include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})

  # Creates the common correctness-tests objects (requires CMake 2.8.8)
-  add_library(test_correctness_common OBJECT test/correctness/tester.cc)
-  add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
-  add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
-  add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
-  add_library(test_correctness_ab OBJECT test/correctness/testab.cc)
-  add_library(test_correctness_ac OBJECT test/correctness/testac.cc)
+  add_library(test_correctness_common OBJECT
+              test/correctness/tester.cc test/correctness/testblas.cc)

  # Compiles the correctness-tests
-  foreach(ROUTINE ${ROUTINES_XY})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_xy>
-                   test/correctness/routines/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${ROUTINES_AXY})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_axy>
-                   test/correctness/routines/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${ROUTINES_ABC})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_abc>
-                   test/correctness/routines/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${ROUTINES_AB})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_ab>
-                   test/correctness/routines/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${ROUTINES_AC})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   $<TARGET_OBJECTS:test_correctness_ac>
-                   test/correctness/routines/${ROUTINE}.cc)
-  endforeach()
  foreach(ROUTINE ${ROUTINES})
+    add_executable(test_${ROUTINE}
+                   $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/${ROUTINE}.cc)
    target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
    install(TARGETS test_${ROUTINE} DESTINATION bin)
  endforeach()
@@ -217,7 +184,6 @@ if(TESTS)
  add_library(test_performance_common OBJECT test/performance/client.cc)

  # Compiles the performance-tests
-  set(TEST_PERF_COMM )
  foreach(ROUTINE ${ROUTINES})
    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
                   test/performance/routines/${ROUTINE}.cc)
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -105,6 +105,11 @@ struct Arguments {
  size_t c_offset = 0;
  T alpha = T{1.0};
  T beta = T{1.0};
+  size_t x_size = 1;
+  size_t y_size = 1;
+  size_t a_size = 1;
+  size_t b_size = 1;
+  size_t c_size = 1;
  // Tuner-specific arguments
  double fraction = 1.0;
  // Client-specific arguments
@@ -123,6 +128,15 @@ struct Arguments {
  bool no_abbrv = false;
 };

+// Structure containing all possible buffers for test clients
+struct Buffers {
+  Buffer x_vec;
+  Buffer y_vec;
+  Buffer a_mat;
+  Buffer b_mat;
+  Buffer c_mat;
+};
+
 // =================================================================================================

 // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
--- a/test/correctness/routines/xaxpy.cc
+++ b/test/correctness/routines/xaxpy.cc
@@ -1,63 +1,69 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
+// This file implements the tests for the Xaxpy routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testxy.h"
+#include "correctness/testblas.h"
+#include "routines/xaxpy.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &x_vec, const Buffer &y_vec,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Axpy(args.n, args.alpha,
-                x_vec(), args.x_offset, args.x_inc,
-                y_vec(), args.y_offset, args.y_inc,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &x_vec, const Buffer &y_vec,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXaxpy(args.n, args.alpha,
-                              x_vec(), args.x_offset, args.x_inc,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
-                                                kArgXOffset, kArgYOffset, kArgAlpha};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
+                     TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
+                     TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
+                     TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};
+
+  // Creates the arguments vector for the regular tests
+  auto regular_test_vector = std::vector<Arguments<T>>{};
+  for (auto &n: tester.kVectorDims) { args.n = n;
+    for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
+      for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
+        for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
+          for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
+            for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+              args.x_size = TestXaxpy<T>::GetSizeX(args);
+              args.y_size = TestXaxpy<T>::GetSizeY(args);
+              if (args.x_size<1 || args.y_size<1) { continue; }
+              regular_test_vector.push_back(args);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Creates the arguments vector for the invalid-buffer tests
+  auto invalid_test_vector = std::vector<Arguments<T>>{};
+  args.n = tester.kBufferSize;
+  args.x_inc = args.y_inc = 1;
+  args.x_offset = args.y_offset = 0;
+  for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
+    for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
+      invalid_test_vector.push_back(args);
+    }
+  }

  // Runs the tests
  const auto case_name = "default";
-  tester.TestRegular(args, case_name);
-  tester.TestInvalidBufferSizes(args, case_name);
+  tester.TestRegular(regular_test_vector, case_name);
+  tester.TestInvalid(invalid_test_vector, case_name);
 }

 // =================================================================================================
@@ -65,10 +71,10 @@ void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &nam

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
-  clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
-  clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
-  clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
+  clblast::RunTest<float>(argc, argv, false, "SAXPY");
+  clblast::RunTest<double>(argc, argv, true, "DAXPY");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
  return 0;
 }

--- a/test/correctness/routines/xgemm.cc
+++ b/test/correctness/routines/xgemm.cc
@@ -1,83 +1,87 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
+// This file implements the tests for the Xgemm routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testabc.h"
+#include "correctness/testblas.h"
+#include "routines/xgemm.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Gemm(args.layout, args.a_transpose, args.b_transpose,
-                args.m, args.n, args.k,
-                args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                b_mat(), args.b_offset, args.b_ld,
-                args.beta,
-                c_mat(), args.c_offset, args.c_ld,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasTranspose>(args.b_transpose),
-                              args.m, args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
-                                                kArgATransp, kArgBTransp,
-                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgBOffset, kArgCOffset};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
+                     TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
+                     TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
+                     TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};

  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) {
-      args.a_transpose = a_transpose;
-      for (auto &b_transpose: tester.kTransposes) {
-        args.b_transpose = b_transpose;
-        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+      for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
+
+        // Creates the arguments vector for the regular tests
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &k: tester.kMatrixDims) { args.k = k;
+              for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+                for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                  for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                    for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                      for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                        for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                          for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                            for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                              args.a_size = TestXgemm<T>::GetSizeA(args);
+                              args.b_size = TestXgemm<T>::GetSizeB(args);
+                              args.c_size = TestXgemm<T>::GetSizeC(args);
+                              if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                              regular_test_vector.push_back(args);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }

        // Runs the tests
-        tester.TestRegular(args, case_name, false);
-        tester.TestInvalidBufferSizes(args, case_name);
+        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
@@ -88,10 +92,10 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
-  clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
-  clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
-  clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
+  clblast::RunTest<float>(argc, argv, false, "SGEMM");
+  clblast::RunTest<double>(argc, argv, true, "DGEMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
  return 0;
 }

--- a/test/correctness/routines/xgemv.cc
+++ b/test/correctness/routines/xgemv.cc
@@ -1,74 +1,85 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
+// This file implements the tests for the Xgemv routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testaxy.h"
+#include "correctness/testblas.h"
+#include "routines/xgemv.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                x_vec(), args.x_offset, args.x_inc, args.beta,
-                y_vec(), args.y_offset, args.y_inc,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.m, args.n, args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              x_vec(), args.x_offset, args.x_inc, args.beta,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
-                                                kArgALeadDim, kArgXInc, kArgYInc,
-                                                kArgAOffset, kArgXOffset, kArgYOffset};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
+                     TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
+                     TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
+                     TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};

  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) {
-      args.a_transpose = a_transpose;
-      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+
+      // Creates the arguments vector for the regular tests
+      auto regular_test_vector = std::vector<Arguments<T>>{};
+      for (auto &m: tester.kMatrixVectorDims) { args.m = m;
+        for (auto &n: tester.kMatrixVectorDims) { args.n = n;
+          for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
+            for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+              for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
+                for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
+                  for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
+                    for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
+                      for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                        for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                          args.a_size = TestXgemv<T>::GetSizeA(args);
+                          args.x_size = TestXgemv<T>::GetSizeX(args);
+                          args.y_size = TestXgemv<T>::GetSizeY(args);
+                          if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
+                          regular_test_vector.push_back(args);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Creates the arguments vector for the invalid-buffer tests
+      auto invalid_test_vector = std::vector<Arguments<T>>{};
+      args.m = args.n = tester.kBufferSize;
+      args.a_ld = tester.kBufferSize;
+      args.x_inc = args.y_inc = 1;
+      args.a_offset = args.x_offset = args.y_offset = 0;
+      for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+        for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
+          for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+      }

      // Runs the tests
-      tester.TestRegular(args, case_name);
-      tester.TestInvalidBufferSizes(args, case_name);
+      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
+      tester.TestRegular(regular_test_vector, case_name);
+      tester.TestInvalid(invalid_test_vector, case_name);
    }
  }
 }
@@ -78,10 +89,10 @@ void XgemvTest(int argc, char *argv[], const bool silent, const std::string &nam

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
-  clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
-  clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
-  clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
+  clblast::RunTest<float>(argc, argv, false, "SGEMV");
+  clblast::RunTest<double>(argc, argv, true, "DGEMV");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
  return 0;
 }

--- a/test/correctness/routines/xsymm.cc
+++ b/test/correctness/routines/xsymm.cc
@@ -1,83 +1,85 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
+// This file implements the tests for the Xsymm routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testabc.h"
+#include "correctness/testblas.h"
+#include "routines/xsymm.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Symm(args.layout, args.side, args.triangle,
-                args.m, args.n,
-                args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                b_mat(), args.b_offset, args.b_ld,
-                args.beta,
-                c_mat(), args.c_offset, args.c_ld,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
-                                                kArgSide, kArgTriangle,
-                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgBOffset, kArgCOffset};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
+                     TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
+                     TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
+                     TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};

  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &side: {Side::kLeft, Side::kRight}) {
-      args.side = side;
-      for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
-        args.triangle = triangle;
-        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+
+        // Creates the arguments vector for the regular tests
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsymm<T>::GetSizeA(args);
+                            args.b_size = TestXsymm<T>::GetSizeB(args);
+                            args.c_size = TestXsymm<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }

        // Runs the tests
-        tester.TestRegular(args, case_name, true);
-        tester.TestInvalidBufferSizes(args, case_name);
+        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
@ -88,10 +90,10 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
-  clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
-  clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
-  clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
+  clblast::RunTest<float>(argc, argv, false, "SSYMM");
+  clblast::RunTest<double>(argc, argv, true, "DSYMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
  return 0;
 }

--- a/test/correctness/routines/xsyr2k.cc
+++ b/test/correctness/routines/xsyr2k.cc
@ -1,84 +1,87 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class.
+// This file implements the tests for the Xsyr2k routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testabc.h"
+#include "correctness/testblas.h"
+#include "routines/xsyr2k.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Syr2k(args.layout, args.triangle, args.a_transpose,
-                 args.n, args.k,
-                 args.alpha,
-                 a_mat(), args.a_offset, args.a_ld,
-                 b_mat(), args.b_offset, args.b_ld,
-                 args.beta,
-                 c_mat(), args.c_offset, args.c_ld,
-                 &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
-                               static_cast<clblasUplo>(args.triangle),
-                               static_cast<clblasTranspose>(args.a_transpose),
-                               args.n, args.k,
-                               args.alpha,
-                               a_mat(), args.a_offset, args.a_ld,
-                               b_mat(), args.b_offset, args.b_ld,
-                               args.beta,
-                               c_mat(), args.c_offset, args.c_ld,
-                               1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
-                                                kArgTriangle, kArgATransp,
-                                                kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgBOffset, kArgCOffset};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
+                     TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
+                     TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
+                     TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};

  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
-      args.triangle = triangle;
-      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
-        args.a_transpose = ab_transpose;                            // not supported by clBLAS
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = ab_transpose;                            // is not supported by clBLAS
        args.b_transpose = ab_transpose;
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+
+        // Creates the arguments vector for the regular tests
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsyr2k<T>::GetSizeA(args);
+                            args.b_size = TestXsyr2k<T>::GetSizeB(args);
+                            args.c_size = TestXsyr2k<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }

        // Runs the tests
-        tester.TestRegular(args, case_name, true);
-        tester.TestInvalidBufferSizes(args, case_name);
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
@ -89,10 +92,10 @@ void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &na

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::Xsyr2kTest<float>(argc, argv, false, "SSYR2K");
-  clblast::Xsyr2kTest<double>(argc, argv, true, "DSYR2K");
-  clblast::Xsyr2kTest<clblast::float2>(argc, argv, true, "CSYR2K");
-  clblast::Xsyr2kTest<clblast::double2>(argc, argv, true, "ZSYR2K");
+  clblast::RunTest<float>(argc, argv, false, "SSYR2K");
+  clblast::RunTest<double>(argc, argv, true, "DSYR2K");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
  return 0;
 }

--- a/test/correctness/routines/xsyrk.cc
+++ b/test/correctness/routines/xsyrk.cc
@ -1,81 +1,79 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the tests for the Xsyrk routine. It is based on the TestAC class.
+// This file implements the tests for the Xsyrk routine.
 //
 // =================================================================================================

-#include "wrapper_clblas.h"
-#include "correctness/testac.h"
+#include "correctness/testblas.h"
+#include "routines/xsyrk.h"

 namespace clblast {
 // =================================================================================================

-// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+// The correctness tester
 template <typename T>
-void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [](const Arguments<T> &args,
-                           const Buffer &a_mat, const Buffer &c_mat,
-                           CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    return Syrk(args.layout, args.triangle, args.a_transpose,
-                args.n, args.k,
-                args.alpha,
-                a_mat(), args.a_offset, args.a_ld,
-                args.beta,
-                c_mat(), args.c_offset, args.c_ld,
-                &queue_plain, &event);
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [](const Arguments<T> &args,
-                          const Buffer &a_mat, const Buffer &c_mat,
-                          CommandQueue &queue) -> StatusCode {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasUplo>(args.triangle),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    return static_cast<StatusCode>(status);
-  };
-
-  // Initializes the arguments relevant for this routine
-  auto args = Arguments<T>{};
-  const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
-                                                kArgTriangle, kArgATransp,
-                                                kArgALeadDim, kArgCLeadDim,
-                                                kArgAOffset, kArgCOffset};
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Creates a tester
-  TestAC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
+  TestBlas<T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
+                     TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
+                     TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
+                     TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine
+  auto args = Arguments<T>{};

  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) {
-    args.layout = layout;
-    for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
-      args.triangle = triangle;
-      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
-        args.a_transpose = a_transpose;                            // not supported by clBLAS
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = a_transpose;                            // is not supported by clBLAS
+
+        // Creates the arguments vector for the regular tests
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                        args.a_size = TestXsyrk<T>::GetSizeA(args);
+                        args.c_size = TestXsyrk<T>::GetSizeC(args);
+                        if (args.a_size<1 || args.c_size<1) { continue; }
+                        regular_test_vector.push_back(args);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+            invalid_test_vector.push_back(args);
+          }
+        }

        // Runs the tests
-        tester.TestRegular(args, case_name);
-        tester.TestInvalidBufferSizes(args, case_name);
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
      }
    }
  }
@ -86,10 +84,10 @@ void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &nam

 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::XsyrkTest<float>(argc, argv, false, "SSYRK");
-  clblast::XsyrkTest<double>(argc, argv, true, "DSYRK");
-  clblast::XsyrkTest<clblast::float2>(argc, argv, true, "CSYRK");
-  clblast::XsyrkTest<clblast::double2>(argc, argv, true, "ZSYRK");
+  clblast::RunTest<float>(argc, argv, false, "SSYRK");
+  clblast::RunTest<double>(argc, argv, true, "DSYRK");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
  return 0;
 }

--- a/test/correctness/testab.cc
+++ b/test/correctness/testab.cc
@ -1,192 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestAB class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testab.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor, initializes the base class tester and input data
-template <typename T>
-TestAB<T>::TestAB(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options,
-                  const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-
-  // Creates test input data
-  a_source_.resize(max_dim*max_ld + max_offset);
-  b_source_.resize(max_dim*max_ld + max_offset);
-  PopulateVector(a_source_);
-  PopulateVector(b_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters
-template <typename T>
-void TestAB<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kRowMajor);
-
-  // Iterates over the matrix dimensions
-  for (auto &m: kMatrixDims) {
-    args.m = m;
-    for (auto &n: kMatrixDims) {
-      args.n = n;
-
-      // Computes the second dimensions of the matrices taking the rotation into account
-      auto a_two = (a_rotated) ? n : n;
-      auto b_two = (b_rotated) ? m : n;
-
-      // Iterates over the leading-dimension values and the offsets
-      for (auto &a_ld: kMatrixDims) {
-        args.a_ld = a_ld;
-        for (auto &a_offset: kOffsets) {
-          args.a_offset = a_offset;
-          for (auto &b_ld: kMatrixDims) {
-            args.b_ld = b_ld;
-            for (auto &b_offset: kOffsets) {
-              args.b_offset = b_offset;
-
-              // Computes the buffer sizes
-              auto a_size = a_two * a_ld + a_offset;
-              auto b_size = b_two * b_ld + b_offset;
-              if (a_size < 1 || b_size < 1) { continue; }
-
-              // Creates the OpenCL buffers
-              auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-              auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
-              auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
-
-              // Iterates over the values for alpha and beta
-              for (auto &alpha: kAlphaValues) {
-                args.alpha = alpha;
-                for (auto &beta: kBetaValues) {
-                  args.beta = beta;
-
-                  // Runs the reference clBLAS code
-                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                  r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                  auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
-
-                  // Runs the CLBlast code
-                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                  s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                  auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
-
-                  // Tests for equality of the two status codes
-                  if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                    TestErrorCodes(status1, status2, args);
-                    continue;
-                  }
-
-                  // Downloads the results
-                  std::vector<T> r_result(b_size, static_cast<T>(0));
-                  std::vector<T> s_result(b_size, static_cast<T>(0));
-                  r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result);
-                  s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result);
-
-                  // Checks for differences in the output
-                  auto errors = size_t{0};
-                  for (auto idm=size_t{0}; idm<m; ++idm) {
-                    for (auto idn=size_t{0}; idn<n; ++idn) {
-                      auto index = (args.layout == Layout::kRowMajor) ?
-                                    idm*args.b_ld + idn + args.b_offset:
-                                    idn*args.b_ld + idm + args.b_offset;
-                      if (!TestSimilarity(r_result[index], s_result[index])) {
-                        errors++;
-                      }
-                    }
-                  }
-
-                  // Tests the error count (should be zero)
-                  TestErrorCount(errors, m*n, args);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
-// does not test for results (if any).
-template <typename T>
-void TestAB<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Sets example test parameters
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.b_ld = kBufferSize;
-  args.a_offset = 0;
-  args.b_offset = 0;
-
-  // Iterates over test buffer sizes
-  const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
-  for (auto &a_size: kBufferSizes) {
-    for (auto &b_size: kBufferSizes) {
-
-      // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
-      // want to be able to create invalid buffers (no error checking here).
-      auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
-      auto a_mat = Buffer(a);
-      auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
-      auto r_mat = Buffer(r);
-      auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
-      auto s_mat = Buffer(s);
-
-      // Runs the two routines
-      auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
-      auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
-
-      // Tests for equality of the two status codes
-      TestErrorCodes(status1, status2, args);
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class TestAB<float>;
-template class TestAB<double>;
-template class TestAB<float2>;
-template class TestAB<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/test/correctness/testab.h
+++ b/test/correctness/testab.h
@ -1,85 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestAB: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kMatrixDims = { 7, 64 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestAB(int argc, char *argv[], const bool silent,
-         const std::string &name, const std::vector<std::string> &options,
-         const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> a_source_;
-  std::vector<T> b_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTAB_H_
-#endif
--- a/test/correctness/testabc.cc
+++ b/test/correctness/testabc.cc
@ -1,218 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestABC class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testabc.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructs the tester: forwards the command-line and configuration arguments to the Tester
-// base class, stores the two routines under test, and fills the host-side source matrices.
-template <typename T>
-TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
-                    const std::string &name, const std::vector<std::string> &options,
-                    const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Dimensions and leading dimensions are both drawn from kMatrixDims, so the largest entry
-  // bounds both. Sizing for the worst case allows a single set of host data for all tests.
-  const auto largest_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-  const auto source_size = largest_dim*largest_dim + largest_offset;
-
-  // Allocates and populates the source matrices one after the other
-  a_source_.resize(source_size);
-  PopulateVector(a_source_);
-  b_source_.resize(source_size);
-  PopulateVector(b_source_);
-  c_source_.resize(source_size);
-  PopulateVector(c_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters: every combination of m, n, k, the three
-// leading dimensions, the three offsets, alpha and beta taken from the test-setting lists. For
-// each combination the clBLAS reference and the CLBlast routine are run on identical inputs and
-// their outputs are compared element by element. When 'symmetric' is set, combinations with
-// m != n are skipped. The layout and transpose options are read from 'args' as set by the caller;
-// all other fields of 'args' used here are overwritten.
-template <typename T>
-void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-  auto c_rotated = (args.layout == Layout::kRowMajor);
-
-  // Iterates over the matrix dimensions
-  for (auto &m: kMatrixDims) {
-    args.m = m;
-    for (auto &n: kMatrixDims) {
-      args.n = n;
-      // Symmetric routines are only tested with square (m == n) problem sizes
-      if (symmetric && m != n) { continue; }
-      for (auto &k: kMatrixDims) {
-        args.k = k;
-
-        // Computes the second dimensions of the matrices taking the rotation into account
-        auto a_two = (a_rotated) ? m : k;
-        auto b_two = (b_rotated) ? k : n;
-        auto c_two = (c_rotated) ? m : n;
-
-        // Iterates over the leading-dimension values and the offsets
-        for (auto &a_ld: kMatrixDims) {
-          args.a_ld = a_ld;
-          for (auto &a_offset: kOffsets) {
-            args.a_offset = a_offset;
-            for (auto &b_ld: kMatrixDims) {
-              args.b_ld = b_ld;
-              for (auto &b_offset: kOffsets) {
-                args.b_offset = b_offset;
-                for (auto &c_ld: kMatrixDims) {
-                  args.c_ld = c_ld;
-                  for (auto &c_offset: kOffsets) {
-                    args.c_offset = c_offset;
-
-                    // Computes the buffer sizes (in elements) and skips combinations for which
-                    // one of the buffers would be empty
-                    auto a_size = a_two * a_ld + a_offset;
-                    auto b_size = b_two * b_ld + b_offset;
-                    auto c_size = c_two * c_ld + c_offset;
-                    if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
-
-                    // Creates the OpenCL buffers. There are two output buffers: r_mat receives
-                    // the clBLAS (reference) result and s_mat receives the CLBlast result.
-                    auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-                    auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
-                    auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-                    auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-
-                    // Iterates over the values for alpha and beta
-                    for (auto &alpha: kAlphaValues) {
-                      args.alpha = alpha;
-                      for (auto &beta: kBetaValues) {
-                        args.beta = beta;
-
-                        // Runs the reference clBLAS code. The inputs are uploaded again for every
-                        // run so that both libraries start from the same source data.
-                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                        r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                        auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
-
-                        // Runs the CLBlast code
-                        a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                        b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
-                        s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                        auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
-
-                        // Tests for equality of the two status codes. If either routine did not
-                        // succeed, only the status codes are compared and the outputs are not.
-                        if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                          TestErrorCodes(status1, status2, args);
-                          continue;
-                        }
-
-                        // Downloads the results
-                        std::vector<T> r_result(c_size, static_cast<T>(0));
-                        std::vector<T> s_result(c_size, static_cast<T>(0));
-                        r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
-                        s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
-
-                        // Checks for differences in the output. Only the m by n elements of C are
-                        // compared; 'index' is the position of element (idm, idn) for the
-                        // selected layout.
-                        auto errors = size_t{0};
-                        for (auto idm=size_t{0}; idm<m; ++idm) {
-                          for (auto idn=size_t{0}; idn<n; ++idn) {
-                            auto index = (args.layout == Layout::kRowMajor) ?
-                                          idm*args.c_ld + idn + args.c_offset:
-                                          idn*args.c_ld + idm + args.c_offset;
-                            if (!TestSimilarity(r_result[index], s_result[index])) {
-                              errors++;
-                            }
-                          }
-                        }
-
-                        // Tests the error count (should be zero)
-                        TestErrorCount(errors, m*n, args);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Checks that both libraries react identically to OpenCL buffers of invalid size. Every
-// combination of an empty, a one-element-too-small and an exactly-fitting buffer is tried for
-// A, B and C. Only the returned status codes are compared; results (if any) are not inspected.
-template <typename T>
-void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Fixed problem description: equal sizes and leading dimensions, no offsets
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.k = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.a_offset = 0;
-  args.b_ld = kBufferSize;
-  args.b_offset = 0;
-  args.c_ld = kBufferSize;
-  args.c_offset = 0;
-
-  // The buffer sizes (in elements) to try for each of the matrices
-  const auto full_size = kBufferSize*kBufferSize;
-  const std::vector<size_t> sizes = {0, full_size - 1, full_size};
-  for (auto &a_elems: sizes) {
-    for (auto &b_elems: sizes) {
-      for (auto &c_elems: sizes) {
-
-        // The plain OpenCL C API is used on purpose instead of the C++ version: no error checking
-        // is done here, so that buffers of invalid size can be requested and wrapped.
-        auto a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_elems*sizeof(T),
-                                    nullptr, nullptr);
-        auto a_mat = Buffer(a_raw);
-        auto b_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_elems*sizeof(T),
-                                    nullptr, nullptr);
-        auto b_mat = Buffer(b_raw);
-        auto r_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_elems*sizeof(T),
-                                    nullptr, nullptr);
-        auto r_mat = Buffer(r_raw);
-        auto s_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_elems*sizeof(T),
-                                    nullptr, nullptr);
-        auto s_mat = Buffer(s_raw);
-
-        // Reference (clBLAS) first, then CLBlast; the two status codes have to match
-        auto reference_status = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
-        auto clblast_status = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
-        TestErrorCodes(reference_status, clblast_status, args);
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class: explicitly instantiates TestABC for each of the four data-types
-// below. The member functions are defined in this source file rather than in the header, so
-// these explicit instantiations are what make them available to other translation units.
-// NOTE(review): float2/double2 are presumably the complex types; they are declared elsewhere.
-template class TestABC<float>;
-template class TestABC<double>;
-template class TestABC<float2>;
-template class TestABC<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/test/correctness/testabc.h
+++ b/test/correctness/testabc.h
@ -1,86 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberately testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Correctness tester for routines operating on three matrices (A, B and C). It derives from
-// Tester<T>, adds the test settings and the host-side source data, and holds the two routines
-// (CLBlast and the clBLAS reference) whose behaviour is compared. See also the comment at the
-// top of the file.
-template <typename T>
-class TestABC: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class. Tester<T> is a dependent base class, so these
-  // using-declarations allow the names to be used unqualified within this class.
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  // kMatrixDims supplies the values for m, n and k as well as for the leading dimensions.
-  const std::vector<size_t> kMatrixDims = { 7, 64 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test: used for all dimensions and leading dimensions
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine: takes the routine arguments, the buffers for A, B and C (in
-  // that order) and the command queue, and returns a status code.
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestABC(int argc, char *argv[], const bool silent,
-          const std::string &name, const std::vector<std::string> &options,
-          const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions. 'args' carries the routine arguments and is modified by the tests;
-  // 'name' is passed on to TestStart. 'symmetric' restricts the regular test to cases with m == n.
-  void TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with (host-side, uploaded to the OpenCL buffers before every run)
-  std::vector<T> a_source_;
-  std::vector<T> b_source_;
-  std::vector<T> c_source_;
-  
-  // The routines to test: the CLBlast routine and the clBLAS reference it is compared against
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
-#endif
--- a/test/correctness/testac.cc
+++ b/test/correctness/testac.cc
@ -1,191 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestAC class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testac.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructs the tester: forwards the command-line and configuration arguments to the Tester
-// base class, stores the two routines under test, and fills the host-side source matrices.
-template <typename T>
-TestAC<T>::TestAC(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options,
-                  const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Dimensions and leading dimensions are both drawn from kMatrixDims, so the largest entry
-  // bounds both. Sizing for the worst case allows a single set of host data for all tests.
-  const auto largest_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-  const auto source_size = largest_dim*largest_dim + largest_offset;
-
-  // Allocates and populates the source matrices one after the other
-  a_source_.resize(source_size);
-  PopulateVector(a_source_);
-  c_source_.resize(source_size);
-  PopulateVector(c_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters: every combination of n, k, the two
-// leading dimensions, the two offsets, alpha and beta taken from the test-setting lists. For
-// each combination the clBLAS reference and the CLBlast routine are run on identical inputs and
-// their n by n outputs are compared element by element. The layout and transpose options are
-// read from 'args' as set by the caller; all other fields of 'args' used here are overwritten.
-template <typename T>
-void TestAC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Computes whether or not the A matrix is transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-
-  // Iterates over the matrix dimensions
-  for (auto &n: kMatrixDims) {
-    args.n = n;
-    for (auto &k: kMatrixDims) {
-      args.k = k;
-
-      // Computes the second dimensions of the matrices. For A the rotation is taken into
-      // account; C is square (n by n), so its second dimension is n irrespective of the layout.
-      auto a_two = (a_rotated) ? n : k;
-      auto c_two = n;
-
-      // Iterates over the leading-dimension values and the offsets
-      for (auto &a_ld: kMatrixDims) {
-        args.a_ld = a_ld;
-        for (auto &a_offset: kOffsets) {
-          args.a_offset = a_offset;
-          for (auto &c_ld: kMatrixDims) {
-            args.c_ld = c_ld;
-            for (auto &c_offset: kOffsets) {
-              args.c_offset = c_offset;
-
-              // Computes the buffer sizes (in elements) and skips combinations for which one of
-              // the buffers would be empty
-              auto a_size = a_two * a_ld + a_offset;
-              auto c_size = c_two * c_ld + c_offset;
-              if (a_size < 1 || c_size < 1) { continue; }
-
-              // Creates the OpenCL buffers. There are two output buffers: r_mat receives the
-              // clBLAS (reference) result and s_mat receives the CLBlast result.
-              auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-              auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-              auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
-
-              // Iterates over the values for alpha and beta
-              for (auto &alpha: kAlphaValues) {
-                args.alpha = alpha;
-                for (auto &beta: kBetaValues) {
-                  args.beta = beta;
-
-                  // Runs the reference clBLAS code. The inputs are uploaded again for every run
-                  // so that both libraries start from the same source data.
-                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                  r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                  auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
-
-                  // Runs the CLBlast code
-                  a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                  s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
-                  auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
-
-                  // Tests for equality of the two status codes. If either routine did not
-                  // succeed, only the status codes are compared and the outputs are not.
-                  if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                    TestErrorCodes(status1, status2, args);
-                    continue;
-                  }
-
-                  // Downloads the results
-                  std::vector<T> r_result(c_size, static_cast<T>(0));
-                  std::vector<T> s_result(c_size, static_cast<T>(0));
-                  r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
-                  s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
-
-                  // Checks for differences in the output. The same linear index is used for both
-                  // layouts: since C is square, this visits the same set of n*n elements.
-                  auto errors = size_t{0};
-                  for (auto idn0=size_t{0}; idn0<n; ++idn0) {
-                    for (auto idn1=size_t{0}; idn1<n; ++idn1) {
-                      auto index = idn0*args.c_ld + idn1 + args.c_offset;
-                      if (!TestSimilarity(r_result[index], s_result[index])) {
-                        errors++;
-                      }
-                    }
-                  }
-
-                  // Tests the error count (should be zero)
-                  TestErrorCount(errors, n*n, args);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Checks that both libraries react identically to OpenCL buffers of invalid size. Every
-// combination of an empty, a one-element-too-small and an exactly-fitting buffer is tried for
-// A and C. Only the returned status codes are compared; results (if any) are not inspected.
-template <typename T>
-void TestAC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Fixed problem description: equal sizes and leading dimensions, no offsets
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.k = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.a_offset = 0;
-  args.c_ld = kBufferSize;
-  args.c_offset = 0;
-
-  // The buffer sizes (in elements) to try for each of the matrices
-  const auto full_size = kBufferSize*kBufferSize;
-  const std::vector<size_t> sizes = {0, full_size - 1, full_size};
-  for (auto &a_elems: sizes) {
-    for (auto &c_elems: sizes) {
-
-      // The plain OpenCL C API is used on purpose instead of the C++ version: no error checking
-      // is done here, so that buffers of invalid size can be requested and wrapped.
-      auto a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_elems*sizeof(T),
-                                  nullptr, nullptr);
-      auto a_mat = Buffer(a_raw);
-      auto r_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_elems*sizeof(T),
-                                  nullptr, nullptr);
-      auto r_mat = Buffer(r_raw);
-      auto s_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_elems*sizeof(T),
-                                  nullptr, nullptr);
-      auto s_mat = Buffer(s_raw);
-
-      // Reference (clBLAS) first, then CLBlast; the two status codes have to match
-      auto reference_status = clblas_lambda_(args, a_mat, r_mat, queue_);
-      auto clblast_status = clblast_lambda_(args, a_mat, s_mat, queue_);
-      TestErrorCodes(reference_status, clblast_status, args);
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class: explicitly instantiates TestAC for each of the four data-types
-// below. The member functions are defined in this source file rather than in the header, so
-// these explicit instantiations are what make them available to other translation units.
-// NOTE(review): float2/double2 are presumably the complex types; they are declared elsewhere.
-template class TestAC<float>;
-template class TestAC<double>;
-template class TestAC<float2>;
-template class TestAC<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/test/correctness/testac.h
+++ b/test/correctness/testac.h
@ -1,85 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberately testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Correctness tester for routines operating on two matrices (A and C). It derives from
-// Tester<T>, adds the test settings and the host-side source data, and holds the two routines
-// (CLBlast and the clBLAS reference) whose behaviour is compared. See also the comment at the
-// top of the file.
-template <typename T>
-class TestAC: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class. Tester<T> is a dependent base class, so these
-  // using-declarations allow the names to be used unqualified within this class.
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  // kMatrixDims supplies the values for n and k as well as for the leading dimensions.
-  const std::vector<size_t> kMatrixDims = { 7, 64 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test: used for all dimensions and leading dimensions
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine: takes the routine arguments, the buffers for A and C (in that
-  // order) and the command queue, and returns a status code.
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestAC(int argc, char *argv[], const bool silent,
-         const std::string &name, const std::vector<std::string> &options,
-         const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions. 'args' carries the routine arguments and is modified by the tests;
-  // 'name' is passed on to TestStart.
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with (host-side, uploaded to the OpenCL buffers before every run)
-  std::vector<T> a_source_;
-  std::vector<T> c_source_;
-  
-  // The routines to test: the CLBlast routine and the clBLAS reference it is compared against
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTAC_H_
-#endif
--- a/test/correctness/testaxy.cc
+++ b/test/correctness/testaxy.cc
@ -1,213 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestAXY class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testaxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructs the tester: forwards the command-line and configuration arguments to the Tester
-// base class, stores the two routines under test, and fills the host-side source data for the
-// matrix A and the vectors X and Y.
-template <typename T>
-TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
-                    const std::string &name, const std::vector<std::string> &options,
-                    const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Dimensions and leading dimensions are both drawn from kMatrixVectorDims, so the largest
-  // entry bounds both. Sizing for the worst case allows a single set of host data for all tests.
-  const auto largest_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
-  const auto largest_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
-  const auto largest_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-  const auto matrix_size = largest_dim*largest_dim + largest_offset;
-  const auto vector_size = largest_dim*largest_inc + largest_offset;
-
-  // Allocates and populates the source matrix and vectors one after the other
-  a_source_.resize(matrix_size);
-  PopulateVector(a_source_);
-  x_source_.resize(vector_size);
-  PopulateVector(x_source_);
-  y_source_.resize(vector_size);
-  PopulateVector(y_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters: every combination of m, n, the leading
-// dimension and offset of A, the increments and offsets of X and Y, alpha and beta taken from
-// the test-setting lists. For each combination the clBLAS reference and the CLBlast routine are
-// run on identical inputs and their output vectors are compared element by element. The layout
-// and transpose options are read from 'args' as set by the caller; all other fields of 'args'
-// used here are overwritten.
-template <typename T>
-void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Iterates over the dimension for the matrix and vectors
-  for (auto &m: kMatrixVectorDims) {
-    args.m = m;
-    for (auto &n: kMatrixVectorDims) {
-      args.n = n;
-
-      // Computes the second dimension of the matrix taking the rotation into account
-      auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
-
-      // Computes the vector sizes in case the matrix is transposed. Every option other than
-      // no-transpose swaps the lengths of the input and output vectors, so the test is against
-      // Transpose::kNo rather than for one specific transposed option.
-      auto a_transposed = (args.a_transpose != Transpose::kNo);
-      auto m_real = (a_transposed) ? n : m;
-      auto n_real = (a_transposed) ? m : n;
-
-      // Iterates over the leading-dimension values and the offsets of the matrix
-      for (auto &a_ld: kMatrixVectorDims) {
-        args.a_ld = a_ld;
-        for (auto &a_offset: kOffsets) {
-          args.a_offset = a_offset;
-
-          // Iterates over the increment-values and the offsets of the vectors
-          for (auto &x_inc: kIncrements) {
-            args.x_inc = x_inc;
-            for (auto &x_offset: kOffsets) {
-              args.x_offset = x_offset;
-              for (auto &y_inc: kIncrements) {
-                args.y_inc = y_inc;
-                for (auto &y_offset: kOffsets) {
-                  args.y_offset = y_offset;
-
-                  // Computes the buffer sizes (in elements) and skips combinations for which one
-                  // of the buffers would be empty
-                  auto a_size = a_two * a_ld + a_offset;
-                  auto x_size = n_real * x_inc + x_offset;
-                  auto y_size = m_real * y_inc + y_offset;
-                  if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
-
-                  // Creates the OpenCL buffers. There are two output buffers: r_vec receives the
-                  // clBLAS (reference) result and s_vec receives the CLBlast result.
-                  auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
-                  auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
-                  auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-                  auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-
-                  // Iterates over the values for alpha and beta
-                  for (auto &alpha: kAlphaValues) {
-                    args.alpha = alpha;
-                    for (auto &beta: kBetaValues) {
-                      args.beta = beta;
-
-                      // Runs the reference clBLAS code. The inputs are uploaded again for every
-                      // run so that both libraries start from the same source data.
-                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-                      r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-                      auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
-
-                      // Runs the CLBlast code
-                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
-                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-                      s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-                      auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
-
-                      // Tests for equality of the two status codes. If either routine did not
-                      // succeed, only the status codes are compared and the outputs are not.
-                      if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                        TestErrorCodes(status1, status2, args);
-                        continue;
-                      }
-
-                      // Downloads the results
-                      std::vector<T> r_result(y_size, static_cast<T>(0));
-                      std::vector<T> s_result(y_size, static_cast<T>(0));
-                      r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
-                      s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
-
-                      // Checks for differences in the output. Only the m_real elements that are
-                      // addressed through the increment and offset of Y are compared.
-                      auto errors = size_t{0};
-                      for (auto idm=size_t{0}; idm<m_real; ++idm) {
-                        auto index = idm*y_inc + y_offset;
-                        if (!TestSimilarity(r_result[index], s_result[index])) {
-                          errors++;
-                        }
-                      }
-
-                      // Tests the error count (should be zero)
-                      TestErrorCount(errors, m_real, args);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Checks that both libraries react identically to OpenCL buffers of invalid size. Every
-// combination of an empty, a one-element-too-small and an exactly-fitting buffer is tried for
-// A, X and Y, each in combination with every pair of increments from kInvalidIncrements. Only
-// the returned status codes are compared; results (if any) are not inspected.
-template <typename T>
-void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Fixed problem description: equal sizes and leading dimension, no offsets
-  args.m = kBufferSize;
-  args.n = kBufferSize;
-  args.a_ld = kBufferSize;
-  args.a_offset = 0;
-  args.x_offset = 0;
-  args.y_offset = 0;
-
-  // The buffer sizes (in elements) to try for the matrix and for the vectors
-  const auto full_matrix = kBufferSize*kBufferSize;
-  const std::vector<size_t> matrix_sizes = {0, full_matrix - 1, full_matrix};
-  const std::vector<size_t> vector_sizes = {0, kBufferSize - 1, kBufferSize};
-  for (auto &a_elems: matrix_sizes) {
-    for (auto &x_elems: vector_sizes) {
-      for (auto &y_elems: vector_sizes) {
-
-        // Tries every pair of test increments for the current buffer sizes
-        for (auto &x_step: kInvalidIncrements) {
-          args.x_inc = x_step;
-          for (auto &y_step: kInvalidIncrements) {
-            args.y_inc = y_step;
-
-            // The plain OpenCL C API is used on purpose instead of the C++ version: no error
-            // checking is done here, so that buffers of invalid size can be requested and wrapped.
-            auto a_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_elems*sizeof(T),
-                                        nullptr, nullptr);
-            auto a_mat = Buffer(a_raw);
-            auto x_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_elems*sizeof(T),
-                                        nullptr, nullptr);
-            auto x_vec = Buffer(x_raw);
-            auto r_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_elems*sizeof(T),
-                                        nullptr, nullptr);
-            auto r_vec = Buffer(r_raw);
-            auto s_raw = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_elems*sizeof(T),
-                                        nullptr, nullptr);
-            auto s_vec = Buffer(s_raw);
-
-            // Reference (clBLAS) first, then CLBlast; the two status codes have to match
-            auto reference_status = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
-            auto clblast_status = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
-            TestErrorCodes(reference_status, clblast_status, args);
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class: explicitly instantiates TestAXY for each of the four data-types
-// below. The member functions are defined in this source file rather than in the header, so
-// these explicit instantiations are what make them available to other translation units.
-// NOTE(review): float2/double2 are presumably the complex types; they are declared elsewhere.
-template class TestAXY<float>;
-template class TestAXY<double>;
-template class TestAXY<float2>;
-template class TestAXY<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/test/correctness/testaxy.h
+++ b/test/correctness/testaxy.h
@ -1,88 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestAXY: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-  using Tester<T>::kLayouts;
-  using Tester<T>::kTransposes;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<size_t> kIncrements = { 1, 2 };
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-  const std::vector<T> kBetaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
-  const size_t kBufferSize = 64;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestAXY(int argc, char *argv[], const bool silent,
-          const std::string &name, const std::vector<std::string> &options,
-          const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> a_source_;
-  std::vector<T> x_source_;
-  std::vector<T> y_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
-#endif
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@ -0,0 +1,185 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestBlas class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose-options to test with (data-type dependent): the float and double specializations
+// list kNo and kYes only, whereas the float2 and double2 specializations also list kConjugate.
+template <> const std::vector<Transpose> TestBlas<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
+template <> const std::vector<Transpose> TestBlas<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
+template <> const std::vector<Transpose> TestBlas<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
+template <> const std::vector<Transpose> TestBlas<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
+
+// =================================================================================================
+
+// Constructor. Forwards argc/argv/silent/name/options to the Tester<T> base class, stores the six
+// routine-specific callbacks (routine, reference, result download, result indexing and the two
+// result-iteration bounds) and fills the host-side source vectors x/y/a/b/c with input data.
+template <typename T>
+TestBlas<T>::TestBlas(int argc, char *argv[], const bool silent,
+                      const std::string &name, const std::vector<std::string> &options,
+                      const Routine run_routine, const Routine run_reference,
+                      const ResultGet get_result, const ResultIndex get_index,
+                      const ResultIterator get_id1, const ResultIterator get_id2):
+    Tester<T>{argc, argv, silent, name, options},
+    run_routine_(run_routine),
+    run_reference_(run_reference),
+    get_result_(get_result),
+    get_index_(get_index),
+    get_id1_(get_id1),
+    get_id2_(get_id2) {
+
+  // Computes the maximum sizes. This allows for a single set of input/output buffers. Note that
+  // max_ld is taken from kMatrixDims as well, so it always equals max_mat.
+  auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
+  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
+  auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
+  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+  // Creates test input data: vectors hold (largest dimension * largest increment + largest offset)
+  // elements, matrices hold (largest dimension * largest leading dimension + largest offset)
+  // elements. All five vectors are then filled by PopulateVector.
+  x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
+  y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
+  a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  PopulateVector(x_source_);
+  PopulateVector(y_source_);
+  PopulateVector(a_source_);
+  PopulateVector(b_source_);
+  PopulateVector(c_source_);
+}
+
+// ===============================================================================================
+
+// Tests the routine for a wide variety of parameters. Each entry of 'test_vector' is one complete
+// set of arguments: the reference and the routine under test each get their own five device
+// buffers (sized by args.x_size, y_size, a_size, b_size and c_size) filled with the same host
+// data, after which their status codes and, when both succeed, their results are compared.
+// Returns immediately when PrecisionSupported() is false.
+template <typename T>
+void TestBlas<T>::TestRegular(std::vector<Arguments<T>> &test_vector, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("regular behaviour", name);
+
+  // Iterates over all the to-be-tested combinations of arguments
+  for (auto &args: test_vector) {
+
+    // Runs the reference clBLAS code on its own set of buffers
+    auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
+    y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
+    a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
+    b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
+    c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
+    auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1};
+    auto status1 = run_reference_(args, buffers1, queue_);
+
+    // Runs the CLBlast code on a second, identically initialised set of buffers
+    auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
+    y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
+    a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
+    b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
+    c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
+    auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2};
+    auto status2 = run_routine_(args, buffers2, queue_);
+
+    // Tests for equality of the two status codes. If either run did not succeed, only the status
+    // codes are compared and the result comparison below is skipped for this entry.
+    if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+      TestErrorCodes(status1, status2, args);
+      continue;
+    }
+
+    // Downloads the results (which buffer is read is decided by the get_result_ callback)
+    auto result1 = get_result_(args, buffers1, queue_);
+    auto result2 = get_result_(args, buffers2, queue_);
+
+    // Checks for differences in the output: get_id1_ and get_id2_ give the two loop bounds and
+    // get_index_ maps each (id1, id2) pair onto a position in the downloaded results
+    auto errors = size_t{0};
+    for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
+      for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
+        auto index = get_index_(args, id1, id2);
+        if (!TestSimilarity(result1[index], result2[index])) {
+          errors++;
+        }
+      }
+    }
+
+    // Tests the error count (should be zero) against the total number of compared elements
+    TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
+// does not test for results (if any). Each entry of 'test_vector' supplies the buffer sizes via
+// args.x_size, y_size, a_size, b_size and c_size; the reference and the routine under test each
+// get their own set of buffers. Returns immediately when PrecisionSupported() is false.
+template <typename T>
+void TestBlas<T>::TestInvalid(std::vector<Arguments<T>> &test_vector, const std::string &name) {
+  if (!PrecisionSupported()) { return; }
+  TestStart("invalid buffer sizes", name);
+
+  // Iterates over all the to-be-tested combinations of arguments
+  for (auto &args: test_vector) {
+
+    // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+    // want to be able to create invalid buffers (no error checking here): the error-code pointer
+    // passed to clCreateBuffer is null, so a failed creation is not reported at this point.
+    // NOTE(review): whether the raw handles are released depends on the Buffer(cl_mem)
+    // constructor/destructor, which is not visible here - confirm.
+    auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+    auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+    auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+    auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+    auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto x_vec1 = Buffer(x1);
+    auto y_vec1 = Buffer(y1);
+    auto a_mat1 = Buffer(a1);
+    auto b_mat1 = Buffer(b1);
+    auto c_mat1 = Buffer(c1);
+    auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+    auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+    auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+    auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+    auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto x_vec2 = Buffer(x2);
+    auto y_vec2 = Buffer(y2);
+    auto a_mat2 = Buffer(a2);
+    auto b_mat2 = Buffer(b2);
+    auto c_mat2 = Buffer(c2);
+
+    // Runs the two routines: the reference on the first set of buffers, CLBlast on the second
+    auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_);
+    auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_);
+
+    // Tests for equality of the two status codes
+    TestErrorCodes(status1, status2, args);
+  }
+  TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class: explicitly instantiates TestBlas for each of the four data-types
+// so that the member functions defined in this file are generated here.
+template class TestBlas<float>;
+template class TestBlas<double>;
+template class TestBlas<float2>;
+template class TestBlas<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@ -0,0 +1,104 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
+// input combinations, and one deliberately testing with invalid values.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class. The routine-specific behaviour
+// (running the routine and its reference, downloading and indexing the results) is supplied to
+// the constructor as std::function callbacks, so one tester class serves every routine.
+// NOTE(review): std::function is used below while this header itself only includes <vector>,
+// <string> and correctness/tester.h - presumably <functional> arrives via tester.h; confirm.
+template <typename T>
+class TestBlas: public Tester<T> {
+ public:
+
+  // Uses several variables from the Tester class
+  using Tester<T>::context_;
+  using Tester<T>::queue_;
+
+  // Uses several helper functions from the Tester class
+  using Tester<T>::TestStart;
+  using Tester<T>::TestEnd;
+  using Tester<T>::TestSimilarity;
+  using Tester<T>::TestErrorCount;
+  using Tester<T>::TestErrorCodes;
+  using Tester<T>::GetExampleScalars;
+  using Tester<T>::GetOffsets;
+  using Tester<T>::PrecisionSupported;
+
+  // Test settings for the regular test. Append to these lists in case more tests are required.
+  // The constructor sizes the host-side source data from the maxima of the dimension, increment
+  // and offset lists.
+  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
+  const std::vector<size_t> kIncrements = { 1, 2, 7 };
+  const std::vector<size_t> kMatrixDims = { 7, 64 };
+  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+  const std::vector<size_t> kOffsets = GetOffsets();
+  const std::vector<T> kAlphaValues = GetExampleScalars();
+  const std::vector<T> kBetaValues = GetExampleScalars();
+
+  // Test settings for the invalid tests. kMatSizes and kVecSizes are initialised from kBufferSize,
+  // so kBufferSize has to remain declared before them.
+  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
+  const size_t kBufferSize = 64;
+  const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
+  const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
+
+  // The layout/transpose/triangle/side options to test with
+  const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
+  const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
+  const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
+  static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
+
+  // Shorthand for the routine-specific functions passed to the tester:
+  //  - Routine: runs a routine on a set of buffers and returns its status code
+  //  - ResultGet: returns the results of a run as a host vector
+  //  - ResultIndex: maps a pair of loop indices onto a position in that host vector
+  //  - ResultIterator: returns the upper bound of one of the two result loops
+  using Routine = std::function<StatusCode(const Arguments<T>&, const Buffers&, CommandQueue&)>;
+  using ResultGet = std::function<std::vector<T>(const Arguments<T>&, Buffers&, CommandQueue&)>;
+  using ResultIndex = std::function<size_t(const Arguments<T>&, const size_t, const size_t)>;
+  using ResultIterator = std::function<size_t(const Arguments<T>&)>;
+
+  // Constructor, initializes the base class tester and input data
+  TestBlas(int argc, char *argv[], const bool silent,
+           const std::string &name, const std::vector<std::string> &options,
+           const Routine run_routine, const Routine run_reference, const ResultGet get_result,
+           const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
+
+  // The test functions, each taking the list of argument combinations to test and the name of
+  // the test
+  void TestRegular(std::vector<Arguments<T>> &test_vector, const std::string &name);
+  void TestInvalid(std::vector<Arguments<T>> &test_vector, const std::string &name);
+
+ private:
+
+  // Source data to test with (host-side, sized and filled in the constructor)
+  std::vector<T> x_source_;
+  std::vector<T> y_source_;
+  std::vector<T> a_source_;
+  std::vector<T> b_source_;
+  std::vector<T> c_source_;
+  
+  // The routine-specific functions passed to the tester
+  Routine run_routine_;
+  Routine run_reference_;
+  ResultGet get_result_;
+  ResultIndex get_index_;
+  ResultIterator get_id1_;
+  ResultIterator get_id2_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
+#endif
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@ -21,16 +21,6 @@
 namespace clblast {
 // =================================================================================================

-// The layouts and transpose-options to test with (data-type dependent)
-template <typename T>
-const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
-template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
-template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
-template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
-template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
-
-// =================================================================================================
-
 // General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
 // the clBLAS library for reference.
 template <typename T>
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.h
@ -62,10 +62,6 @@ class Tester {
  const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
  const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};

-  // The layouts and transpose-options to test with
-  static const std::vector<Layout> kLayouts;
-  static const std::vector<Transpose> kTransposes;
-
  // This structure combines the above log-entry with a status code an error percentage
  struct ErrorLogEntry {
    StatusCode status_expect;
--- a/test/correctness/testxy.cc
+++ b/test/correctness/testxy.cc
@ -1,176 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the TestXY class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <algorithm>
-
-#include "correctness/testxy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor, initializes the base class tester and input data
-template <typename T>
-TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
-                  const std::string &name, const std::vector<std::string> &options,
-                  const Routine clblast_lambda, const Routine clblas_lambda):
-    Tester<T>{argc, argv, silent, name, options},
-    clblast_lambda_(clblast_lambda),
-    clblas_lambda_(clblas_lambda) {
-
-  // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
-  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
-
-  // Creates test input data
-  x_source_.resize(max_dim*max_inc + max_offset);
-  y_source_.resize(max_dim*max_inc + max_offset);
-  PopulateVector(x_source_);
-  PopulateVector(y_source_);
-}
-
-// ===============================================================================================
-
-// Tests the routine for a wide variety of parameters
-template <typename T>
-void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("regular behaviour", name);
-
-  // Iterates over the vector dimension
-  for (auto &n: kVectorDims) {
-    args.n = n;
-
-    // Iterates over the increment-values and the offsets
-    for (auto &x_inc: kIncrements) {
-      args.x_inc = x_inc;
-      for (auto &x_offset: kOffsets) {
-        args.x_offset = x_offset;
-        for (auto &y_inc: kIncrements) {
-          args.y_inc = y_inc;
-          for (auto &y_offset: kOffsets) {
-            args.y_offset = y_offset;
-
-            // Computes the buffer sizes
-            auto x_size = n * x_inc + x_offset;
-            auto y_size = n * y_inc + y_offset;
-            if (x_size < 1 || y_size < 1) { continue; }
-
-            // Creates the OpenCL buffers
-            auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
-            auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-            auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
-
-            // Iterates over the values for alpha
-            for (auto &alpha: kAlphaValues) {
-              args.alpha = alpha;
-
-              // Runs the reference clBLAS code
-              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-              r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-              auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
-
-              // Runs the CLBlast code
-              x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
-              s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
-              auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
-
-              // Tests for equality of the two status codes
-              if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
-                TestErrorCodes(status1, status2, args);
-                continue;
-              }
-
-              // Downloads the results
-              std::vector<T> r_result(y_size, static_cast<T>(0));
-              std::vector<T> s_result(y_size, static_cast<T>(0));
-              r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
-              s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
-
-              // Checks for differences in the output
-              auto errors = size_t{0};
-              for (auto idn=size_t{0}; idn<n; ++idn) {
-                auto index = idn*y_inc + y_offset;
-                if (!TestSimilarity(r_result[index], s_result[index])) {
-                  errors++;
-                }
-              }
-
-              // Tests the error count (should be zero)
-              TestErrorCount(errors, n, args);
-            }
-          }
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
-// does not test for results (if any).
-template <typename T>
-void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
-  if (!PrecisionSupported()) { return; }
-  TestStart("invalid buffer sizes", name);
-
-  // Sets example test parameters
-  args.n = kBufferSize;
-  args.x_offset = 0;
-  args.y_offset = 0;
-
-  // Iterates over test buffer sizes
-  const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
-  for (auto &x_size: kBufferSizes) {
-    for (auto &y_size: kBufferSizes) {
-
-      // Iterates over test increments
-      for (auto &x_inc: kInvalidIncrements) {
-        args.x_inc = x_inc;
-        for (auto &y_inc: kInvalidIncrements) {
-          args.y_inc = y_inc;
-
-          // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
-          // want to be able to create invalid buffers (no error checking here).
-          auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
-          auto x_vec = Buffer(x);
-          auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-          auto r_vec = Buffer(r);
-          auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
-          auto s_vec = Buffer(s);
-
-          // Runs the two routines
-          auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
-          auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
-
-          // Tests for equality of the two status codes
-          TestErrorCodes(status1, status2, args);
-        }
-      }
-    }
-  }
-  TestEnd();
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class TestXY<float>;
-template class TestXY<double>;
-template class TestXY<float2>;
-template class TestXY<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/test/correctness/testxy.h
+++ b/test/correctness/testxy.h
@ -1,84 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
-// all sorts of input combinations, and one deliberatly testing with invalid values.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-
-#include <vector>
-#include <string>
-
-#include "correctness/tester.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXY: public Tester<T> {
- public:
-
-  // Uses several variables from the Tester class
-  using Tester<T>::context_;
-  using Tester<T>::queue_;
-
-  // Uses several helper functions from the Tester class
-  using Tester<T>::TestStart;
-  using Tester<T>::TestEnd;
-  using Tester<T>::TestSimilarity;
-  using Tester<T>::TestErrorCount;
-  using Tester<T>::TestErrorCodes;
-  using Tester<T>::GetExampleScalars;
-  using Tester<T>::GetOffsets;
-  using Tester<T>::PrecisionSupported;
-
-  // Test settings for the regular test. Append to this list in case more tests are required.
-  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<size_t> kIncrements = { 1, 2, 7 };
-  const std::vector<T> kAlphaValues = GetExampleScalars();
-
-  // Test settings for the invalid test
-  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
-  const size_t kBufferSize = 512;
-
-  // Shorthand for a BLAS routine
-  using Routine = std::function<StatusCode(const Arguments<T>&,
-                                           const Buffer&, const Buffer&,
-                                           CommandQueue&)>;
-
-  // Constructor, initializes the base class tester and input data
-  TestXY(int argc, char *argv[], const bool silent,
-         const std::string &name, const std::vector<std::string> &options,
-         const Routine clblast_lambda, const Routine clblas_lambda);
-
-  // The test functions, taking no inputs
-  void TestRegular(Arguments<T> &args, const std::string &name);
-  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
-
- private:
-
-  // Source data to test with
-  std::vector<T> x_source_;
-  std::vector<T> y_source_;
-  
-  // The routines to test
-  Routine clblast_lambda_;
-  Routine clblas_lambda_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
-#endif
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@ -21,323 +21,36 @@
 namespace clblast {
 // =================================================================================================

-// This is the vector-vector variant of the set-up/tear-down client routine.
+// Constructor
 template <typename T>
-void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A.
-  // Note: this is not relevant for this client but given anyway.
-  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the data sizes
-    auto x_size = args.n*args.x_inc + args.x_offset;
-    auto y_size = args.n*args.y_inc + args.y_offset;
-
-    // Populates input host vectors with random data
-    std::vector<T> x_source(x_size);
-    std::vector<T> y_source(y_size);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-
-    // Creates the vectors on the device
-    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
-    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
-    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
-    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
-
-    // Runs the routine-specific code
-    client_routine(args, x_buffer, y_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.n += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
+Client<T>::Client(const Routine run_routine, const Routine run_reference,
+                  const std::vector<std::string> &options,
+                  const GetMetric get_flops, const GetMetric get_bytes):
+  run_routine_(run_routine),
+  run_reference_(run_reference),
+  options_(options),
+  get_flops_(get_flops),
+  get_bytes_(get_bytes) {
 }

-// Compiles the above function
-template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
-template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
-template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
-template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [](const Arguments<T> args) { return args.n; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the second dimension of the matrix taking the rotation into account
-    auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
-
-    // Computes the vector sizes in case the matrix is transposed
-    auto a_transposed = (args.a_transpose != Transpose::kNo);
-    auto m_real = (a_transposed) ? args.n : args.m;
-    auto n_real = (a_transposed) ? args.m : args.n;
-
-    // Computes the data sizes
-    auto a_size = a_two * args.a_ld + args.a_offset;
-    auto x_size = n_real*args.x_inc + args.x_offset;
-    auto y_size = m_real*args.y_inc + args.y_offset;
-
-    // Populates input host vectors with random data
-    std::vector<T> a_source(a_size);
-    std::vector<T> x_source(x_size);
-    std::vector<T> y_source(y_size);
-    PopulateVector(a_source);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-
-    // Creates the vectors on the device
-    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
-    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
-    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
-    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
-    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
-    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
-
-    // Runs the routine-specific code
-    client_routine(args, a_buffer, x_buffer, y_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.m += args.step;
-    args.n += args.step;
-    args.a_ld += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
-}
-
-// Compiles the above function
-template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
-template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
-template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
-template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// This is the matrix-matrix variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [](const Arguments<T> args) { return args.k; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the data sizes
-    auto a_two = (a_rotated) ? args.n : args.k;
-    auto a_size = a_two * args.a_ld + args.a_offset;
-    auto c_size = args.n * args.c_ld + args.c_offset;
-
-    // Populates input host matrices with random data
-    std::vector<T> a_source(a_size);
-    std::vector<T> c_source(c_size);
-    PopulateVector(a_source);
-    PopulateVector(c_source);
-
-    // Creates the matrices on the device
-    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
-    auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
-    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
-    c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
-
-    // Runs the routine-specific code
-    client_routine(args, a_buffer, c_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.n += args.step;
-    args.k += args.step;
-    args.a_ld += args.step;
-    args.c_ld += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
-}
-
-// Compiles the above function
-template void ClientAC<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
-template void ClientAC<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
-template void ClientAC<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
-template void ClientAC<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
-template <typename T>
-void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options, const bool symmetric) {
-
-  // Function to determine how to find the default value of the leading dimension of matrix A
-  auto default_ld_a = [&symmetric](const Arguments<T> args) { return (symmetric) ? args.n : args.m; };
-
-  // Simple command line argument parser with defaults
-  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
-  if (args.print_help) { return; }
-  if (symmetric) { args.m = args.n; }
-
-  // Prints the header of the output table
-  PrintTableHeader(args.silent, options);
-
-  // Initializes OpenCL and the libraries
-  auto platform = Platform(args.platform_id);
-  auto device = Device(platform, kDeviceType, args.device_id);
-  auto context = Context(device);
-  auto queue = CommandQueue(context, device);
-  if (args.compare_clblas) { clblasSetup(); }
-
-  // Computes whether or not the matrices are transposed. Note that we assume a default of
-  // column-major and no-transpose. If one of them is different (but not both), then rotated
-  // is considered true.
-  auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-  auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                   (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-  auto c_rotated = (args.layout == Layout::kRowMajor);
-
-  // Iterates over all "num_step" values jumping by "step" each time
-  auto s = size_t{0};
-  while(true) {
-
-    // Computes the data sizes
-    auto a_two = (a_rotated) ? args.m : args.k;
-    auto b_two = (b_rotated) ? args.k : args.n;
-    auto c_two = (c_rotated) ? args.m : args.n;
-    auto a_size = a_two * args.a_ld + args.a_offset;
-    auto b_size = b_two * args.b_ld + args.b_offset;
-    auto c_size = c_two * args.c_ld + args.c_offset;
-
-    // Populates input host matrices with random data
-    std::vector<T> a_source(a_size);
-    std::vector<T> b_source(b_size);
-    std::vector<T> c_source(c_size);
-    PopulateVector(a_source);
-    PopulateVector(b_source);
-    PopulateVector(c_source);
-
-    // Creates the matrices on the device
-    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
-    auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
-    auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
-    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
-    b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
-    c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
-
-    // Runs the routine-specific code
-    client_routine(args, a_buffer, b_buffer, c_buffer, queue);
-
-    // Makes the jump to the next step
-    ++s;
-    if (s >= args.num_steps) { break; }
-    args.m += args.step;
-    args.n += args.step;
-    args.k += args.step;
-    args.a_ld += args.step;
-    args.b_ld += args.step;
-    args.c_ld += args.step;
-  }
-
-  // Cleans-up and returns
-  if (args.compare_clblas) { clblasTeardown(); }
-}
-
-// Compiles the above function
-template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&, const bool);
-template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&, const bool);
-template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&, const bool);
-template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&, const bool);
-
 // =================================================================================================

 // Parses all arguments available for the CLBlast client testers. Some arguments might not be
 // applicable, but are searched for anyway to be able to create one common argument parser. All
 // arguments have a default value in case they are not found.
 template <typename T>
-Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
-                            const std::function<size_t(const Arguments<T>)> default_ld_a) {
+Arguments<T> Client<T>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
+                                       const GetMetric default_b_ld, const GetMetric default_c_ld) {
  auto args = Arguments<T>{};
  auto help = std::string{"Options given/available:\n"};

  // These are the options which are not for every client: they are optional
-  for (auto &o: options) {
+  for (auto &o: options_) {

    // Data-sizes
-    if (o == kArgM) { args.m = args.k  = GetArgument(argc, argv, help, kArgM, 512UL); }
-    if (o == kArgN) { args.n           = GetArgument(argc, argv, help, kArgN, 512UL); }
-    if (o == kArgK) { args.k           = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgM) { args.m  = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgN) { args.n  = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgK) { args.k  = GetArgument(argc, argv, help, kArgK, 512UL); }

    // Data-layouts
    if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -353,9 +66,9 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
    if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }

    // Matrix arguments
-    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
-    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
-    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
+    if (o == kArgALeadDim) { args.a_ld     = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
+    if (o == kArgBLeadDim) { args.b_ld     = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
+    if (o == kArgCLeadDim) { args.c_ld     = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
    if (o == kArgAOffset)  { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
    if (o == kArgBOffset)  { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
    if (o == kArgCOffset)  { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
@ -387,16 +100,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin

 // =================================================================================================

+// The main performance tester: times CLBlast (and optionally clBLAS) for "num_steps" problem sizes
+// (at least one), using the routine-specific 'set_sizes' to compute the buffer sizes of each one.
+template <typename T>
+void Client<T>::PerformanceTest(Arguments<T> &args, const SetMetric set_sizes) {
+  // Prints the header of the output table
+  PrintTableHeader(args.silent, options_);
+
+  // Initializes OpenCL and the libraries (clBLAS only when a comparison is requested)
+  auto platform = Platform(args.platform_id);
+  auto device = Device(platform, kDeviceType, args.device_id);
+  auto context = Context(device);
+  auto queue = CommandQueue(context, device);
+  if (args.compare_clblas) { clblasSetup(); }
+
+  // Iterates over all "num_step" values jumping by "step" each time
+  auto s = size_t{0};
+  while(true) {
+    // Sets the buffer sizes (routine-specific)
+    set_sizes(args);
+
+    // Populates the input host vectors and matrices with random data
+    std::vector<T> x_source(args.x_size);
+    std::vector<T> y_source(args.y_size);
+    std::vector<T> a_source(args.a_size);
+    std::vector<T> b_source(args.b_size);
+    std::vector<T> c_source(args.c_size);
+    PopulateVector(x_source);
+    PopulateVector(y_source);
+    PopulateVector(a_source);
+    PopulateVector(b_source);
+    PopulateVector(c_source);
+
+    // Creates the vectors and matrices on the device and uploads the host data
+    auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
+    auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
+    auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
+    auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
+    auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
+    x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
+    y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
+    a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
+    b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
+    c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
+    auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
+
+    // Runs the routines and collects the timings. clBLAS is only timed when it was set up above;
+    // otherwise its time stays 0.0, which PrintTableRow reports as zero GFLOPS and GB/s.
+    auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
+    auto ms_clblas = (args.compare_clblas) ?
+      TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS") : 0.0;
+
+    // Prints the performance of both libraries
+    PrintTableRow(args, ms_clblast, ms_clblas);
+
+    // Makes the jump to the next step
+    if (++s >= args.num_steps) { break; }
+    args.m += args.step;
+    args.n += args.step;
+    args.k += args.step;
+    args.a_ld += args.step;
+    args.b_ld += args.step;
+    args.c_ld += args.step;
+  }
+
+  // Cleans-up and returns
+  if (args.compare_clblas) { clblasTeardown(); }
+}
+
+// =================================================================================================
+
 // Creates a vector of timing results, filled with execution times of the 'main computation'. The
 // timing is performed using the milliseconds chrono functions. The function returns the minimum
 // value found in the vector of timing results. The return value is in milliseconds.
-double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
+template <typename T>
+double Client<T>::TimedExecution(const size_t num_runs, const Arguments<T> &args,
+                                 const Buffers &buffers, CommandQueue &queue,
+                                 Routine run_blas, const std::string &library_name) {
  auto timings = std::vector<double>(num_runs);
  for (auto &timing: timings) {
    auto start_time = std::chrono::steady_clock::now();

    // Executes the main computation
-    main_computation();
+    auto status = run_blas(args, buffers, queue);
+    if (status != StatusCode::kSuccess) {
+      throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
+    }

    // Records and stores the end-time
    auto elapsed_time = std::chrono::steady_clock::now() - start_time;
@ -408,7 +197,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
 // =================================================================================================

 // Prints the header of the performance table
-void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
+template <typename T>
+void Client<T>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
  if (!silent) {
    for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
    fprintf(stdout, " | <--       CLBlast       --> | <--      clBLAS      --> |\n");
@ -419,29 +209,59 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
 }

 // Print a performance-result row
-void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
-                   const bool no_abbrv, const double ms_clblast, const double ms_clblas,
-                   const unsigned long long flops, const unsigned long long bytes) {
+template <typename T>
+void Client<T>::PrintTableRow(const Arguments<T>& args, const double ms_clblast,
+                              const double ms_clblas) {
+
+  // Collects the integer-valued options in the order of 'options_' (at most one branch per option)
+  auto integers = std::vector<size_t>{};
+  for (auto &o: options_) {
+    if      (o == kArgM) {        integers.push_back(args.m); }
+    else if (o == kArgN) {        integers.push_back(args.n); }
+    else if (o == kArgK) {        integers.push_back(args.k); }
+    else if (o == kArgLayout) {   integers.push_back(static_cast<size_t>(args.layout)); }
+    else if (o == kArgSide) {     integers.push_back(static_cast<size_t>(args.side)); }
+    else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
+    else if (o == kArgATransp) {  integers.push_back(static_cast<size_t>(args.a_transpose)); }
+    else if (o == kArgBTransp) {  integers.push_back(static_cast<size_t>(args.b_transpose)); }
+    else if (o == kArgXInc) {     integers.push_back(args.x_inc); }
+    else if (o == kArgYInc) {     integers.push_back(args.y_inc); }
+    else if (o == kArgXOffset) {  integers.push_back(args.x_offset); }
+    else if (o == kArgYOffset) {  integers.push_back(args.y_offset); }
+    else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
+    else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
+    else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
+    else if (o == kArgAOffset) {  integers.push_back(args.a_offset); }
+    else if (o == kArgBOffset) {  integers.push_back(args.b_offset); }
+    else if (o == kArgCOffset) {  integers.push_back(args.c_offset); }
+  }
+  auto strings = std::vector<std::string>{};
+  for (auto &o: options_) {
+    if      (o == kArgAlpha) {    strings.push_back(ToString(args.alpha)); }
+    else if (o == kArgBeta) {     strings.push_back(ToString(args.beta)); }
+  }

  // Computes the GFLOPS and GB/s metrics
+  auto flops = get_flops_(args);
+  auto bytes = get_bytes_(args);
  auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
  auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
  auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
  auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;

  // Outputs the argument values
-  for (auto &argument: args_int) {
-    if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
+  for (auto &argument: integers) {
+    if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
      fprintf(stdout, "%8luM;", argument/(1024*1024));
    }
-    else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
+    else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
      fprintf(stdout, "%8luK;", argument/1024);
    }
    else {
      fprintf(stdout, "%9lu;", argument);
    }
  }
-  for (auto &argument: args_string) {
+  for (auto &argument: strings) {
    fprintf(stdout, "%9s;", argument.c_str());
  }

@ -451,5 +271,13 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
          ms_clblas, gflops_clblas, gbs_clblas);
 }

+// =================================================================================================
+
+// Explicitly instantiates the class for each element type, compiling the definitions above
+template class Client<float>;
+template class Client<double>;
+template class Client<float2>;
+template class Client<double2>;
+
 // =================================================================================================
 } // namespace clblast
--- a/test/performance/client.h
+++ b/test/performance/client.h
@ -7,7 +7,12 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file provides common function declarations to be used with the test clients.
+// This class implements the performance-test client. It is generic for all CLBlast routines by
+// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
+// or how to get the FLOPS count.
+//
+// This file also provides the common interface to the performance client (see the 'RunClient'
+// function for details).
 //
 // =================================================================================================

@ -26,64 +31,71 @@
 namespace clblast {
 // =================================================================================================

-// Types of devices to consider
-const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+// The performance-test client, templated on the element type T of the tested data (see file top)
+template <typename T>
+class Client {
+ public:
+
+  // Types of devices to consider: all OpenCL device types
+  const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+
+  // Shorthands for the routine-specific callbacks: run a routine, update or query the arguments
+  using Routine = std::function<StatusCode(const Arguments<T>&, const Buffers&, CommandQueue&)>;
+  using SetMetric = std::function<void(Arguments<T>&)>;
+  using GetMetric = std::function<size_t(const Arguments<T>&)>;
+
+  // The constructor, storing the CLBlast and reference routines, the option list, and the metrics
+  Client(const Routine run_routine, const Routine run_reference,
+         const std::vector<std::string> &options,
+         const GetMetric get_flops, const GetMetric get_bytes);
+
+  // Parses all command-line arguments, filling in the arguments structure. Arguments not given on
+  // the command-line get a default value; the leading-dimension defaults come from the callbacks.
+  Arguments<T> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
+                              const GetMetric default_b_ld, const GetMetric default_c_ld);
+
+  // The main client function: sets up OpenCL and the input buffers (sized through 'set_sizes'),
+  // then times the routines for every step and prints one row of results per step.
+  void PerformanceTest(Arguments<T> &args, const SetMetric set_sizes);
+
+ private:
+
+  // Runs a routine 'num_runs' times and returns the shortest time in ms (throws if the routine fails)
+  double TimedExecution(const size_t num_runs, const Arguments<T> &args, const Buffers &buffers,
+                        CommandQueue &queue, Routine run_blas, const std::string &library_name);
+
+  // Prints the header of a performance-data table
+  void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
+
+  // Prints a row of performance data: the option values followed by the results of both libraries
+  void PrintTableRow(const Arguments<T>& args, const double ms_clblast, const double ms_clblas);
+
+  // The routine-specific functions and the option list passed to the constructor
+  const Routine run_routine_;
+  const Routine run_reference_;
+  const std::vector<std::string> options_;
+  const GetMetric get_flops_;
+  const GetMetric get_bytes_;
+};

 // =================================================================================================

-// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
-template <typename T>
-using Routine2 = std::function<void(const Arguments<T>&,
-                                    const Buffer&, const Buffer&,
-                                    CommandQueue&)>;
-template <typename T>
-using Routine3 = std::function<void(const Arguments<T>&,
-                                    const Buffer&, const Buffer&, const Buffer&,
-                                    CommandQueue&)>;
+// The interface to the performance client. This is a separate function in the header such that it
+// is automatically compiled for each routine, templated by the parameter "C".
+template <typename C, typename T>
+void RunClient(int argc, char *argv[]) {

-// =================================================================================================
+  // Creates a new client
+  auto client = Client<T>(C::RunRoutine, C::RunReference, C::GetOptions(),
+                          C::GetFlops, C::GetBytes);

-// These are the main client functions, setting-up arguments, matrices, OpenCL buffers, etc. After
-// set-up, they call the client routine, passed as argument to this function.
-template <typename T>
-void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options);
-template <typename T>
-void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options);
-template <typename T>
-void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
-              const std::vector<std::string> &options);
-template <typename T>
-void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
-               const std::vector<std::string> &options, const bool symmetric);
+  // Simple command line argument parser with defaults
+  auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
+  if (args.print_help) { return; }

-// =================================================================================================
-
-// Parses all command-line arguments, filling in the arguments structure. If no command-line
-// argument is given for a particular argument, it is filled in with a default value.
-template <typename T>
-Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
-                            const std::function<size_t(const Arguments<T>)> default_ld_a);
-
-// Retrieves only the precision command-line argument, since the above function is templated based
-// on the precision
-Precision GetPrecision(int argc, char *argv[]);
-
-// =================================================================================================
-
-// Runs a function a given number of times and returns the execution time of the shortest instance
-double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
-
-// =================================================================================================
-
-// Prints the header of a performance-data table
-void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
-
-// Prints a row of performance data, including results of two libraries
-void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
-                   const bool abbreviations, const double ms_clblast, const double ms_clblas,
-                   const unsigned long long flops, const unsigned long long bytes);
+  // Runs the client
+  client.PerformanceTest(args, C::SetSizes);
+}

 // =================================================================================================
 } // namespace clblast
--- a/test/performance/routines/xaxpy.cc
+++ b/test/performance/routines/xaxpy.cc
@ -7,90 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xaxpy command-line interface tester.
+// This file implements the Xaxpy command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXaxpy(const Arguments<T> &args,
-                      const Buffer &x_vec, const Buffer &y_vec,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Axpy(args.n, args.alpha,
-                       x_vec(), args.x_offset, args.x_inc,
-                       y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXaxpy(args.n, args.alpha,
-                              x_vec(), args.x_offset, args.x_inc,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.n;
-  const auto bytes = (3 * args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.n, args.x_inc, args.y_inc,
-                                               args.x_offset, args.y_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xaxpy.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXaxpy(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
-                                          kArgXOffset, kArgYOffset, kArgAlpha};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientXY<float>(argc, argv, PerformanceXaxpy<float>, o); break;
-    case Precision::kDouble: ClientXY<double>(argc, argv, PerformanceXaxpy<double>, o); break;
-    case Precision::kComplexSingle: ClientXY<float2>(argc, argv, PerformanceXaxpy<float2>, o); break;
-    case Precision::kComplexDouble: ClientXY<double2>(argc, argv, PerformanceXaxpy<double2>, o); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXaxpy(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXaxpy<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXaxpy<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXaxpy<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXaxpy<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/performance/routines/xgemm.cc
+++ b/test/performance/routines/xgemm.cc
@ -7,108 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xgemm command-line interface tester.
+// This file implements the Xgemm command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXgemm(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
-                       args.m, args.n, args.k,
-                       args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       b_mat(), args.b_offset, args.b_ld,
-                       args.beta,
-                       c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasTranspose>(args.b_transpose),
-                              args.m, args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n * args.k;
-  const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.a_transpose),
-                                               static_cast<size_t>(args.b_transpose),
-                                               args.a_ld, args.b_ld, args.c_ld,
-                                               args.a_offset, args.b_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xgemm.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXgemm(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
-                                          kArgATransp, kArgBTransp,
-                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgBOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o, false); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o, false); break;
-    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXgemm<float2>, o, false); break;
-    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXgemm<double2>, o, false); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXgemm(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemm<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemm<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemm<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemm<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/performance/routines/xgemv.cc
+++ b/test/performance/routines/xgemv.cc
@ -7,100 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xgemv command-line interface tester.
+// This file implements the Xgemv command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXgemv(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       x_vec(), args.x_offset, args.x_inc, args.beta,
-                       y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.m, args.n, args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              x_vec(), args.x_offset, args.x_inc, args.beta,
-                              y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n;
-  const auto bytes = (args.m*args.n + 2*args.m + args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.a_transpose),
-                                               args.a_ld, args.x_inc, args.y_inc,
-                                               args.a_offset, args.x_offset, args.y_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xgemv.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXgemv(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
-                                          kArgALeadDim, kArgXInc, kArgYInc,
-                                          kArgAOffset, kArgXOffset, kArgYOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientAXY<float>(argc, argv, PerformanceXgemv<float>, o); break;
-    case Precision::kDouble: ClientAXY<double>(argc, argv, PerformanceXgemv<double>, o); break;
-    case Precision::kComplexSingle: ClientAXY<float2>(argc, argv, PerformanceXgemv<float2>, o); break;
-    case Precision::kComplexDouble: ClientAXY<double2>(argc, argv, PerformanceXgemv<double2>, o); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXgemv(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemv<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemv<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemv<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemv<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/performance/routines/xsymm.cc
+++ b/test/performance/routines/xsymm.cc
@ -7,108 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xsymm command-line interface tester.
+// This file implements the Xsymm command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXsymm(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Symm(args.layout, args.side, args.triangle,
-                       args.m, args.n,
-                       args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       b_mat(), args.b_offset, args.b_ld,
-                       args.beta,
-                       c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              b_mat(), args.b_offset, args.b_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.m * args.n * args.m;
-  const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.m, args.n,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.triangle),
-                                               static_cast<size_t>(args.side),
-                                               args.a_ld, args.b_ld, args.c_ld,
-                                               args.a_offset, args.b_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xsymm.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXsymm(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgM, kArgN, kArgLayout,
-                                          kArgTriangle, kArgSide,
-                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgBOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o, false); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o, false); break;
-    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsymm<float2>, o, false); break;
-    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsymm<double2>, o, false); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXsymm(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsymm<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsymm<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsymm<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsymm<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/performance/routines/xsyr2k.cc
+++ b/test/performance/routines/xsyr2k.cc
@ -7,108 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xsyr2k command-line interface tester.
+// This file implements the Xsyr2k command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXsyr2k(const Arguments<T> &args,
-                       const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
-                       CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
-                        args.n, args.k,
-                        args.alpha,
-                        a_mat(), args.a_offset, args.a_ld,
-                        b_mat(), args.b_offset, args.b_ld,
-                        args.beta,
-                        c_mat(), args.c_offset, args.c_ld,
-                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
-                               static_cast<clblasUplo>(args.triangle),
-                               static_cast<clblasTranspose>(args.a_transpose),
-                               args.n, args.k,
-                               args.alpha,
-                               a_mat(), args.a_offset, args.a_ld,
-                               b_mat(), args.b_offset, args.b_ld,
-                               args.beta,
-                               c_mat(), args.c_offset, args.c_ld,
-                               1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = 2 * args.n * args.n * args.k;
-  const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.n, args.k,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.triangle),
-                                               static_cast<size_t>(args.a_transpose),
-                                               args.a_ld, args.b_ld, args.c_ld,
-                                               args.a_offset, args.b_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xsyr2k.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXsyr2k(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgN, kArgK,
-                                          kArgLayout, kArgTriangle, kArgATransp,
-                                          kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgBOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsyr2k<float>, o, true); break;
-    case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsyr2k<double>, o, true); break;
-    case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsyr2k<float2>, o, true); break;
-    case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsyr2k<double2>, o, true); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXsyr2k(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyr2k<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyr2k<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyr2k<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyr2k<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/performance/routines/xsyrk.cc
+++ b/test/performance/routines/xsyrk.cc
@ -7,106 +7,29 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xsyrk command-line interface tester.
+// This file implements the Xsyrk command-line interface performance tester.
 //
 // =================================================================================================

-#include <string>
-#include <vector>
-#include <exception>
-
-#include "wrapper_clblas.h"
 #include "performance/client.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The client, used for performance testing. It contains the function calls to CLBlast and to other
-// libraries to compare against.
-template <typename T>
-void PerformanceXsyrk(const Arguments<T> &args,
-                      const Buffer &a_mat, const Buffer &c_mat,
-                      CommandQueue &queue) {
-
-  // Creates the CLBlast lambda
-  auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Syrk(args.layout, args.triangle, args.a_transpose,
-                       args.n, args.k,
-                       args.alpha,
-                       a_mat(), args.a_offset, args.a_ld,
-                       args.beta,
-                       c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    if (status != StatusCode::kSuccess) {
-      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Creates the clBLAS lambda (for comparison)
-  auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasUplo>(args.triangle),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.n, args.k,
-                              args.alpha,
-                              a_mat(), args.a_offset, args.a_ld,
-                              args.beta,
-                              c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    if (status != CL_SUCCESS) {
-      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
-    }
-  };
-
-  // Runs the routines and collect the timings
-  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
-  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
-
-  // Prints the performance of both libraries
-  const auto flops = args.n * args.n * args.k;
-  const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
-  const auto output_ints = std::vector<size_t>{args.n, args.k,
-                                               static_cast<size_t>(args.layout),
-                                               static_cast<size_t>(args.triangle),
-                                               static_cast<size_t>(args.a_transpose),
-                                               args.a_ld, args.c_ld,
-                                               args.a_offset, args.c_offset};
-  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
-                                                       ToString(args.beta)};
-  PrintTableRow(output_ints, output_strings, args.no_abbrv,
-                ms_clblast, ms_clblas, flops, bytes);
-}
+#include "routines/xsyrk.h"

 // =================================================================================================

-// Main function which calls the common client code with the routine-specific function as argument.
-void ClientXsyrk(int argc, char *argv[]) {
-  const auto o = std::vector<std::string>{kArgN, kArgK,
-                                          kArgLayout, kArgTriangle, kArgATransp,
-                                          kArgALeadDim, kArgCLeadDim,
-                                          kArgAOffset, kArgCOffset,
-                                          kArgAlpha, kArgBeta};
-  switch(GetPrecision(argc, argv)) {
-    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
-    case Precision::kSingle: ClientAC<float>(argc, argv, PerformanceXsyrk<float>, o); break;
-    case Precision::kDouble: ClientAC<double>(argc, argv, PerformanceXsyrk<double>, o); break;
-    case Precision::kComplexSingle: ClientAC<float2>(argc, argv, PerformanceXsyrk<float2>, o); break;
-    case Precision::kComplexDouble: ClientAC<double2>(argc, argv, PerformanceXsyrk<double2>, o); break;
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::ClientXsyrk(argc, argv);
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyrk<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyrk<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyrk<clblast::float2>, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyrk<clblast::double2>, clblast::double2>(argc, argv); break;
+  }
  return 0;
 }

--- a/test/routines/xaxpy.h
+++ b/test/routines/xaxpy.h
@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xaxpy routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_
+#define CLBLAST_TEST_ROUTINES_XAXPY_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xaxpy routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXaxpy {
+ public:
+
+  // Names of the command-line arguments that apply to this routine
+  static std::vector<std::string> GetOptions() {
+    auto options = std::vector<std::string>{kArgN,
+                                            kArgXInc, kArgYInc,
+                                            kArgXOffset, kArgYOffset,
+                                            kArgAlpha};
+    return options;
+  }
+
+  // Number of elements of each vector buffer: 'n' strided elements plus the offset
+  static size_t GetSizeX(const Arguments<T> &args) {
+    const auto strided_length = args.n * args.x_inc;
+    return strided_length + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    const auto strided_length = args.n * args.y_inc;
+    return strided_length + args.y_offset;
+  }
+
+  // Stores the buffer sizes computed above into the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    const auto x_elements = GetSizeX(args);
+    const auto y_elements = GetSizeY(args);
+    args.x_size = x_elements;
+    args.y_size = y_elements;
+  }
+
+  // Default leading dimensions: this routine has no matrices, so each of them is simply 1
+  static size_t DefaultLDA(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t DefaultLDB(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t DefaultLDC(const Arguments<T> &) {
+    return 1;
+  }
+
+  // Launches the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto result = Axpy(args.n, args.alpha,
+                             buffers.x_vec(), args.x_offset, args.x_inc,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             &raw_queue, &done);
+    clWaitForEvents(1, &done);
+    return result;
+  }
+
+  // Launches the clBLAS counterpart (used for correctness/performance comparison) and converts
+  // its return value into a StatusCode
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto result = clblasXaxpy(args.n, args.alpha,
+                                    buffers.x_vec(), args.x_offset, args.x_inc,
+                                    buffers.y_vec(), args.y_offset, args.y_inc,
+                                    1, &raw_queue, 0, nullptr, &done);
+    clWaitForEvents(1, &done);
+    return static_cast<StatusCode>(result);
+  }
+
+  // Reads back the output of the routine, which for this routine is the 'y' vector
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    const auto num_bytes = args.y_size*sizeof(T);
+    auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, num_bytes, host_y);
+    return host_y;
+  }
+
+  // Extent of the result in both index dimensions (the second is unused for this routine) and
+  // the mapping from such an index to a position in the downloaded buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    const auto strided_position = id1*args.y_inc;
+    return strided_position + args.y_offset;
+  }
+
+  // Performance metrics: operation count and amount of data moved
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto flops = 2 * args.n;
+    return flops;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto elements = 3 * args.n;
+    return elements * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPY_H_
+#endif
--- a/test/routines/xgemm.h
+++ b/test/routines/xgemm.h
@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_
+#define CLBLAST_TEST_ROUTINES_XGEMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xgemm routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXgemm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK,
+            kArgLayout, kArgATransp, kArgBTransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements). For each matrix the number of
+  // leading-dimension strides depends on the layout and, for A and B, on the transpose setting.
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.m : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+    auto b_two = (b_rotated) ? args.k : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine. The event is waited upon before the status is
+  // returned.
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+                       args.m, args.n, args.k, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison). The layout
+  // and transpose settings are cast to their clBLAS enumerations and the clBLAS return value is
+  // cast back to a StatusCode.
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              static_cast<clblasTranspose>(args.b_transpose),
+                              args.m, args.n, args.k, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer).
+  // For this routine the result is the C matrix.
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer: id1 runs over 'm', id2 over 'n',
+  // and the layout decides which of the two is multiplied by the leading dimension.
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics. Both depend on all three dimensions: every one
+  // of the m*n outputs takes k multiply-adds, A holds m*k elements, B holds k*n elements, and C
+  // (m*n elements) is counted twice (once read, once written).
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMM_H_
+#endif
--- a/test/routines/xgemv.h
+++ b/test/routines/xgemv.h
@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_
+#define CLBLAST_TEST_ROUTINES_XGEMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xgemv routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXgemv {
+ public:
+
+  // Names of the command-line arguments that apply to this routine
+  static std::vector<std::string> GetOptions() {
+    auto options = std::vector<std::string>{kArgM, kArgN,
+                                            kArgLayout, kArgATransp,
+                                            kArgALeadDim, kArgXInc, kArgYInc,
+                                            kArgAOffset, kArgXOffset, kArgYOffset,
+                                            kArgAlpha, kArgBeta};
+    return options;
+  }
+
+  // Buffer sizes (in elements). The matrix spans 'm' leading-dimension strides when row-major and
+  // 'n' otherwise; the two vectors swap their lengths when the matrix is transposed.
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto num_strides = (args.layout == Layout::kRowMajor) ? args.m : args.n;
+    return num_strides * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeX(const Arguments<T> &args) {
+    const auto x_length = (args.a_transpose == Transpose::kNo) ? args.n : args.m;
+    return x_length * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    const auto y_length = (args.a_transpose == Transpose::kNo) ? args.m : args.n;
+    return y_length * args.y_inc + args.y_offset;
+  }
+
+  // Stores the buffer sizes computed above into the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    const auto a_elements = GetSizeA(args);
+    const auto x_elements = GetSizeX(args);
+    const auto y_elements = GetSizeY(args);
+    args.a_size = a_elements;
+    args.x_size = x_elements;
+    args.y_size = y_elements;
+  }
+
+  // Default leading dimensions: only matrix A exists for this routine, the others are simply 1
+  static size_t DefaultLDA(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t DefaultLDB(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t DefaultLDC(const Arguments<T> &) {
+    return 1;
+  }
+
+  // Launches the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto result = Gemv(args.layout, args.a_transpose,
+                             args.m, args.n, args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             &raw_queue, &done);
+    clWaitForEvents(1, &done);
+    return result;
+  }
+
+  // Launches the clBLAS counterpart (used for correctness/performance comparison) and converts
+  // its return value into a StatusCode
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto a_trans = static_cast<clblasTranspose>(args.a_transpose);
+    const auto result = clblasXgemv(order, a_trans,
+                                    args.m, args.n, args.alpha,
+                                    buffers.a_mat(), args.a_offset, args.a_ld,
+                                    buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                                    buffers.y_vec(), args.y_offset, args.y_inc,
+                                    1, &raw_queue, 0, nullptr, &done);
+    clWaitForEvents(1, &done);
+    return static_cast<StatusCode>(result);
+  }
+
+  // Reads back the output of the routine, which for this routine is the 'y' vector
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    const auto num_bytes = args.y_size*sizeof(T);
+    auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, num_bytes, host_y);
+    return host_y;
+  }
+
+  // Extent of the result in both index dimensions (the second is unused for this routine) and
+  // the mapping from such an index to a position in the downloaded buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return (args.a_transpose == Transpose::kNo) ? args.m : args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    const auto strided_position = id1*args.y_inc;
+    return strided_position + args.y_offset;
+  }
+
+  // Performance metrics: operation count and amount of data moved
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto flops = 2 * args.m * args.n;
+    return flops;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto elements = args.m*args.n + 2*args.m + args.n;
+    return elements * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMV_H_
+#endif
--- a/test/routines/xsymm.h
+++ b/test/routines/xsymm.h
@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsymm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_
+#define CLBLAST_TEST_ROUTINES_XSYMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xsymm routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXsymm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements).
+  // The symmetric matrix A is square: m-by-m when applied from the left and n-by-n when applied
+  // from the right. It therefore spans the same number of leading-dimension strides in either
+  // layout.
+  static size_t GetSizeA(const Arguments<T> &args) {
+    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
+    return k_value * args.a_ld + args.a_offset;
+  }
+  // B has the same m-by-n shape as C, independent of the side: only the layout decides how many
+  // leading-dimension strides it spans.
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kRowMajor);
+    auto b_two = (b_rotated) ? args.m : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine. The event is waited upon before the status is
+  // returned.
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Symm(args.layout, args.side, args.triangle,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison). The layout,
+  // side and triangle settings are cast to their clBLAS enumerations and the clBLAS return value
+  // is cast back to a StatusCode.
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasSide>(args.side),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer).
+  // For this routine the result is the C matrix.
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer: id1 runs over 'm', id2 over 'n',
+  // and the layout decides which of the two is multiplied by the leading dimension.
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics.
+  // NOTE(review): both formulas use 'm' as the dimension of A, which matches the left-side case
+  // only - confirm whether the right-side case (A is n-by-n) should be reported differently.
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.m;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYMM_H_
+#endif
--- a/test/routines/xsyr2k.h
+++ b/test/routines/xsyr2k.h
@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyr2k routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#define CLBLAST_TEST_ROUTINES_XSYR2K_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xsyr2k routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXsyr2k {
+ public:
+
+  // The list of arguments relevant for this routine. Note that there is a single transpose option
+  // only: it applies to both the A and the B matrix.
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements)
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  // B is shaped exactly like A. The routine receives 'a_transpose' only (see RunRoutine and
+  // RunReference below), so that setting - and not 'b_transpose', which is not an option of this
+  // routine - determines the size of B.
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto b_two = (b_rotated) ? args.n : args.k;
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine. The event is waited upon before the status is
+  // returned.
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k, args.alpha,
+                        buffers.a_mat(), args.a_offset, args.a_ld,
+                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison). The layout,
+  // triangle and transpose settings are cast to their clBLAS enumerations and the clBLAS return
+  // value is cast back to a StatusCode.
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k, args.alpha,
+                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                               buffers.c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer).
+  // For this routine the result is the C matrix.
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer. Both indices run over 'n' and the
+  // same formula is used for either layout.
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics.
+  // NOTE(review): the byte count includes a single n*k operand although both A and B are passed
+  // to the routine - confirm whether that is intended.
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#endif
--- a/test/routines/xsyrk.h
+++ b/test/routines/xsyrk.h
@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyrk routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_
+#define CLBLAST_TEST_ROUTINES_XSYRK_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static-only description of the Xsyrk routine; the comment at the top of this file explains how
+// such descriptions are used.
+template <typename T>
+class TestXsyrk {
+ public:
+
+  // Names of the command-line arguments that apply to this routine
+  static std::vector<std::string> GetOptions() {
+    auto options = std::vector<std::string>{kArgN, kArgK,
+                                            kArgLayout, kArgTriangle, kArgATransp,
+                                            kArgALeadDim, kArgCLeadDim,
+                                            kArgAOffset, kArgCOffset,
+                                            kArgAlpha, kArgBeta};
+    return options;
+  }
+
+  // Buffer sizes (in elements). Matrix A spans 'n' leading-dimension strides when it is either
+  // column-major and transposed or row-major and not transposed, and 'k' strides otherwise.
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto col_major_transposed = (args.layout == Layout::kColMajor) &&
+                                      (args.a_transpose != Transpose::kNo);
+    const auto row_major_plain = (args.layout == Layout::kRowMajor) &&
+                                 (args.a_transpose == Transpose::kNo);
+    const auto num_strides = (col_major_transposed || row_major_plain) ? args.n : args.k;
+    return num_strides * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    const auto strided_size = args.n * args.c_ld;
+    return strided_size + args.c_offset;
+  }
+
+  // Stores the buffer sizes computed above into the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    const auto a_elements = GetSizeA(args);
+    const auto c_elements = GetSizeC(args);
+    args.a_size = a_elements;
+    args.c_size = c_elements;
+  }
+
+  // Default leading dimensions: there is no B matrix for this routine, so its value is simply 1
+  static size_t DefaultLDA(const Arguments<T> &args) {
+    return args.k;
+  }
+  static size_t DefaultLDB(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t DefaultLDC(const Arguments<T> &args) {
+    return args.n;
+  }
+
+  // Launches the CLBlast routine and waits on its event before returning the status
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto result = Syrk(args.layout, args.triangle, args.a_transpose,
+                             args.n, args.k, args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                             buffers.c_mat(), args.c_offset, args.c_ld,
+                             &raw_queue, &done);
+    clWaitForEvents(1, &done);
+    return result;
+  }
+
+  // Launches the clBLAS counterpart (used for correctness/performance comparison) and converts
+  // its return value into a StatusCode
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto raw_queue = queue();
+    cl_event done{};
+    const auto order = static_cast<clblasOrder>(args.layout);
+    const auto uplo = static_cast<clblasUplo>(args.triangle);
+    const auto a_trans = static_cast<clblasTranspose>(args.a_transpose);
+    const auto result = clblasXsyrk(order, uplo, a_trans,
+                                    args.n, args.k, args.alpha,
+                                    buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                                    buffers.c_mat(), args.c_offset, args.c_ld,
+                                    1, &raw_queue, 0, nullptr, &done);
+    clWaitForEvents(1, &done);
+    return static_cast<StatusCode>(result);
+  }
+
+  // Reads back the output of the routine, which for this routine is the C matrix
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    const auto num_bytes = args.c_size*sizeof(T);
+    auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, num_bytes, host_c);
+    return host_c;
+  }
+
+  // Extent of the result in both index dimensions and the mapping from such an index pair to a
+  // position in the downloaded buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    const auto strided_position = id1*args.c_ld;
+    return strided_position + id2 + args.c_offset;
+  }
+
+  // Performance metrics: operation count and amount of data moved
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto flops = args.n * args.n * args.k;
+    return flops;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto elements = args.n*args.k + args.n*args.n;
+    return elements * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYRK_H_
+#endif
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@ -1,6 +1,6 @@

 // =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //