diff --git a/.travis.yml b/.travis.yml index 0465afa4..6a47bbd7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,14 +2,6 @@ language: cpp sudo: required dist: trusty -os: - - linux - - osx - -compiler: - - gcc - - clang - addons: apt: sources: @@ -19,6 +11,14 @@ addons: - cmake - ocl-icd-opencl-dev +matrix: + include: + - os: linux + compiler: gcc + - os: linux + compiler: clang + - os: osx + env: global: - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast diff --git a/CHANGELOG b/CHANGELOG index 32a05b00..686bd235 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,25 @@ +Version 0.11.0 +- Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx') +- Fixed a bug having to re-create the binary even if it was in the cache +- Fixed a bug when using offsets in the direct version of the GEMM kernels +- Fixed a missing cl_khr_fp64 when running double-precision on Intel CPUs +- Fixed tests on Apple's CPU OpenCL implementation; still not fast but correct at least +- Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM +- Tests now also exit with an error code when OpenCL errors or compilation errors occur +- Tests now also check for the L2 error in case of half-precision +- Clients can now test against cuBLAS on NVIDIA systems for performance comparisons (-DCUBLAS=ON) +- Replaced the R graph scripts with Python/Matplotlib scripts +- Various minor fixes and enhancements +- Added tuned parameters for various devices (see README) +- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters +- Added triangular solver (level-2 & level-3) routines: + * STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized) + * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized) +- Added batched (not part of the BLAS standard) routines: + * SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY) + * SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED 
(batched version of GEMM) + Version 0.10.0 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header - Changed the enums in the C API to avoid potential name clashes with external code diff --git a/CMakeLists.txt b/CMakeLists.txt index edb03dbf..c602c5e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) -set(clblast_VERSION_MINOR 10) +set(clblast_VERSION_MINOR 11) set(clblast_VERSION_PATCH 0) # Options and their default values @@ -28,6 +28,7 @@ option(TUNERS "Enable compilation of the tuners" OFF) option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF) option(TESTS "Enable compilation of the correctness tests" OFF) option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) +option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) # Compile in verbose mode with additional diagnostic messages option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) @@ -129,11 +130,14 @@ if(TUNERS) endif() endif() -# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake" -# and "FindCBLAS.cmake" are included. +# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake", +# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included. 
if(CLIENTS OR TESTS) find_package(clBLAS) find_package(CBLAS) + if(CUBLAS) + find_package(cuBLAS) + endif() if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND) if(TESTS) message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests") @@ -156,10 +160,10 @@ if(NETLIB) set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) -set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv +set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) -set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) -set(LEVELX_ROUTINES xomatcopy) +set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) +set(LEVELX_ROUTINES xomatcopy xaxpybatched xgemmbatched) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) @@ -175,6 +179,7 @@ set(SOURCES src/clblast.cpp src/clblast_c.cpp src/routine.cpp + src/routines/levelx/xinvert.cpp # only source, don't include it as a test ) if(NETLIB) set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) @@ -241,7 +246,7 @@ endif() if(SAMPLES) # Downloads the cl.hpp file from Khronos - file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) @@ -319,13 +324,22 @@ if(CLIENTS OR TESTS) add_definitions(" -DCLBLAST_REF_CBLAS") endif() endif() + if(CUBLAS_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUBLAS_LIBRARIES}) + if(MSVC) + add_definitions(" /DCLBLAST_REF_CUBLAS") + else() + add_definitions(" -DCLBLAST_REF_CUBLAS") + endif() + endif() 
endif() # ================================================================================================== # Section for the performance tests (i.e. the client). These compare against optionally a reference -# library, either clBLAS or a CPU BLAS. +# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS. if(CLIENTS) # Visual Studio requires the sources of non-exported objects/libraries @@ -371,7 +385,7 @@ endif() # ================================================================================================== # Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a -# CPU BLAS library to act as a reference. +# CPU BLAS library, and/or cuBLAS to act as a reference. if(TESTS) enable_testing() @@ -414,6 +428,18 @@ if(TESTS) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE}) endforeach() + # Miscellaneous tests + set(MISC_TESTS override_parameters) + foreach(MISC_TEST ${MISC_TESTS}) + add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON} + test/correctness/misc/${MISC_TEST}.cpp) + target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_test_${MISC_TEST} PUBLIC + $ + ${clblast_SOURCE_DIR} ${REF_INCLUDES}) + add_test(clblast_test_${MISC_TEST} clblast_test_${MISC_TEST}) + endforeach() + # Adds 'alltests' target: runs all tests set(ALLTESTS ) set(ALLTESTSDEPENDS ) diff --git a/README.md b/README.md index d550122f..835f5eea 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Use CLBlast instead of clBLAS: * When you want to be able to inspect the BLAS kernels or easily customize them to your needs. * When you run on exotic OpenCL devices for which you need to tune yourself. * When you are still running on OpenCL 1.1 hardware. +* When you prefer a C++ API over a C API (C API also available in CLBlast). * When you value an organized and modern C++ codebase. 
* When you target Intel CPUs and GPUs or embedded devices * When you can benefit from the increased performance of half-precision fp16 data-types. @@ -90,21 +91,23 @@ Or alternatively the plain C version: #include -Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows: +Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). The API is kept as close as possible to the Netlib BLAS and the cuBLAS/clBLAS APIs. + +To get started quickly, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows: cmake -DSAMPLES=ON .. -Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. - There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severly. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. 
This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake: #include +For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. + Using the tuners (optional) ------------- -The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values: +The CLBlast library is already tuned for the most commonly used OpenCL devices and it's gradually being extended to other devices as well. For unseen devices CLBlast will make use of common-best tuning values for similar devices (e.g. AMD GPUs), so performance might still be decent. The current release of CLBlast is tuned for devices with the following `CL_DEVICE_NAME` values: * NVIDIA GPUs: - GRID K520 @@ -115,18 +118,23 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - GeForce GTX 750 Ti - GeForce GTX 980 - GeForce GTX 1070 + - GeForce GTX 1080 - GeForce GTX TITAN - GeForce GTX TITAN Black - GeForce GTX TITAN X + - TITAN X (Pascal) - Tesla K20m - Tesla K40m * AMD GPUs: - AMD Radeon R9 M370X Compute Engine + - ATI Radeon HD 6750M + - Ellesmere - Hawaii - Oland - Pitcairn - Tahiti - Tonga + - Turks * Intel GPUs: - HD Graphics 530 - HD Graphics 5500 BroadWell U-Processor GT2 @@ -137,7 +145,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - Iris Pro * Intel CPUs: - Core i5-6200U + - Core i7-2670QM - Core i7-3770K + - Core i7-4790K - Core i7-5930K * Other devices: - ARM Mali-T628 GPU @@ -151,7 +161,7 @@ Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https:/ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. 
These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. -The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). +The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder): @@ -163,6 +173,8 @@ In summary, tuning the entire library for your device can be done as follows (st python ../scripts/database/database.py . .. make +Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). 
To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details. + Compiling the correctness tests (optional) ------------- @@ -187,15 +199,15 @@ All tests can be run directly together in one go through the `make alltests` tar Compiling the performance tests/clients (optional) ------------- -To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS) or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows: +To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` set), or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows: cmake -DCLIENTS=ON .. The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against optionally clBLAS and/or a CPU BLAS library. You can use the command-line options `-clblas 1` or `-cblas 1` to select a library to test against. -The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. 
Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `scripts/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0 from the `build` subdirectory: +The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable (shipped with clBLAS). Finally, run the Python/Matplotlib graph-script found in `scripts/benchmark/benchmark.py`. For example, to generate the SGEMM PDF on device 1 of platform 0 from the `build` subdirectory: - Rscript ../scripts/graphs/xgemm.r 0 1 + python ../scripts/benchmark/benchmark.py --platform 0 --device 1 --benchmark gemm Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device. 
@@ -250,6 +262,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xSPR | ✔ | ✔ | - | - | ✔ | | xSYR2 | ✔ | ✔ | - | - | ✔ | | xSPR2 | ✔ | ✔ | - | - | ✔ | +| xTRSV | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) | Level-3 | S | D | C | Z | H | | ---------|---|---|---|---|---| @@ -261,6 +274,14 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ | | xHER2K | - | - | ✔ | ✔ | - | | xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | +| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) + +Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance: + +| Batched | S | D | C | Z | H | +| -------------|---|---|---|---|---| +| xAXPYBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | +| xGEMMBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care: @@ -271,7 +292,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif | IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | | xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | -Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM. +Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV. Half precision (fp16) diff --git a/cmake/Modules/FindcuBLAS.cmake b/cmake/Modules/FindcuBLAS.cmake new file mode 100644 index 00000000..e470289b --- /dev/null +++ b/cmake/Modules/FindcuBLAS.cmake @@ -0,0 +1,82 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +# width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# ================================================================================================== +# +# Defines the following variables: +# CUBLAS_FOUND Boolean holding whether or not the cuBLAS library was found +# CUBLAS_INCLUDE_DIRS The CUDA and cuBLAS include directory +# CUDA_LIBRARIES The CUDA library +# CUBLAS_LIBRARIES The cuBLAS library +# +# In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to +# the root of cuBLAS, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include. This can either be +# done using an environmental variable (e.g. export CUDA_ROOT=/path/to/cuBLAS) or using a CMake +# variable (e.g. cmake -DCUDA_ROOT=/path/to/cuBLAS ..). +# +# ================================================================================================== + +# Sets the possible install locations +set(CUBLAS_HINTS + ${CUDA_ROOT} + $ENV{CUDA_ROOT} + $ENV{CUDA_TOOLKIT_ROOT_DIR} +) +set(CUBLAS_PATHS + /usr + /usr/local + /usr/local/cuda +) + +# Finds the include directories +find_path(CUBLAS_INCLUDE_DIRS + NAMES cublas_v2.h cuda.h + HINTS ${CUBLAS_HINTS} + PATH_SUFFIXES include inc include/x86_64 include/x64 + PATHS ${CUBLAS_PATHS} + DOC "cuBLAS include header cublas_v2.h" +) +mark_as_advanced(CUBLAS_INCLUDE_DIRS) + +# Finds the libraries +find_library(CUDA_LIBRARIES + NAMES cudart + HINTS ${CUBLAS_HINTS} + PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import + PATHS ${CUBLAS_PATHS} + DOC "CUDA library" +) +mark_as_advanced(CUDA_LIBRARIES) +find_library(CUBLAS_LIBRARIES + NAMES cublas + HINTS ${CUBLAS_HINTS} + PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import + PATHS ${CUBLAS_PATHS} + DOC "cuBLAS library" +) +mark_as_advanced(CUBLAS_LIBRARIES) + +# 
================================================================================================== + +# Notification messages +if(NOT CUBLAS_INCLUDE_DIRS) + message(STATUS "Could NOT find 'cublas_v2.h', install CUDA/cuBLAS or set CUDA_ROOT") +endif() +if(NOT CUDA_LIBRARIES) + message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT") +endif() +if(NOT CUBLAS_LIBRARIES) + message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT") +endif() + +# Determines whether or not cuBLAS was found +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES) + +# ================================================================================================== diff --git a/doc/clblast.md b/doc/clblast.md index 37b99f3d..6ff5f7d0 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -1445,6 +1445,63 @@ Arguments to TPMV: + + +xTRSV: Solves a triangular system of equations +------------- + + + +C++ API: +``` +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRSV: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t n`: Integer size argument. This value must be positive. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. 
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + xGER: General rank-1 matrix update ------------- @@ -2708,6 +2765,71 @@ Requirements for TRMM: +xTRSM: Solves a triangular system of equations +------------- + +Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_. + +C++ API: +``` +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) 
+CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRSM: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142). +* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t m`: Integer size argument. This value must be positive. +* `const size_t n`: Integer size argument. This value must be positive. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. 
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix. +* `const size_t b_offset`: The offset in elements from the start of the output B matrix. +* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function) ------------- @@ -2781,3 +2903,231 @@ Requirements for OMATCOPY: +xAXPYBATCHED: Batched version of AXPY +------------- + +As AXPY, but multiple operations are batched together for better performance. + +C++ API: +``` +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t 
*x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to AXPYBATCHED: + +* `const size_t n`: Integer size argument. This value must be positive. +* `const T *alphas`: Input scalar constants. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. +* `const size_t batch_count`: Number of batches. This value must be positive. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xGEMMBATCHED: Batched version of GEMM +------------- + +As GEMM, but multiple operations are batched together for better performance. 
+ +C++ API: +``` +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const float *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const double *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_float2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + 
cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_double2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_half *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GEMMBATCHED: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t m`: Integer size argument. This value must be positive. +* `const size_t n`: Integer size argument. This value must be positive. +* `const size_t k`: Integer size argument. This value must be positive. +* `const T *alphas`: Input scalar constants. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. 
+* `const size_t *a_offsets`: The offsets in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. +* `const size_t *b_offsets`: The offsets in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0. +* `const T *betas`: Input scalar constants. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t *c_offsets`: The offsets in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0. +* `const size_t batch_count`: Number of batches. This value must be positive. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + +Requirements for GEMMBATCHED: + +* When `transpose_a == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`. +* When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`. +* The value of `c_ld` must be at least `m`. + + + +ClearCache: Resets the cache of compiled binaries (auxiliary function) +------------- + +CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory or it can be useful in case of debugging. 
+ +C++ API: +``` +StatusCode ClearCache() +``` + +C API: +``` +CLBlastStatusCode CLBlastClearCache() +``` + + + +FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function) +------------- + +CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only. + +C++ API: +``` +StatusCode FillCache(const cl_device_id device) +``` + +C API: +``` +CLBlastStatusCode CLBlastFillCache(const cl_device_id device) +``` + +Arguments to FillCache: + +* `const cl_device_id device`: The OpenCL device to fill the cache for. + + + +OverrideParameters: Override tuning parameters (auxiliary function) +------------- + +This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel. 
+ +C++ API: +``` +StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map ¶meters) +``` + +C API: +``` +CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values) +``` + +Arguments to OverrideParameters (C++ version): + +* `const cl_device_id device`: The OpenCL device to set the new parameters for. +* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xgemv, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code. +* `const Precision precision`: The CLBlast precision enum to set the new parameters for. +* `const std::unordered_map ¶meters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code. 
diff --git a/include/clblast.h b/include/clblast.h index 7b2021d8..54944ea2 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -17,6 +17,8 @@ #define CLBLAST_CLBLAST_H_ #include // For size_t +#include // For OverrideParameters function +#include // For OverrideParameters function // Includes the normal OpenCL C header #if defined(__APPLE__) || defined(__MACOSX) @@ -95,6 +97,9 @@ enum class StatusCode { kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device @@ -114,7 +119,7 @@ enum class Side { kLeft = 141, kRight = 142 }; // Precision scoped enum (values in bits) enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, - kComplexSingle = 3232, kComplexDouble = 6464 }; + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; // ================================================================================================= // BLAS level-1 (vector-vector) routines @@ -583,7 +588,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -605,6 
+610,27 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on @@ -617,6 +643,14 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. 
+StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map ¶meters); + +// ================================================================================================= + } // namespace clblast // CLBLAST_CLBLAST_H_ diff --git a/include/clblast_c.h b/include/clblast_c.h index 72f50d83..b0ef5f34 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -96,6 +96,9 @@ typedef enum CLBlastStatusCode_ { CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive + CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device @@ -117,6 +120,11 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +// Precision enum (values in bits) +typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, + CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, + CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -1258,7 +1266,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl cl_mem 
b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, @@ -1283,12 +1291,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1326,6 +1328,85 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, + const double 
*alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const float *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double *alphas, + const cl_mem a_buffer, const size_t 
*a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const double *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_float2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_double2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_half *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // CLBlast stores binaries of compiled kernels into a 
cache in case the same kernel is used later on @@ -1338,6 +1419,14 @@ CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); // ================================================================================================= +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values); + +// ================================================================================================= + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/clblast_half.h b/include/clblast_half.h index 05d96f9f..3d77fdd9 100644 --- a/include/clblast_half.h +++ b/include/clblast_half.h @@ -32,9 +32,8 @@ // ================================================================================================= -// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type, -// which is a typedef for unsigned short. -typedef cl_half half; +// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL +// type, which is a typedef for unsigned short. // 32-bit union for conversions typedef union ConversionBits_ { @@ -47,7 +46,7 @@ typedef union ConversionBits_ { // Converts a IEEE-compliant single-precision value to half-precision floating-point. This function // applies simple truncation (round toward zero, but with overflows set to infinity) as rounding // mode. 
-inline half FloatToHalf(const float value) { +inline cl_half FloatToHalf(const float value) { static const unsigned short base_table[512] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -108,7 +107,7 @@ inline half FloatToHalf(const float value) { } // Converts a half-precision value to IEEE-compliant single-precision floating-point -inline float HalfToFloat(const half value) { +inline float HalfToFloat(const cl_half value) { static const unsigned int mantissa_table[2048] = { 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index b5577cfa..384fab20 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const void* a, const int a_ld, void* b, const int b_ld); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, diff --git a/samples/cache.c b/samples/cache.c index 40f2163f..980c7cf3 100644 --- a/samples/cache.c +++ b/samples/cache.c @@ -20,6 +20,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable 
deprecation warnings + // Includes the CLBlast library (C interface) #include diff --git a/samples/dgemv.c b/samples/dgemv.c index dc2fe7db..975cb7ac 100644 --- a/samples/dgemv.c +++ b/samples/dgemv.c @@ -19,6 +19,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + // Includes the CLBlast library (C interface) #include diff --git a/samples/haxpy.c b/samples/haxpy.c index 8e0833f8..4f2bb400 100644 --- a/samples/haxpy.c +++ b/samples/haxpy.c @@ -18,6 +18,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + // Includes the CLBlast library (C interface) #include diff --git a/samples/sasum.c b/samples/sasum.c index c285dd14..78377336 100644 --- a/samples/sasum.c +++ b/samples/sasum.c @@ -19,6 +19,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + // Includes the CLBlast library (C interface) #include diff --git a/samples/sgemm.c b/samples/sgemm.c index 132dad81..92f3057d 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -19,6 +19,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + // Includes the CLBlast library (C interface) #include diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp index 401ecff8..b960865b 100644 --- a/samples/sgemm.cpp +++ b/samples/sgemm.cpp @@ -20,6 +20,9 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + // Includes the C++ OpenCL API. If not yet available, it can be found here: // https://www.khronos.org/registry/cl/api/1.1/cl.hpp #include "cl.hpp" @@ -103,7 +106,7 @@ int main() { auto time_ms = std::chrono::duration(elapsed_time).count(); // Example completed. See "clblast.h" for status codes (0 -> success). 
- printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status); + printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast(status)); return 0; } diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py new file mode 100644 index 00000000..a5f216c6 --- /dev/null +++ b/scripts/benchmark/benchmark.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import argparse +import json +import os +import sys + +import settings +import plot +import utils + +EXPERIMENTS = { + "axpy": settings.AXPY, + "axpybatched": settings.AXPYBATCHED, + "gemv": settings.GEMV, + "gemm": settings.GEMM, + "gemm_small": settings.GEMM_SMALL, + "gemmbatched": settings.GEMMBATCHED, + "symm": settings.SYMM, + "syrk": settings.SYRK, + "summary": settings.SUMMARY, +} + + +def run_benchmark(name, arguments_list, precision, num_runs, platform, device): + binary = "./clblast_client_x" + name + + # Loops over sub-benchmarks per benchmark + results = [] + for arguments in arguments_list: + + # Sets the arguments + constant_arguments = ["-warm_up", "-q", "-no_abbrv", "-cblas 0", "-cublas 0"] + common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs] + opencl_arguments = ["-platform %d" % platform, "-device %d" % device] + all_arguments = opencl_arguments + common_arguments + constant_arguments + for name, value in arguments.items(): + all_arguments.append("-" + name + " " + str(value)) + + # Calls the binary and parses the results + benchmark_output = utils.run_binary(binary, all_arguments) + result = utils.parse_results(benchmark_output) + + # For half-precision: also runs single-precision for comparison + if precision == 16: + all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments] + 
benchmark_output = utils.run_binary(binary, all_arguments) + result_extra = utils.parse_results(benchmark_output) + for index in range(len(min(result, result_extra))): + result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"] + result[index]["GBs_2"] = result_extra[index]["GBs_2"] + result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"] + result[index]["GFLOPS_2"] = result_extra[index]["GFLOPS_2"] + + results.extend(result) + return results + + +def parse_arguments(argv): + parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device") + parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys())) + parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on") + parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on") + parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging") + parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464)") + parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loads previously stored benchmark results from disk instead of re-running") + parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name") + parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for use in papers or presentations") + parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)") + parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) + return vars(cl_args) + + +def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk, + 
plot_title, tight_plot, output_folder, verbose): + + # Sanity check + if not os.path.isdir(output_folder): + print("[benchmark] Error: folder '%s' doesn't exist" % output_folder) + return + + # The benchmark name and plot title + benchmark_name = utils.precision_to_letter(precision) + benchmark.upper() + if benchmark.upper() != "SUMMARY": + plot_title = benchmark_name if plot_title is "" else benchmark_name + ": " + plot_title + + # Retrieves the benchmark settings + if benchmark not in EXPERIMENTS.keys(): + print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys())) + return + experiment = EXPERIMENTS[benchmark] + benchmarks = experiment["benchmarks"] + + # Either run the benchmarks for this experiment or load old results from disk + json_file_name = os.path.join(output_folder, benchmark_name.lower() + "_benchmarks.json") + if load_from_disk and os.path.isfile(json_file_name): + print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'") + with open(json_file_name) as f: + results = json.load(f) + else: + + # Runs all the individual benchmarks + print("[benchmark] Running on platform %d, device %d" % (platform, device)) + print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark)) + results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"], + "num_cols": experiment["num_cols"], "benchmarks": []} + for bench in benchmarks: + num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs + print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"])) + result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark, + platform, device) + results["benchmarks"].append(result) + + # Stores the results to disk + print("[benchmark] Saving benchmark results to '" + json_file_name + "'") + with open(json_file_name, "wb") as f: + json.dump(results, f, sort_keys=True, indent=4) + + # Retrieves the data 
from the benchmark settings + file_name_suffix = "_tight" if tight_plot else "" + pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf") + titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks] + x_keys = [b["x_keys"] for b in benchmarks] + y_keys = [b["y_keys"] for b in benchmarks] + x_labels = [b["x_label"] for b in benchmarks] + y_labels = [b["y_label"] for b in benchmarks] + label_names = results["label_names"] + + # For half-precision: also adds single-precision results for comparison + if precision == 16: + label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"] + y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys] + + # Plots the graphs + plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"], + x_keys, y_keys, titles, x_labels, y_labels, + label_names, plot_title, tight_plot, verbose) + + print("[benchmark] All done") + + +if __name__ == '__main__': + parsed_arguments = parse_arguments(sys.argv[1:]) + benchmark_single(**parsed_arguments) diff --git a/scripts/benchmark/benchmark_all.py b/scripts/benchmark/benchmark_all.py new file mode 100644 index 00000000..9bf09190 --- /dev/null +++ b/scripts/benchmark/benchmark_all.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
#
# Author(s):
#   Cedric Nugteren

import argparse
import os
import sys


# The (main) benchmarks to run, in order; each produces a tight and a regular plot
BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"]


def parse_arguments(argv):
    """Parses the command-line arguments for running all main benchmarks on one device.

    Fixes two help-text defects from the original: the --load_from_disk help was a
    copy-paste of the --verbose text ("Increase verbosity of the script"), and the
    --precision help had an unclosed parenthesis.

    Args:
        argv: the argument list (e.g. sys.argv[1:]).
    Returns:
        A dict of option name -> value, suitable for benchmark_all(**result).
    """
    parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device")
    parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
    parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
    parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464)")
    parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loads previously-saved benchmark results from disk instead of re-running the benchmarks")
    parser.add_argument("-t", "--plot_title", default=None, help="The title for the plots, defaults to benchmark name")
    parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
    cl_args = parser.parse_args(argv)
    return vars(cl_args)


def benchmark_all(platform, device, precision, load_from_disk,
                  plot_title, output_folder, verbose):
    """Runs every benchmark in BENCHMARKS twice: once as a tight plot and once as a
    regular plot. Only the first of the two runs may execute the client binaries;
    the second always re-loads the just-saved results from disk."""
    # Imported lazily so that parse_arguments() stays importable/testable even when
    # benchmark.py's own dependencies (e.g. matplotlib via plot.py) are unavailable.
    from benchmark import benchmark_single

    for bench in BENCHMARKS:
        from_disk = load_from_disk
        for tight_plot in [True, False]:  # two plots for a single benchmark
            benchmark_single(bench, platform, device, None, precision, from_disk,
                             plot_title, tight_plot, output_folder, verbose)
            from_disk = True  # for the next plot of the same data


if __name__ == '__main__':
    parsed_arguments = parse_arguments(sys.argv[1:])
    benchmark_all(**parsed_arguments)
The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import utils + +import matplotlib +matplotlib.use('Agg') +from matplotlib import rcParams +import matplotlib.pyplot as plt + +# Colors +BLUEISH = [c / 255.0 for c in [71, 101, 177]] # #4765b1 +REDISH = [c / 255.0 for c in [214, 117, 104]] # #d67568 +PURPLISH = [c / 255.0 for c in [85, 0, 119]] # #550077 +COLORS = [BLUEISH, REDISH, PURPLISH] +MARKERS = ["o-", "x-", ".-"] + + +def plot_graphs(results, file_name, num_rows, num_cols, + x_keys, y_keys, titles, x_labels, y_labels, + label_names, title, tight_plot, verbose): + assert len(results) == num_rows * num_cols + assert len(results) != 1 + assert len(x_keys) == len(results) + assert len(y_keys) == len(results) + assert len(titles) == len(results) + assert len(x_labels) == len(results) + assert len(y_labels) == len(results) + + # Tight plot (for in a paper or presentation) or regular (for display on a screen) + if tight_plot: + plot_size = 5 + w_space = 0.20 + h_space = 0.39 + title_from_top = 0.11 + legend_from_top = 0.17 + legend_from_top_per_item = 0.04 + x_label_from_bottom = 0.09 + legend_spacing = 0.0 + font_size = 15 + font_size_legend = 13 + font_size_title = font_size + bounding_box = "tight" + else: + plot_size = 8 + w_space = 0.15 + h_space = 0.22 + title_from_top = 0.09 + legend_from_top = 0.10 + legend_from_top_per_item = 0.07 + x_label_from_bottom = 0.06 + legend_spacing = 0.8 + font_size = 15 + font_size_legend = font_size + font_size_title = 18 + bounding_box = None # means not 'tight' + + # Initializes the plot + size_x = plot_size * num_cols + size_y = plot_size * num_rows + fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k') + fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title) + plt.subplots_adjust(wspace=w_space, 
hspace=h_space) + rcParams.update({'font.size': font_size}) + + # Loops over each subplot + for row in range(num_rows): + for col in range(num_cols): + index = row * num_cols + col + result = results[index] + ax = axes.flat[index] + plt.sca(ax) + print("[plot] Plotting subplot %d" % index) + + # Sets the x-axis labels + x_list = [[r[x_key] for r in result] for x_key in x_keys[index]] + x_ticks = [",".join([utils.float_to_kilo_mega(v) for v in values]) for values in zip(*x_list)] + x_location = range(len(x_ticks)) + + # Optional sparsifying of the labels on the x-axis + if tight_plot and len(x_location) > 10: + x_ticks = [v if not (i % 2) else "" for i, v in enumerate(x_ticks)] + + # Sets the y-data + y_list = [[r[y_key] for r in result] for y_key in y_keys[index]] + y_max = max([max(y) for y in y_list]) + + # Sets the axes + y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200 + y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding + plt.ylim(ymin=0, ymax=y_axis_limit) + plt.xticks(x_location, x_ticks, rotation='vertical') + + # Sets the labels + ax.set_title(titles[index], y=1.0 - title_from_top, fontsize=font_size) + if col == 0 or y_labels[index] != y_labels[index - 1]: + ax.set_ylabel(y_labels[index]) + ax.set_xlabel(x_labels[index]) + ax.xaxis.set_label_coords(0.5, x_label_from_bottom) + + # Plots the graph + assert len(COLORS) >= len(y_keys[index]) + assert len(MARKERS) >= len(y_keys[index]) + assert len(label_names) == len(y_keys[index]) + for i in range(len(y_keys[index])): + ax.plot(x_location, y_list[i], MARKERS[i], label=label_names[i], color=COLORS[i]) + + # Sets the legend + leg = ax.legend(loc=(0.02, 1.0 - legend_from_top - legend_from_top_per_item * len(y_keys[index])), + handletextpad=0.1, labelspacing=legend_spacing, fontsize=font_size_legend) + leg.draw_frame(False) + + # Saves the plot to disk + print("[benchmark] Saving plot to '" + file_name + "'") + fig.savefig(file_name, bbox_inches=bounding_box) diff --git 
a/scripts/benchmark/settings.py b/scripts/benchmark/settings.py new file mode 100644 index 00000000..38db9ef5 --- /dev/null +++ b/scripts/benchmark/settings.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import utils + + +AXPY = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 3, + "benchmarks": [ + { + "name": "axpy", "num_runs": 40, + "title": "multiples of 256K", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}], + }, + { + "name": "axpy", "num_runs": 40, + "title": "multiples of 256K+1", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}], + }, + { + "name": "axpy", "num_runs": 40, + "title": "around 1M", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], + }, + { + "name": "axpy", "num_runs": 20, + "title": "around 16M", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], + }, + { + "name": "axpy", "num_runs": 20, + "title": "strides n=8M", + "x_label": "increments for x,y", "x_keys": ["incx", "incy"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1} + for inc_x in [1, 2, 4] for inc_y in 
[1, 2, 4]], + }, + { + "name": "axpy", "num_runs": 40, + "title": "powers of 2", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(utils.k(32), utils.m(64))], + } + ] +} + +AXPYBATCHED = { + "label_names": ["CLBlast", "clBLAS (non batched)"], + "num_rows": 1, "num_cols": 3, + "benchmarks": [ + { + "name": "axpybatched", "num_runs": 30, + "title": "8 batches", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(utils.k(8), utils.m(4))], + }, + { + "name": "axpybatched", "num_runs": 20, + "title": "64 batches", + "x_label": "sizes (n)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(utils.k(8), utils.m(4))], + }, + { + "name": "axpybatched", "num_runs": 40, + "title": "n=512K", + "x_label": "number of batches", "x_keys": ["batch_num"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1} + for b in utils.powers_of_2(1, 512)], + } + ] +} + +GEMV = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 3, + "benchmarks": [ + { + "name": "gemv", "num_runs": 40, + "title": "multiples of 256", + "x_label": "sizes (n=m)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}], + }, + { + "name": "gemv", "num_runs": 40, + "title": "multiples of 257", + "x_label": "sizes (n=m)", "x_keys": ["n"], + "y_label": 
"GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}], + }, + { + "name": "gemv", "num_runs": 20, + "title": "around 4K", + "x_label": "sizes (n=m)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}], + }, + { + "name": "gemv", "num_runs": 40, + "title": "multiples of 256 rotated", + "x_label": "sizes (n=m)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}], + }, + { + "name": "gemv", "num_runs": 40, + "title": "multiples of 257 rotated", + "x_label": "sizes (n=m)", "x_keys": ["n"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}], + }, + { + "name": "gemv", "num_runs": 20, + "title": "strides n=m=4K", + "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"], + "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1} + for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]], + } + ] +} + +GEMM = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 3, + "benchmarks": [ + { + "name": "gemm", "num_runs": 20, + "title": "multiples of 128", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, + "transA": 111, "transB": 111, "step": 128, "num_steps": 20}], + }, + { + "name": "gemm", "num_runs": 20, + "title": "multiples of 129", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": 
"GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102, + "transA": 111, "transB": 111, "step": 129, "num_steps": 20}], + }, + { + "name": "gemm", "num_runs": 20, + "title": "around 512", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102, + "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], + }, + { + "name": "gemm", "num_runs": 10, + "title": "around 2048", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102, + "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], + }, + { + "name": "gemm", "num_runs": 10, + "title": "layouts/transpose", + "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout, + "transA": transA, "transB": transB, "step": 0, "num_steps": 1} + for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]], + }, + { + "name": "gemm", "num_runs": 10, + "title": "powers of 2", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": n, "n": n, "k": n, "layout": 102, + "transA": 111, "transB": 111, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(8, utils.k(4))], + } + ] +} + +GEMM_SMALL = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 1, + "benchmarks": [ + { + "name": "gemm", "num_runs": 10, + "title": "small matrices in steps of 16", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 128, "n": 128, "k": 
128, "layout": 102, + "transA": 111, "transB": 111, "step": 16, "num_steps": 57}], + }, + { + "name": "gemm", "num_runs": 10, + "title": "small matrices in steps of 1", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, + "transA": 111, "transB": 111, "step": 1, "num_steps": 385}], + }, + + ] +} + +GEMMBATCHED = { + "label_names": ["CLBlast", "clBLAS (non batched)"], + "num_rows": 1, "num_cols": 3, + "benchmarks": [ + { + "name": "gemmbatched", "num_runs": 40, + "title": "8 batches", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102, + "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], + }, + { + "name": "gemmbatched", "num_runs": 20, + "title": "64 batches", + "x_label": "sizes (m=n=k)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102, + "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], + }, + { + "name": "gemmbatched", "num_runs": 30, + "title": "m=n=k=128", + "x_label": "number of batches", "x_keys": ["batch_num"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102, + "transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(16))], + } + ] +} + +SYMM = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 3, + "benchmarks": [ + { + "name": "symm", "num_runs": 10, + "title": "multiples of 128", + "x_label": "sizes (m=n)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 128, "n": 128, "layout": 102, + "side": 141, "triangle": 121, "step": 128, 
"num_steps": 20}], + }, + { + "name": "symm", "num_runs": 10, + "title": "multiples of 129", + "x_label": "sizes (m=n)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 129, "n": 129, "layout": 102, + "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], + }, + { + "name": "symm", "num_runs": 10, + "title": "around 512", + "x_label": "sizes (m=n)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 512, "n": 512, "layout": 102, + "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], + }, + { + "name": "symm", "num_runs": 10, + "title": "around 2048", + "x_label": "sizes (m=n)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 2048, "n": 2048, "layout": 102, + "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], + }, + { + "name": "symm", "num_runs": 10, + "title": "layouts/sides/triangles", + "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": 1024, "n": 1024, "layout": layout, + "side": side, "triangle": triangle, "step": 0, "num_steps": 1} + for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]], + }, + { + "name": "symm", "num_runs": 10, + "title": "powers of 2", + "x_label": "sizes (m=n)", "x_keys": ["m"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"m": n, "n": n, "layout": 102, + "side": 141, "triangle": 121, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(8, utils.k(4))], + } + ] +} + +SYRK = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 2, "num_cols": 3, + "benchmarks": [ + { + "name": "syrk", "num_runs": 10, + "title": "multiples of 128", + "x_label": "sizes (n=k)", "x_keys": ["n"], + "y_label": "GFLOPS (higher is better)", 
"y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": 128, "k": 128, "layout": 102, + "side": 141, "triangle": 121, "step": 128, "num_steps": 20}], + }, + { + "name": "syrk", "num_runs": 10, + "title": "multiples of 129", + "x_label": "sizes (n=k)", "x_keys": ["n"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": 129, "k": 129, "layout": 102, + "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], + }, + { + "name": "syrk", "num_runs": 10, + "title": "around 512", + "x_label": "sizes (n=k)", "x_keys": ["n"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": 512, "k": 512, "layout": 102, + "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], + }, + { + "name": "syrk", "num_runs": 10, + "title": "around 2048", + "x_label": "sizes (n=k)", "x_keys": ["n"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": 2048, "k": 2048, "layout": 102, + "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], + }, + { + "name": "syrk", "num_runs": 10, + "title": "layouts/sides/triangles", + "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": 1024, "k": 1024, "layout": layout, + "triangle": triangle, "transA": transA, "step": 0, "num_steps": 1} + for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]], + }, + { + "name": "syrk", "num_runs": 10, + "title": "powers of 2", + "x_label": "sizes (n=k)", "x_keys": ["n"], + "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "arguments": [{"n": n, "k": n, "layout": 102, + "side": 141, "triangle": 121, "step": 0, "num_steps": 1} + for n in utils.powers_of_2(8, utils.k(4))], + } + ] +} + +SUMMARY = { + "label_names": ["CLBlast", "clBLAS"], + "num_rows": 3, "num_cols": 2, + "benchmarks": [ + 
AXPY["benchmarks"][0], + AXPY["benchmarks"][1], + GEMV["benchmarks"][0], + GEMV["benchmarks"][1], + GEMM["benchmarks"][0], + GEMM["benchmarks"][1], + ] +} diff --git a/scripts/benchmark/utils.py b/scripts/benchmark/utils.py new file mode 100644 index 00000000..62e18de2 --- /dev/null +++ b/scripts/benchmark/utils.py @@ -0,0 +1,66 @@ +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import csv +import subprocess + + +def k(value): + return value * 1024 + + +def m(value): + return value * 1024 * 1024 + + +def float_to_kilo_mega(value): + if value % 1024 or value <= 1024: + return "%.0f" % value + elif value % (1024 * 1024) or value <= (1024 * 1024): + return "%.0fK" % (value / 1024.0) + else: + return "%.0fM" % (value / (1024.0 * 1024.0)) + + +def powers_of_2(start, stop): + while start <= stop: + yield start + start *= 2 + + +def precision_to_letter(precision): + if precision == 16: + return "H" + elif precision == 32: + return "S" + elif precision == 64: + return "D" + elif precision == 3232: + return "C" + elif precision == 6464: + return "Z" + else: + return "X" + + +def run_binary(command, arguments): + full_command = command + " " + " ".join(arguments) + print("[benchmark] Calling binary: %s" % str(full_command)) + try: + return subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read() + except OSError as e: + print("[benchmark] Error while running the binary, got exception: %s" + str(e)) + return False + + +def parse_results(csv_data): + csv_data = csv_data.split("\n") + results = csv.DictReader(csv_data, delimiter=";", skipinitialspace=True) + results = [r for r in results] + for result in results: + for key in result: + result[key] = float(result[key]) if "." 
in result[key] else int(result[key]) + return results diff --git a/scripts/database/database.py b/scripts/database/database.py index 31f313da..e398aa30 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -29,12 +29,62 @@ VENDOR_TRANSLATION_TABLE = { } +def remove_mismatched_arguments(database): + """Checks for tuning results with mis-matched entries and removes them according to user preferences""" + kernel_attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + + # For Python 2 and 3 compatibility + try: + user_input = raw_input + except NameError: + user_input = input + pass + + # Check for mis-matched entries + for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes): + group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES) + if len(group_by_arguments) != 1: + print("[database] WARNING: entries for a single kernel with multiple argument values " + str(kernel_group_name)) + print("[database] Either quit now, or remove all but one of the argument combinations below:") + for index, (attribute_group_name, mismatching_entries) in enumerate(group_by_arguments): + print("[database] %d: %s" % (index, attribute_group_name)) + for attribute_group_name, mismatching_entries in group_by_arguments: + response = user_input("[database] Remove entries corresponding to %s, [y/n]? 
" % str(attribute_group_name)) + if response == "y": + for entry in mismatching_entries: + database["sections"].remove(entry) + print("[database] Removed %d entry/entries" % len(mismatching_entries)) + + # Sanity-check: all mis-matched entries should be removed + for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes): + group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES) + if len(group_by_arguments) != 1: + print("[database] ERROR: entries for a single kernel with multiple argument values " + str(kernel_group_name)) + assert len(group_by_arguments) == 1 + + +def remove_database_entries(database, remove_if_matches_fields): + assert len(remove_if_matches_fields.keys()) > 0 + + def remove_this_entry(section): + for key in remove_if_matches_fields.keys(): + if section[key] != remove_if_matches_fields[key]: + return False + return True + + old_length = len(database["sections"]) + database["sections"] = [x for x in database["sections"] if not remove_this_entry(x)] + new_length = len(database["sections"]) + print("[database] Removed %d entries from the database" % (old_length - new_length)) + + def main(argv): # Parses the command-line arguments parser = argparse.ArgumentParser() parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database") parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-r", "--remove_device", type=str, default=None, help="Removes all entries for a specific device") parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") cl_args = parser.parse_args(argv) @@ -76,10 +126,19 @@ def main(argv): new_size = db.length(database) print("with " + str(new_size - old_size) + " new items") # Newline printed here + # Checks for tuning results with mis-matched entries + remove_mismatched_arguments(database) + # Stores the modified database back to disk if 
len(glob.glob(json_files)) >= 1: io.save_database(database, database_filename) + # Removes database entries before continuing + if cl_args.remove_device is not None: + print("[database] Removing all results for device '%s'" % cl_args.remove_device) + remove_database_entries(database, {"device": cl_args.remove_device}) + io.save_database(database, database_filename) + # Retrieves the best performing results print("[database] Calculating the best results per device/kernel...") database_best_results = bests.get_best_results(database) diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index d89b6350..8af3ab5b 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -123,7 +123,7 @@ def print_cpp_database(database, output_dir): devices = sorted(set([s["device"] for s in type_database])) for device_name in devices: device_database = [s for s in type_database if s["device"] == device_name] - device_name_quoted = "\"%s\"," % device_name + device_name_quoted = "\"%s\"," % device_name.strip() device_name_cpp = " { %-50s { " % device_name_quoted f.write(device_name_cpp) diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py index 94948b1a..51c9f1ec 100644 --- a/scripts/database/database/db.py +++ b/scripts/database/database/db.py @@ -5,6 +5,9 @@ # Author(s): # Cedric Nugteren +import itertools +from operator import itemgetter + import clblast @@ -62,3 +65,14 @@ def combine_result(old_results, new_result): # No match found: append a new result old_results.append(new_result) return old_results + + +def group_by(database, attributes): + """Returns an list with the name of the group and the corresponding entries in the database""" + assert len(database) > 0 + attributes = [a for a in attributes if a in database[0]] + database.sort(key=itemgetter(*attributes)) + result = [] + for key, data in itertools.groupby(database, key=itemgetter(*attributes)): + result.append((key, 
list(data))) + return result diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py index d14f1297..0bba2ce1 100644 --- a/scripts/database/database/io.py +++ b/scripts/database/database/io.py @@ -56,5 +56,11 @@ def load_tuning_results(filename): assert json_data["precision"] == str(result["parameters"]["PRECISION"]) result["parameters"].pop("PRECISION", None) + # Fixes the scalar argument values + for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]): + for field in ["arg_alpha", "arg_beta"]: + if field in json_data.keys() and json_data[field] == value: + json_data[field] = replacement + # All done return json_data diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 35d902b7..8c13b2ff 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -38,11 +38,14 @@ FILES = [ "/src/clblast_c.cpp", "/test/wrapper_clblas.hpp", "/test/wrapper_cblas.hpp", + "/test/wrapper_cublas.hpp", "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] +HEADER_LINES = [122, 77, 126, 24, 29, 41, 29, 65, 32] +FOOTER_LINES = [25, 139, 27, 38, 6, 6, 6, 9, 2] +HEADER_LINES_DOC = 0 +FOOTER_LINES_DOC = 63 # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." 
@@ -99,65 +102,69 @@ bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft # Populates a list of routines ROUTINES = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), + Routine(False, True, False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, False, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, False, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, False, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, False, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, False, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], 
[xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, False, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, False, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. 
The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same 
operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], 
["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, False, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, False, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2a", "symv", T, 
[S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, False, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, False, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, False, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, False, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(True, True, False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", 
"Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + 
conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], 
["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix 
update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, False, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, False, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an 
Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, False, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, False, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, False, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, False, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, False, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, False, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, False, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same 
operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, False, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(True, True, False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + # Special routines: + Routine(True, True, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + # Batched routines: + Routine(True, True, True, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), + Routine(True, True, True, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]), ]] @@ -188,7 +195,7 @@ def main(argv): # Re-writes the body of the file with open(library_root + FILES[i], "w") as f: body = "" - levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4] + levels = [1, 2, 3] if (i == 4 or i == 5 or i == 6) else [1, 2, 3, 4] for level in levels: body += cpp.LEVEL_SEPARATORS[level - 1] + "\n" for routine in ROUTINES[level - 1]: @@ -205,9 +212,13 @@ def main(argv): if i == 5: body += cpp.wrapper_cblas(routine) if i == 6: - body += cpp.clblast_netlib_c_h(routine) + body += cpp.wrapper_cublas(routine) if i == 7: - body += cpp.clblast_netlib_c_cc(routine) + if not routine.batched: + body += cpp.clblast_netlib_c_h(routine) + if i == 8: + if not routine.batched: + body += cpp.clblast_netlib_c_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) @@ -217,7 +228,7 @@ def main(argv): for routine in ROUTINES[level - 1]: if routine.has_tests: level_string = cpp.LEVEL_NAMES[level - 1] - routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp" + routine_suffix = "level" + level_string + "/x" + routine.lowercase_name() + ".cpp" # Correctness tests filename = library_root + "/test/correctness/routines/" + routine_suffix @@ -233,11 +244,20 @@ def main(argv): f.write(cpp.performance_test(routine, level_string)) f.write(cpp.FOOTER) - # Outputs 
the API documentation + # API documentation filename = cl_args.clblast_root + "/doc/clblast.md" + + # Stores the header and the footer of the original documentation file + with open(filename) as f: + original = f.readlines() + file_header = original[:HEADER_LINES_DOC] + file_footer = original[-FOOTER_LINES_DOC:] + + # Outputs the API documentation with open(filename, "w") as f: # Outputs the header + f.write("".join(file_header)) doc_header = doc.header() f.write(doc_header) @@ -248,5 +268,8 @@ def main(argv): doc_routine = doc.generate(routine) f.write(doc_routine) + # Outputs the footer + f.write("".join(file_footer)) + if __name__ == '__main__': main(sys.argv[1:]) diff --git a/scripts/generator/generator/convert.py b/scripts/generator/generator/convert.py index c0309ec3..07f45669 100644 --- a/scripts/generator/generator/convert.py +++ b/scripts/generator/generator/convert.py @@ -56,6 +56,19 @@ def option_to_cblas(x): }[x] + +def option_to_cublas(x): + """As above, but for cuBLAS data-types""" + return { + 'layout': "Layout", + 'a_transpose': "cublasOperation_t", + 'b_transpose': "cublasOperation_t", + 'ab_transpose': "cublasOperation_t", + 'side': "cublasSideMode_t", + 'triangle': "cublasFillMode_t", + 'diagonal': "cublasDiagType_t", + }[x] + + def option_to_documentation(x): """Translates an option name to a documentation string""" return { diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index c14d00a1..17e418e3 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -51,8 +51,10 @@ def clblast_cc(routine): result += routine.routine_header_cpp(12, "") + " {" + NL result += " try {" + NL result += " auto queue_cpp = Queue(*queue);" + NL - result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL - result += " routine.Do" + routine.name.capitalize() + "(" + result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template
+ ">(queue_cpp, event);" + NL + if routine.batched: + result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL + result += " routine.Do" + routine.capitalized_name() + "(" result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) result += ");" + NL result += " return StatusCode::kSuccess;" + NL @@ -63,7 +65,7 @@ def clblast_cc(routine): result += "}" + NL for flavour in routine.flavours: indent2 = " " * (34 + routine.length() + len(flavour.template)) - result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">(" + result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">(" result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL return result @@ -84,9 +86,11 @@ def clblast_c_cc(routine): template = "<" + flavour.template + ">" if routine.no_scalars() else "" indent = " " * (16 + routine.length() + len(template)) result += routine.routine_header_c(flavour, 27, "") + " {" + NL + if routine.batched: + result += " " + (NL + " ").join(routine.batched_transform_to_complex(flavour)) + NL result += " try {" + NL result += " return static_cast(" + NL - result += " clblast::" + routine.name.capitalize() + template + "(" + result += " clblast::" + routine.capitalized_name() + template + "(" result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) result += "," + NL + indent + "queue, event)" + NL result += " );" + NL @@ -286,14 +290,69 @@ def wrapper_cblas(routine): return result +def wrapper_cublas(routine): + """The wrapper to the reference cuBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the cuBLAS calls for %s" % routine.short_names_tested() + NL + if routine.no_scalars(): + result += routine.routine_header_wrapper_cublas(routine.template, True, 23) 
+ ";" + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_cublas(flavour, False, 23) + " {" + NL + + # There is a version available in cuBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (24 + routine.length()) + arguments = routine.arguments_wrapper_cublas(flavour) + + # Handles row-major + if routine.has_layout(): + result += " if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }" + NL + + # Complex scalars + for scalar in routine.scalars: + if flavour.is_complex(scalar): + cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex" + result += " " + cuda_complex + " " + scalar + "_cuda;" + NL + result += " " + scalar + "_cuda.x = " + scalar + ".real();" + NL + result += " " + scalar + "_cuda.y = " + scalar + ".imag();" + NL + + # Calls the cuBLAS routine + result += " auto status = cublas" + flavour.name_cublas() + routine.name + "(handle, " + result += ("," + NL + indent).join([a for a in arguments]) + ");" + NL + result += " cudaDeviceSynchronize();" + NL + result += " return status;" + + # There is no cuBLAS available, forward the call to one of the available functions + else: # Half-precision + result += " return CUBLAS_STATUS_NOT_SUPPORTED;" + # indent = " " * (24 + routine.length()) + + # # Convert to float (note: also integer buffers are stored as half/float) + # for buf in routine.inputs + routine.outputs: + # result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + + # # Call the float routine + # result += " return cublasX" + routine.name + "(handle," + # result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + ");" + NL + # result += " cudaDeviceSynchronize();" + NL + # result += " return status;" + + # # Convert back to half + # for buf in routine.outputs: + # result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL + # result += " return status;" + + # 
Complete + result += NL + "}" + NL + return result + + def performance_test(routine, level_string): """Generates the body of a performance test for a specific routine""" result = "" result += "#include \"test/performance/client.hpp\"" + NL - result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL - result += "// Shortcuts to the clblast namespace" + NL - result += "using float2 = clblast::float2;" + NL - result += "using double2 = clblast::double2;" + NL + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL result += "// Main function (not within the clblast namespace)" + NL result += "int main(int argc, char *argv[]) {" + NL result += " const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);" + NL @@ -304,7 +363,7 @@ def performance_test(routine, level_string): found = False for flavour in routine.flavours: if flavour.precision_name == precision: - result += NL + " clblast::RunClient, " + self.buffer_type + ", " + self.beta_cpp - return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp + return "<" + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp + return "<" + buffer_type + ">, " + buffer_type + ", " + beta_cpp def is_complex(self, scalar): """Current scalar is complex""" @@ -85,6 +87,11 @@ class DataType: """Current type is of a non-standard type""" return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2] + def name_cublas(self): + if "i" in self.name: + return "I" + self.name[1].lower() + return self.name + # Regular data-types H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py index c77ec1a0..9c73ffbc 100644 --- a/scripts/generator/generator/doc.py +++ b/scripts/generator/generator/doc.py @@ -20,7 +20,7 @@ def generate(routine): result = "" # Routine header 
- result += "x" + routine.name.upper() + ": " + routine.description + NL + result += "x" + routine.upper_name() + ": " + routine.description + NL result += "-------------" + NL + NL result += routine.details + NL + NL @@ -36,7 +36,7 @@ def generate(routine): result += "```" + NL + NL # Routine arguments - result += "Arguments to " + routine.name.upper() + ":" + NL + NL + result += "Arguments to " + routine.upper_name() + ":" + NL + NL for argument in routine.arguments_doc(): result += "* " + argument + NL result += "* `cl_command_queue* queue`: " @@ -47,7 +47,7 @@ def generate(routine): # Routine requirements if len(routine.requirements_doc()) > 0: - result += "Requirements for " + routine.name.upper() + ":" + NL + NL + result += "Requirements for " + routine.upper_name() + ":" + NL + NL for requirement in routine.requirements_doc(): result += "* " + requirement + NL result += NL diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 6fcce23b..1c534611 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -12,11 +12,12 @@ import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. 
name, which arguments, which precisions)""" - def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, + def __init__(self, implemented, has_tests, batched, level, name, template, flavours, sizes, options, inputs, outputs, buffer_sizes, scalars, scratch, description, details, requirements): self.implemented = implemented self.has_tests = has_tests + self.batched = batched self.level = level self.name = name self.template = template @@ -32,6 +33,69 @@ class Routine: self.details = details self.requirements = requirements + def lowercase_name(self): + postfix = "batched" if self.batched else "" + return self.name + postfix + + def plain_name(self): + postfix = "Batched" if self.batched else "" + return self.name + postfix + + def capitalized_name(self): + postfix = "Batched" if self.batched else "" + return self.name.capitalize() + postfix + + def upper_name(self): + postfix = "BATCHED" if self.batched else "" + return self.name.upper() + postfix + + def b_star(self): + return "*" if self.batched else "" + + def b_s(self): + return "s" if self.batched else "" + + def batch_count_def(self): + return ["const size_t batch_count"] if self.batched else [] + + def batch_count_list(self): + return ["batch_count"] if self.batched else [] + + def batch_count_type(self): + return ["const size_t"] if self.batched else [] + + def batch_count_doc(self): + return ["`const size_t batch_count`: Number of batches. 
This value must be positive."] if self.batched else [] + + def batched_transform_to_cpp(self): + result = [] + for scalar in self.scalars: + result.append("auto " + scalar + "s_cpp = std::vector();") + for buffer_name in self.inputs + self.outputs: + result.append("auto " + buffer_name + "_offsets_cpp = std::vector();") + result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") + for scalar in self.scalars: + result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);") + for buffer_name in self.inputs + self.outputs: + result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);") + result.append("}") + return result + + def batched_transform_to_complex(self, flavour): + result = [] + for scalar in self.scalars: + result.append("auto " + scalar + "s_cpp = std::vector<" + flavour.buffer_type + ">();") + result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") + for scalar in self.scalars: + content = scalar + if scalar == "alpha": + content = flavour.use_alpha(postfix="s[batch]") + elif scalar == "beta": + content = flavour.use_beta(postfix="s[batch]") + result.append(" " + scalar + "s_cpp.push_back(" + content + ");") + result.append("}") + return result + @staticmethod def scalar_buffers_first(): """List of scalar buffers""" @@ -127,21 +191,25 @@ class Routine: def length(self): """Retrieves the number of characters in the routine's name""" - return len(self.name) + return len(self.capitalized_name()) def no_scalars(self): """Determines whether or not this routine has scalar arguments (alpha/beta)""" return self.scalars == [] + def has_layout(self): + """Determines whether the layout is an argument""" + return "layout" in self.options + def short_names(self): """Returns the upper-case names of these routines (all flavours)""" - return "/".join([f.name + self.name.upper() for f in self.flavours]) + return "/".join([f.name + self.upper_name() for f in self.flavours]) def 
short_names_tested(self): """As above, but excludes some""" - names = [f.name + self.name.upper() for f in self.flavours] - if "H" + self.name.upper() in names: - names.remove("H" + self.name.upper()) + names = [f.name + self.upper_name() for f in self.flavours] + if "H" + self.upper_name() in names: + names.remove("H" + self.upper_name()) return "/".join(names) def buffers_first(self): @@ -159,7 +227,7 @@ class Routine: """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" if name in self.inputs or name in self.outputs: a = [name + "_buffer"] - b = [name + "_offset"] + b = [name + "_offset" + self.b_s()] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -187,13 +255,13 @@ class Routine: prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: a = [prefix + "cl_mem " + name + "_buffer"] - b = ["const size_t " + name + "_offset"] + b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] def buffer_def_wrapper_cl(self, name, flavour): - """As above but with data-types""" + """As above but for OpenCL""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] @@ -202,6 +270,16 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_def_wrapper_cuda(self, name, flavour): + """As above but for CUDA""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + flavour.buffer_type + "* " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", 
".join(a + b + c)] + return [] + def buffer_def_vector(self, name, flavour): """As above but as vectors""" prefix = "const " if name in self.inputs else "" @@ -228,7 +306,7 @@ class Routine: if name in self.inputs or name in self.outputs: buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] - b = [name + "_offset"] + b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -265,12 +343,38 @@ class Routine: return [", ".join(a + c)] return [] + def buffer_wrapper_cublas(self, name, flavour): + """As above but for the cuBLAS wrapper""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + if name in self.index_buffers(): + a = ["reinterpret_cast(&" + name + "_buffer[" + name + "_offset])"] + elif name in self.outputs and flavour.name in ["Sc", "Dz"]: + dtype = "float" if flavour.name == "Sc" else "double" + a = ["reinterpret_cast<" + dtype + "*>(&" + name + "_buffer[" + name + "_offset])"] + elif flavour.precision_name in ["C", "Z"]: + cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex" + a = ["reinterpret_cast<" + prefix + cuda_complex + "*>" + + "(&" + name + "_buffer[" + name + "_offset])"] + else: + a = ["&" + name + "_buffer[" + name + "_offset]"] + c = [] + if name in ["x", "y"]: + c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + result = [", ".join(a + c)] + if self.name == "trmm" and name == "a": + result *= 2 + return result + return [] + def buffer_type(self, name): """As above, but only data-types""" prefix = "const " if (name in self.inputs) else "" if (name in self.inputs) or (name in self.outputs): a = [prefix + "cl_mem"] - b = ["const size_t"] + b = 
["const size_t" + self.b_star()] c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -283,18 +387,19 @@ class Routine: math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] - b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] + c = [] if name not in self.buffers_without_ld_inc(): c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."] - else: - c = [] return a + b + c return [] def scalar(self, name): """Retrieves the name of a scalar (alpha/beta)""" if name in self.scalars: + if self.batched: + return [name + "s_cpp"] return [name] return [] @@ -314,8 +419,12 @@ class Routine: """Retrieves the use of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": + if self.batched: + return ["alphas_cpp.data()"] return [flavour.use_alpha()] elif name == "beta": + if self.batched: + return ["betas_cpp.data()"] return [flavour.use_beta()] return [name] return [] @@ -338,20 +447,28 @@ class Routine: return [name] return [] + def scalar_use_wrapper_cublas(self, name, flavour): + """As above, but for the cuBLAS wrapper""" + if name in self.scalars: + if flavour.is_complex(name): + return ["&" + name + "_cuda"] + return ["&" + name] + return [] + def scalar_def(self, name, flavour): """Retrieves the definition of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": - return ["const " 
+ flavour.alpha_cl + " " + name] - return ["const " + flavour.beta_cl + " " + name] + return ["const " + flavour.alpha_cl + " " + self.b_star() + name + self.b_s()] + return ["const " + flavour.beta_cl + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_plain(self, name, flavour): """As above, but without 'cl_' prefix""" if name in self.scalars: if name == "alpha": - return ["const " + flavour.alpha_cpp + " " + name] - return ["const " + flavour.beta_cpp + " " + name] + return ["const " + flavour.alpha_cpp + " " + self.b_star() + name + self.b_s()] + return ["const " + flavour.beta_cpp + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_void(self, name, flavour): @@ -368,16 +485,16 @@ class Routine: """Retrieves the type of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": - return ["const " + flavour.alpha_cpp] - return ["const " + flavour.beta_cpp] + return ["const " + flavour.alpha_cpp + self.b_star()] + return ["const " + flavour.beta_cpp + self.b_star()] return [] def scalar_doc(self, name): """Retrieves the documentation of a scalar""" if name in self.scalars: if name == "alpha": - return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."] - return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] + return ["`const " + self.template.alpha_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] + return ["`const " + self.template.beta_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] return [] def scalar_create_cpp(self, flavour): @@ -396,6 +513,12 @@ class Routine: return [", ".join([s for s in self.sizes])] return [] + def sizes_list_as_int(self): + """Retrieves a list of comma-separated sizes (m, n, k) cast to integers""" + if self.sizes: + return [", ".join(["static_cast(" + s + ")" for s in self.sizes])] + return [] + def sizes_def(self): """Retrieves the 
definition of the sizes (m,n,k)""" if self.sizes: @@ -427,6 +550,15 @@ class Routine: return [", ".join(self.options)] return [] + def options_list_no_layout(self): + """Retrieves a list of options""" + options = self.options[:] + if "layout" in options: + options.remove("layout") + if options: + return [", ".join(options)] + return [] + def options_cast(self, indent): """As above, but now casted to CLBlast data-types""" if self.options: @@ -462,6 +594,13 @@ class Routine: return [", ".join(definitions)] return [] + def options_def_wrapper_cublas(self): + """As above, but now using cuBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_cublas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + def options_type(self): """Retrieves the types of the options (layout, transpose, side, etc.)""" if self.options: @@ -507,7 +646,8 @@ class Routine: self.scalar("beta") + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + list(chain(*[self.scalar(s) for s in self.other_scalars()])) + + self.batch_count_list()) def arguments_cast(self, flavour, indent): """As above, but with CLBlast casts""" @@ -518,7 +658,8 @@ class Routine: self.scalar_use("beta", flavour) + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])) + + self.batch_count_list()) def arguments_netlib(self, flavour, indent): """As above, but for the Netlib CBLAS API""" @@ -544,7 +685,7 @@ class Routine: def arguments_wrapper_cblas(self, flavour): """As above, but for the CBLAS wrapper""" - return (self.options_list() + self.sizes_list() + + return 
(self.options_list() + self.sizes_list_as_int() + self.scalar_use_wrapper_cblas("alpha", flavour) + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) + self.scalar_use_wrapper_cblas("beta", flavour) + @@ -552,6 +693,17 @@ class Routine: list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()]))) + def arguments_wrapper_cublas(self, flavour): + """As above, but for the cuBLAS wrapper""" + return (self.options_list_no_layout() + self.sizes_list_as_int() + + self.scalar_use_wrapper_cublas("alpha", flavour) + + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_first()])) + + self.scalar_use_wrapper_cublas("beta", flavour) + + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_first()])) + + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper_cublas(s, flavour) for s in self.other_scalars()]))) + def arguments_def(self, flavour): """Retrieves a combination of all the argument definitions""" return (self.options_def() + self.sizes_def() + @@ -561,7 +713,8 @@ class Routine: self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + + self.batch_count_def()) def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" @@ -574,6 +727,7 @@ class Routine: list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) if self.name in self.routines_scalar_no_return(): result += 
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + result += self.batch_count_def() return result def arguments_def_c(self, flavour): @@ -585,7 +739,8 @@ class Routine: self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + + self.batch_count_def()) def arguments_def_wrapper_clblas(self, flavour): """As above, but clBLAS wrapper plain data-types""" @@ -609,6 +764,17 @@ class Routine: list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + def arguments_def_wrapper_cublas(self, flavour): + """As above, but cuBLAS wrapper plain data-types""" + return (self.options_def_wrapper_cublas() + self.sizes_def() + + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + def arguments_type(self, flavour): """Retrieves a combination of all the argument types""" return (self.options_type() + self.sizes_type() + @@ -618,7 +784,8 @@ class Routine: self.scalar_type("beta", flavour) + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_type(s, flavour) 
for s in self.other_scalars()]))) + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])) + + self.batch_count_type()) def arguments_doc(self): """Retrieves a combination of all the argument types""" @@ -630,7 +797,8 @@ class Routine: self.scalar_doc("beta") + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_doc(s) for s in self.other_scalars()]))) + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])) + + self.batch_count_doc()) def requirements_doc(self): """Retrieves a list of routine requirements for documentation""" @@ -640,7 +808,7 @@ class Routine: """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) result = "template <" + self.template.name + ">\n" - result += "StatusCode " + self.name.capitalize() + "(" + result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" return result @@ -649,7 +817,7 @@ class Routine: """As above, but now without variable names""" indent = " " * (spaces + self.length()) result = "template <" + self.template.name + ">\n" - result += "StatusCode " + self.name.capitalize() + "(" + result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) result += ",\n" + indent + "cl_command_queue*, cl_event*)" return result @@ -657,7 +825,7 @@ class Routine: def routine_header_c(self, flavour, spaces, extra_qualifier): """As above, but now for C""" indent = " " * (spaces + self.length()) - result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "(" + result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.plain_name() + "(" result += (",\n" + 
indent).join([a for a in self.arguments_def_c(flavour)]) result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" return result @@ -677,6 +845,8 @@ class Routine: if self.name in self.routines_scalar_no_return(): routine_name += "_sub" indent += " " + if self.batched: + routine_name += "batched" result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result @@ -703,3 +873,17 @@ class Routine: result = "void cblasX" + self.name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")" return result + + def routine_header_wrapper_cublas(self, flavour, def_only, spaces): + """As above, but now for the cuBLAS wrapper""" + template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" + indent = " " * (spaces + self.length() + len(template)) + result = "" + if self.no_scalars(): + result += "template <" + if def_only: + result += flavour.name + result += ">\n" + result += "cublasStatus_t cublasX" + self.name + template + "(cublasHandle_t handle, " + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cublas(flavour)]) + ")" + return result diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r deleted file mode 100644 index 2c437a9f..00000000 --- a/scripts/graphs/common.r +++ /dev/null @@ -1,262 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file implements the common performance scripts, such as creating a graph. 
-# -# ================================================================================================== - -# Colours -black = "#000000" -grey = "#888888" -purplish = "#550077" # [ 85, 0,119] lumi=26 -blueish = "#4765b1" # [ 71,101,177] lumi=100 -redish = "#d67568" # [214,117,104] lumi=136 -greenish = "#9bd4ca" # [155,212,202] lumi=199 - -# Sets the graph markers (circles, triangles, etc.) -pchs = c(15, 18, 17, 12) - -# Other constants -kilo = 1024 -mega = 1024*1024 - -# R options -options("width"=170) - -# ================================================================================================== - -# Settings -num_runs <- 5 -num_runs_short <- 50 -xtics_subset_threshold <- 100 -xtics_subset_stepsize <- 8 - -devices <- c("-platform","-device") -options_string <- "-q -no_abbrv -cblas 0" - -# Command-line arguments -command_line <- commandArgs(trailingOnly=TRUE) -if (length(command_line) != 2) { - print("Usage for device Z on platform Y: Rscript xxxxx.r Y Z") - quit() -} -platform_id <- command_line[1] -device_id <- command_line[2] - -# Selects the device -devices_values <- c(platform_id, device_id) -devices_string <- paste(devices, devices_values, collapse=" ") - - -# Filter the string: only lines containing a ";" can be valid lines -filter_string <- function(raw_result_string) { - result_string <- c() - for (line in raw_result_string) { - if (grepl(";",line)) { - result_string <- - c(result_string, line) - } - } - return(result_string) -} - -# ================================================================================================== - -# The main function -main <- function(routine_name, precision, test_names, test_values, - test_xlabels, test_xaxis, metric_gflops) { - - # Names - display_name <- toupper(routine_name) - if (precision == 16) { display_name <- gsub("^X","H",display_name); } - if (precision == 32) { display_name <- gsub("^X","S",display_name); } - if (precision == 64) { display_name <- gsub("^X","D",display_name); } - if (precision == 
3232) { display_name <- gsub("^X","C",display_name); } - if (precision == 6464) { display_name <- gsub("^X","Z",display_name); } - executable <- paste("./clblast_client_", routine_name, sep="") - - # Display - library_names <- c("CLBlast", "clBLAS") - if (precision == 16) { library_names <- c("CLBlast FP16", "CLBlast FP32", "clBLAS FP32"); } - colourset <- c(blueish, redish) - if (precision == 16) { colourset <- c(blueish, purplish, redish); } - - # Configures the outputfile - file_name <- paste(display_name, ".pdf", sep="") - if (length(test_names) == 6) { - pdf(file_name, height=8, width=13) - par(mfrow=c(2, 3)) - par(oma=c(0, 0, 0, 0)) - par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] - par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] - } - else { # length(test_names) == 2 - pdf(file_name, height=8, width=13) - par(mfrow=c(2, 1)) - par(oma=c(0, 0, 0, 0)) - par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] - par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] - } - - # Loops over the test-cases - for (test_id in 1:length(test_names)) { - params_values <- test_values[[test_id]] - - # Loops over the commands within a single list (within a case) - for (command_id in 1:length(params_values)) { - - # Runs the client and captures the result - params_string <- paste(parameters, params_values[[command_id]], collapse=" ") - arguments <- paste(devices_string, params_string, options_string, sep=" ") - print(paste("Running", executable, arguments, sep=" ")) - raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE) - result_string <- filter_string(raw_result_string) - - # Reads the result into a dataframe - command_db <- read.csv(text=result_string, sep=";") - - # For half-precision: also runs the FP32 version for comparison - if (precision == 16) { - params_string <- gsub("-precision 16", "-precision 32", 
params_string) - arguments <- paste(devices_string, params_string, options_string, sep=" ") - print(paste("Running", executable, arguments, sep=" ")) - raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE) - result_string <- filter_string(raw_result_string) - - # Reads the result into a dataframe - command_db_32 <- read.csv(text=result_string, sep=";") - stopifnot(nrow(command_db) == nrow(command_db_32)) - - # Combines the results - command_db["ms_FP32_1"] = command_db_32$ms_1 - command_db["GFLOPS_FP32_1"] = command_db_32$GFLOPS_1 - command_db["GBs_FP32_1"] = command_db_32$GBs_1 - command_db["ms_FP32_2"] = command_db_32$ms_2 - command_db["GFLOPS_FP32_2"] = command_db_32$GFLOPS_2 - command_db["GBs_FP32_2"] = command_db_32$GBs_2 - } - - # Append the results to the final dataframe - if (command_id == 1) { - db <- command_db - } else { - db <- rbind(db, command_db) - } - } - print(db) - - # Sets the values on the x-axis and their labels (test dependent) - if (is.character(test_xaxis[[test_id]][[1]])) { - xdata <- db[,test_xaxis[[test_id]][[1]]] - xtics <- xdata - log_scale <- test_xaxis[[test_id]][[2]] - } - else { - xdata <- test_xaxis[[test_id]][[1]] - xtics <- test_xaxis[[test_id]][[2]] - log_scale <- "" - } - - # Plots the graph with GFLOPS on the Y-axis - if (metric_gflops) { - if (precision == 16) { - ydata = list(db$GFLOPS_1, db$GFLOPS_FP32_1, db$GFLOPS_FP32_2) - ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_FP32_1), max(db$GFLOPS_FP32_2)) - } else { - ydata = list(db$GFLOPS_1, db$GFLOPS_2) - ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_2)) - } - plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale, - xmin=min(xdata), xmax=max(xdata), - ymin=0, ymax=ymax, - xtics=xtics, - xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)", - graph_title=paste(display_name, test_names[[test_id]], sep=" "), - multiple=50, experiment_names=library_names, colourset=colourset) - # Plots the graph with GB/s on the Y-axis - } else { - if 
(precision == 16) { - ydata = list(db$GBs_1, db$GBs_FP32_1, db$GBs_FP32_2) - ymax = max(max(db$GBs_1), max(db$GBs_FP32_1), max(db$GBs_FP32_2)) - } else { - ydata = list(db$GBs_1, db$GBs_2) - ymax = max(max(db$GBs_1), max(db$GBs_2)) - } - plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale, - xmin=min(xdata), xmax=max(xdata), - ymin=0, ymax=ymax, - xtics=xtics, - xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)", - graph_title=paste(display_name, test_names[[test_id]], sep=" "), - multiple=10, experiment_names=library_names, colourset=colourset) - } - } -} - -# ================================================================================================== - -# Plots data -plot_graph <- function(xdata, ydata, log_setting, - xmin, xmax, ymin, ymax, - xtics, xlabel, ylabel, - graph_title, - multiple, experiment_names, colourset) { - - # Update the ymax to the next multiple of something - ymax <- multiple*ceiling(ymax/multiple) - - # Add kilo or mega to the x-labels - for (i in 1:length(xtics)) { - if (!is.na(as.numeric(xtics[i]))) { - if (as.numeric(xtics[i])%%mega == 0) { - xtics[i] <- paste(as.character(as.numeric(xtics[i])/mega), "M", sep="") - } else if (as.numeric(xtics[i])%%kilo == 0) { - xtics[i] <- paste(as.character(as.numeric(xtics[i])/kilo), "K", sep="") - } - } - } - - # Creates an initial graph with axis but without data - par(new=F) - plot(x=xmin:xmax, y=rep(1, length(xmin:xmax)), log=log_setting, - main="", xlab="", ylab="", - ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n") - axis(side=2, las=2) - if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth - subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize) - axis(side=1, at=xdata[subset], labels=xtics[subset], las=2) - } else { - axis(side=1, at=xdata, labels=xtics, las=2) - } - title(xlab=xlabel, line=-1) - title(ylab=ylabel, line=2) - title(graph_title, line=-2) - par(new=T) - - # Loops over all experiments - 
num_experiments <- length(ydata) - for (id in 1:num_experiments) { - - # Plots the data for this experiment - plot(x=xdata, y=ydata[[id]], log=log_setting, - col=colourset[id], pch=pchs[id], lty=1, lwd=1, cex=1, - xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax), - axes=F, "b", xpd=T) - par(new=T) - } - - # Add a legend - legend("bottomright", experiment_names, - lwd=1, ncol=1, col=colourset, pch=pchs, lty=1, cex=1, - bty="n", xpd=T) - - # Done - par(new=F) -} - -# ================================================================================================== diff --git a/scripts/graphs/xaxpy.r b/scripts/graphs/xaxpy.r deleted file mode 100644 index 187590aa..00000000 --- a/scripts/graphs/xaxpy.r +++ /dev/null @@ -1,96 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xaxpy routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xaxpy" -parameters <- c("-n","-incx","-incy", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 256K", - "multiples of 256K (+1)", - "around n=1M", - "around n=16M", - "strides (n=8M)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c(256*kilo, 1, 1, 16, 256*kilo, num_runs, precision)), - list(c(256*kilo+1, 1, 1, 16, 256*kilo, num_runs, precision)), - list(c(1*mega, 1, 1, 16, 1, num_runs, precision)), - list(c(16*mega, 1, 1, 16, 1, num_runs, precision)), - list( - c(8*mega, 1, 1, 1, 0, num_runs, precision), - c(8*mega, 2, 1, 1, 0, num_runs, precision), - c(8*mega, 4, 1, 1, 0, num_runs, precision), - c(8*mega, 8, 1, 1, 0, num_runs, precision), - c(8*mega, 1, 2, 1, 0, num_runs, precision), - c(8*mega, 1, 4, 1, 0, num_runs, precision), - c(8*mega, 1, 8, 1, 0, num_runs, precision), - c(8*mega, 2, 2, 1, 0, num_runs, precision), - c(8*mega, 4, 4, 1, 0, num_runs, precision), - c(8*mega, 8, 8, 1, 0, num_runs, precision) - ), - list( - c(32*kilo, 1, 1, 1, 0, num_runs, precision), - c(64*kilo, 1, 1, 1, 0, num_runs, precision), - c(128*kilo, 1, 1, 1, 0, num_runs, precision), - c(256*kilo, 1, 1, 1, 0, num_runs, precision), - c(512*kilo, 1, 1, 1, 0, num_runs, precision), - c(1*mega, 1, 1, 1, 0, num_runs, precision), - c(2*mega, 1, 1, 1, 0, num_runs, precision), - c(4*mega, 1, 1, 1, 0, num_runs, precision), - c(8*mega, 
1, 1, 1, 0, num_runs, precision), - c(16*mega, 1, 1, 1, 0, num_runs, precision), - c(32*mega, 1, 1, 1, 0, num_runs, precision), - c(64*mega, 1, 1, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "vector sizes (n)", - "vector sizes (n)", - "vector sizes (n)", - "vector sizes (n)", - "increments/strides for x and y", - "vector sizes (n)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("n", ""), - c("n", ""), - c("n", ""), - c("n", ""), - list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8")), - c("n", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xgemm.r b/scripts/graphs/xgemm.r deleted file mode 100755 index e758f460..00000000 --- a/scripts/graphs/xgemm.r +++ /dev/null @@ -1,94 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xgemm routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xgemm" -parameters <- c("-m","-n","-k","-layout","-transA","-transB", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 128", - "multiples of 128 (+1)", - "around m=n=k=512", - "around m=n=k=2048", - "layouts and transposing (m=n=k=1024)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), - list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), - list( - c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) - ), - list( - c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision), - c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision), - c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision), - c( 64, 64, 64, 102, 111, 111, 1, 0, 
num_runs, precision), - c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision), - c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision), - c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), - c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), - c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (m=n=k)", - "matrix sizes (m=n=k)", - "matrix sizes (m=n=k)", - "matrix sizes (m=n=k)", - "layout (row/col), transA (n/y), transB (n/y)", - "matrix sizes (m=n=k)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("m", ""), - c("m", ""), - c("m", ""), - c("m", ""), - list(1:8, c("row,n,n", "row,n,y", "row,y,n", "row,y,y", - "col,n,n", "col,n,y", "col,y,n", "col,y,y")), - c("m", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r deleted file mode 100644 index ef94ef20..00000000 --- a/scripts/graphs/xgemm_small.r +++ /dev/null @@ -1,56 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for small sizes of Xgemm, testing the direct kernel -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xgemm" -parameters <- c("-m","-n","-k","-layout","-transA","-transB", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "small matrices in steps of 16", - "small matrices in steps of 1" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)), - list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision)) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (m=n=k)", - "matrix sizes (m=n=k)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("m", ""), - c("m", "") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xgemv.r b/scripts/graphs/xgemv.r deleted file mode 100644 index 9a8040f7..00000000 --- a/scripts/graphs/xgemv.r +++ /dev/null @@ -1,83 +0,0 @@ - -# ================================================================================================== -# This 
file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xgemv routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xgemv" -parameters <- c("-n","-m","-incx","-incy","-layout", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 256", - "multiples of 256 (+1)", - "around n=m=2K", - "multiples of 256 [rotated]", - "multiples of 256 (+1) [rotated]", - "strides (n=2K)" -) - -# Defines the test-cases -test_values <- list( - list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)), - list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)), - list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 101, 16, 256, num_runs, precision)), - list( - c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision), - c(2*kilo, 
2*kilo, 8, 8, 102, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "vector sizes (n)", - "vector sizes (n)", - "vector sizes (n)", - "vector sizes (n)", - "vector sizes (n)", - "increments/strides for x and y" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("n", ""), - c("n", ""), - c("n", ""), - c("n", ""), - c("n", ""), - list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8")) -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xsymm.r b/scripts/graphs/xsymm.r deleted file mode 100644 index 89d137d2..00000000 --- a/scripts/graphs/xsymm.r +++ /dev/null @@ -1,94 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xsymm routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xsymm" -parameters <- c("-m","-n","-layout","-side","-triangle", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 128", - "multiples of 128 (+1)", - "around m=n=512", - "around m=n=2048", - "layouts and side/triangle (m=n=1024)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)), - list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)), - list( - c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision) - ), - list( - c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision), - c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision), - c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision), - c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision), - c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision), - c( 256, 256, 102, 
141, 121, 1, 0, num_runs, precision), - c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision), - c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision), - c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision), - c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "layout (row/col), side (l/r), triangle (up/lo)", - "matrix sizes (m=n)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("m", ""), - c("m", ""), - c("m", ""), - c("m", ""), - list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo", - "col,l,up", "col,r,up", "col,l,lo", "col,r,lo")), - c("m", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xsyr2k.r b/scripts/graphs/xsyr2k.r deleted file mode 100644 index 4b2dd4a0..00000000 --- a/scripts/graphs/xsyr2k.r +++ /dev/null @@ -1,94 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xsyr2k routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xsyr2k" -parameters <- c("-n","-k","-layout","-triangle","-transA", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 128", - "multiples of 128 (+1)", - "around n=k=512", - "around n=k=1536", - "layouts and transposing (n=k=1024)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), - list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)), - list( - c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) - ), - list( - c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), - c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), - c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), - c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), - c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), - c( 256, 256, 
102, 111, 111, 1, 0, num_runs, precision), - c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), - c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), - c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "layout (row/col), triangle (u/l), transA (n/y)", - "matrix sizes (n=k)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("n", ""), - c("n", ""), - c("n", ""), - c("n", ""), - list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", - "col,u,n", "col,u,y", "col,l,n", "col,l,y")), - c("n", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xsyrk.r b/scripts/graphs/xsyrk.r deleted file mode 100644 index 754c93e2..00000000 --- a/scripts/graphs/xsyrk.r +++ /dev/null @@ -1,94 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xsyrk routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xsyrk" -parameters <- c("-n","-k","-layout","-triangle","-transA", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 128", - "multiples of 128 (+1)", - "around n=k=512", - "around n=k=2048", - "layouts and transposing (n=k=1024)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)), - list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)), - list( - c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision), - c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision) - ), - list( - c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision), - c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision), - c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision), - c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision), - c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision), - c( 256, 256, 102, 
121, 111, 1, 0, num_runs, precision), - c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision), - c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision), - c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision), - c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "matrix sizes (n=k)", - "layout (row/col), triangle (u/l), transA (n/y)", - "matrix sizes (n=k)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("n", ""), - c("n", ""), - c("n", ""), - c("n", ""), - list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", - "col,u,n", "col,u,y", "col,l,n", "col,l,y")), - c("n", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/scripts/graphs/xtrmm.r b/scripts/graphs/xtrmm.r deleted file mode 100644 index c2faaa8b..00000000 --- a/scripts/graphs/xtrmm.r +++ /dev/null @@ -1,127 +0,0 @@ - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
-# -# Author(s): -# Cedric Nugteren -# -# This file implements the performance script for the Xtrmm routine -# -# ================================================================================================== - -# Includes the common functions -args <- commandArgs(trailingOnly = FALSE) -thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) -source(file.path(dirname(thisfile), "common.r")) - -# ================================================================================================== - -# Settings -routine_name <- "xtrmm" -parameters <- c("-m","-n","-layout","-side","-triangle","-transA","-diagonal", - "-num_steps","-step","-runs","-precision") -precision <- 32 - -# Sets the names of the test-cases -test_names <- list( - "multiples of 128", - "multiples of 128 (+1)", - "around m=n=512", - "around m=n=2048", - "layouts and side/triangle (m=n=1024)", - "powers of 2" -) - -# Defines the test-cases -test_values <- list( - list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), - list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), - list( - c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision), - - c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 121, 
112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision), - - c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision), - - c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision) - ), - list( - c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c(1024, 1024, 102, 141, 121, 
111, 131, 1, 0, num_runs, precision), - c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), - c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision) - ) -) - -# Defines the x-labels corresponding to the test-cases -test_xlabels <- list( - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "matrix sizes (m=n)", - "layout (row/col), side (l/r), triangle (up/lo), transA (n/y), diag (u/nu)", - "matrix sizes (m=n)" -) - -# Defines the x-axis of the test-cases -test_xaxis <- list( - c("m", ""), - c("m", ""), - c("m", ""), - c("m", ""), - list(1:32, c("row,l,up,n,u", "row,l,up,n,nu", "row,l,up,y,u", "row,l,up,y,nu", - "row,r,up,n,u", "row,r,up,n,nu", "row,r,up,y,u", "row,r,up,y,nu", - "row,l,lo,n,u", "row,l,lo,n,nu", "row,l,lo,y,u", "row,l,lo,y,nu", - "row,r,lo,n,u", "row,r,lo,n,nu", "row,r,lo,y,u", "row,r,lo,y,nu", - "col,l,up,n,u", "col,l,up,n,nu", "col,l,up,y,u", "col,l,up,y,nu", - "col,r,up,n,u", "col,r,up,n,nu", "col,r,up,y,u", "col,r,up,y,nu", - "col,l,lo,n,u", "col,l,lo,n,nu", "col,l,lo,y,u", "col,l,lo,y,nu", - "col,r,lo,n,u", "col,r,lo,n,nu", "col,r,lo,y,u", "col,r,lo,y,nu")), - c("m", "x") -) - -# ================================================================================================== - -# Start the script -main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, - test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) - -# ================================================================================================== \ No newline at end of file diff --git a/src/cache.cpp b/src/cache.cpp index 6786eaa2..4b74b0a1 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -15,108 +15,116 @@ #include #include +#include "database/database.hpp" #include "cache.hpp" namespace clblast { // ================================================================================================= -// 
Stores the compiled binary or IR in the cache -void StoreBinaryToCache(const std::string &binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Storing binary in cache\n"); - #endif - binary_cache_mutex_.lock(); - binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); - binary_cache_mutex_.unlock(); -} +template +template +Value Cache::Get(const U &key, bool *in_cache) const { + std::lock_guard lock(cache_mutex_); -// Stores the compiled program in the cache -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Storing program in cache\n"); - #endif - program_cache_mutex_.lock(); - program_cache_.push_back(ProgramCache{program, context(), precision, routine_name}); - program_cache_mutex_.unlock(); -} - -// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws -// otherwise. 
-const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Retrieving binary from cache\n"); - #endif - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return cached_binary.binary; +#if __cplusplus >= 201402L + // generalized std::map::find() of C++14 + auto it = cache_.find(key); +#else + // O(n) lookup in a vector + auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair &pair) { + return pair.first == key; + }); +#endif + if (it == cache_.end()) { + if (in_cache) { + *in_cache = false; } + return Value(); } - binary_cache_mutex_.unlock(); - throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none"); + + if (in_cache) { + *in_cache = true; + } + return it->second; } -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws -// otherwise. 
-const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - #ifdef VERBOSE - printf("[DEBUG] Retrieving program from cache\n"); - #endif - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context(), precision, routine_name)) { - program_cache_mutex_.unlock(); - return cached_program.program; - } +template +void Cache::Store(Key &&key, Value &&value) { + std::lock_guard lock(cache_mutex_); + +#if __cplusplus >= 201402L + // emplace() into a map + auto r = cache_.emplace(std::move(key), std::move(value)); + if (!r.second) { + throw LogicError("Cache::Store: object already in cache"); } - program_cache_mutex_.unlock(); - throw LogicError("GetProgramFromCache: Expected program in cache, but found none"); +#else + // emplace_back() into a vector + cache_.emplace_back(std::move(key), std::move(value)); +#endif } -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return true; +template +void Cache::Remove(const Key &key) { + std::lock_guard lock(cache_mutex_); +#if __cplusplus >= 201402L + cache_.erase(key); +#else + auto it = cache_.begin(); + while (it != cache_.end()) { + if ((*it).first == key) { + it = cache_.erase(it); } + else ++it; } - binary_cache_mutex_.unlock(); - return false; +#endif } -// Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context(), precision, 
routine_name)) { - program_cache_mutex_.unlock(); - return true; +template +template +void Cache::RemoveBySubset(const Key &key) { + std::lock_guard lock(cache_mutex_); + auto it = cache_.begin(); + while (it != cache_.end()) { + const auto current_key = (*it).first; + if ((std::get(key) == std::get(current_key)) && + (std::get(key) == std::get(current_key))) { + it = cache_.erase(it); } + else ++it; } - program_cache_mutex_.unlock(); - return false; } +template +void Cache::Invalidate() { + std::lock_guard lock(cache_mutex_); + + cache_.clear(); +} + +template +Cache &Cache::Instance() { + return instance_; +} + +template +Cache Cache::instance_; + // ================================================================================================= -// Clears the cache of stored binaries and programs -void CacheClearAll() { - binary_cache_mutex_.lock(); - binary_cache_.clear(); - binary_cache_mutex_.unlock(); - program_cache_mutex_.lock(); - program_cache_.clear(); - program_cache_mutex_.unlock(); -} +template class Cache; +template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; + +// ================================================================================================= + +template class Cache; +template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; +template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name + +// ================================================================================================= + +template class Cache; +template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast diff --git a/src/cache.hpp b/src/cache.hpp index 9ecb0f1e..694de839 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -15,81 +15,92 @@ #define CLBLAST_CACHE_H_ #include -#include #include +#include #include "utilities/utilities.hpp" namespace clblast 
{ // ================================================================================================= -// The cache of compiled OpenCL binaries, along with some meta-data -struct BinaryCache { - std::string binary; - std::string device_name; - Precision precision; - std::string routine_name_; +// The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not +// normally used by the caller, while the Value is either lightweight or ref-counted. +// Hence, searching by non-Key is supported (if there is a corresponding operator<()), and +// on Store() the Key instance is moved from the caller (because it will likely be constructed +// as temporary at the time of Store()). +template +class Cache { +public: + // Cached object is returned by-value to avoid racing with Invalidate(). + // Due to lack of std::optional<>, in case of a cache miss we return a default-constructed + // Value and set the flag to false. + template + Value Get(const U &key, bool *in_cache) const; - // Finds out whether the properties match - bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, - const std::string &ref_routine) { - return (device_name == ref_device && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; + // We do not return references to just stored object to avoid racing with Invalidate(). + // Caller is expected to store a temporary. 
+ void Store(Key &&key, Value &&value); + void Invalidate(); -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector binary_cache_; -static std::mutex binary_cache_mutex_; + // Removes all entries with a given key + void Remove(const Key &key); + template void RemoveBySubset(const Key &key); // currently supports 2 indices + + static Cache &Instance(); + +private: +#if __cplusplus >= 201402L + // The std::less allows to search in cache by an object comparable with Key, without + // constructing a temporary Key + // (see http://en.cppreference.com/w/cpp/utility/functional/less_void, + // http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm, + // http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind) + std::map> cache_; +#else + std::vector> cache_; +#endif + mutable std::mutex cache_mutex_; + + static Cache instance_; +}; // class Cache // ================================================================================================= -// The cache of compiled OpenCL programs, along with some meta-data -struct ProgramCache { - Program program; - cl_context context; - Precision precision; - std::string routine_name_; +// The key struct for the cache of compiled OpenCL binaries +// Order of fields: precision, routine_name, device_name (smaller fields first) +typedef std::tuple BinaryKey; +typedef std::tuple BinaryKeyRef; - // Finds out whether the properties match - bool MatchInCache(const cl_context ref_context, const Precision &ref_precision, - const std::string &ref_routine) { - return (context == ref_context && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; +typedef Cache BinaryCache; -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector program_cache_; -static std::mutex program_cache_mutex_; +extern template class Cache; +extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) 
const; // ================================================================================================= -// Stores the compiled binary or program in the cache -void StoreBinaryToCache(const std::string &binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name); -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name); +// The key struct for the cache of compiled OpenCL programs (context-dependent) +// Order of fields: context, precision, routine_name (smaller fields first) +typedef std::tuple ProgramKey; +typedef std::tuple ProgramKeyRef; -// Queries the cache and retrieves a matching binary or program. Assumes that the match is -// available, throws otherwise. -const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name); +typedef Cache ProgramCache; -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name); +extern template class Cache; +extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; // ================================================================================================= -// Clears the cache of stored binaries -void CacheClearAll(); +class Database; + +// The key struct for the cache of database maps. 
+// Order of fields: precision, device_name, kernel_name (smaller fields first) +typedef std::tuple DatabaseKey; +typedef std::tuple DatabaseKeyRef; + +typedef Cache DatabaseCache; + +extern template class Cache; +extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast diff --git a/src/clblast.cpp b/src/clblast.cpp index 4bb4e0b3..78548eba 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -15,8 +15,8 @@ #include -#include "clblast.h" #include "cache.hpp" +#include "clblast.h" // BLAS level-1 includes #include "routines/level1/xswap.hpp" @@ -45,6 +45,7 @@ #include "routines/level2/xtrmv.hpp" #include "routines/level2/xtbmv.hpp" #include "routines/level2/xtpmv.hpp" +#include "routines/level2/xtrsv.hpp" #include "routines/level2/xger.hpp" #include "routines/level2/xgeru.hpp" #include "routines/level2/xgerc.hpp" @@ -66,9 +67,12 @@ #include "routines/level3/xsyr2k.hpp" #include "routines/level3/xher2k.hpp" #include "routines/level3/xtrmm.hpp" +#include "routines/level3/xtrsm.hpp" // Level-x includes (non-BLAS) #include "routines/levelx/xomatcopy.hpp" +#include "routines/levelx/xaxpybatched.hpp" +#include "routines/levelx/xgemmbatched.hpp" namespace clblast { @@ -1145,12 +1149,20 @@ template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Tr // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template -StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -2065,15 +2077,24 @@ template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triang cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template -StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -2099,12 +2120,6 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -2160,16 +2175,222 @@ template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XaxpyBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto x_offsets_cpp = std::vector(); + auto y_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer(x_buffer), x_offsets_cpp, x_inc, + Buffer(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const half*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XgemmBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + auto a_offsets_cpp = std::vector(); + auto b_offsets_cpp = std::vector(); + auto c_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + 
alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + a_offsets_cpp.push_back(a_offsets[batch]); + b_offsets_cpp.push_back(b_offsets[batch]); + c_offsets_cpp.push_back(c_offsets[batch]); + } + routine.DoGemmBatched(layout, a_transpose, b_transpose, + m, n, k, + alphas_cpp, + Buffer(a_buffer), a_offsets_cpp, a_ld, + Buffer(b_buffer), b_offsets_cpp, b_ld, + betas_cpp, + Buffer(c_buffer), c_offsets_cpp, c_ld, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float*, + const cl_mem, const size_t*, const size_t, + const cl_mem, const size_t*, const size_t, + const float*, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double*, + const cl_mem, const size_t*, const size_t, + const cl_mem, const size_t*, const size_t, + const double*, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2*, + const cl_mem, const size_t*, const size_t, + const cl_mem, const size_t*, const size_t, + const float2*, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2*, + const cl_mem, const size_t*, const size_t, + const cl_mem, const size_t*, const size_t, + const double2*, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, 
const Transpose, + const size_t, const size_t, const size_t, + const half*, + const cl_mem, const size_t*, const size_t, + const cl_mem, const size_t*, const size_t, + const half*, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // Clears the cache of stored binaries StatusCode ClearCache() { try { - CacheClearAll(); + ProgramCache::Instance().Invalidate(); + BinaryCache::Instance().Invalidate(); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } +template +void FillCacheForPrecision(Queue &queue) { + try { + + // Runs all the level 1 set-up functions + Xswap(queue, nullptr); Xswap(queue, nullptr); + Xswap(queue, nullptr); Xswap(queue, nullptr); + Xscal(queue, nullptr); Xscal(queue, nullptr); + Xcopy(queue, nullptr); Xcopy(queue, nullptr); + Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); + Xdot(queue, nullptr); + Xdotu(queue, nullptr); + Xdotc(queue, nullptr); + Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); + Xasum(queue, nullptr); Xasum(queue, nullptr); + Xsum(queue, nullptr); Xsum(queue, nullptr); + Xamax(queue, nullptr); Xamax(queue, nullptr); + Xmax(queue, nullptr); Xmax(queue, nullptr); + Xmin(queue, nullptr); Xmin(queue, nullptr); + + // Runs all the level 2 set-up functions + Xgemv(queue, nullptr); Xgemv(queue, nullptr); + Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); + Xhemv(queue, nullptr); + Xhbmv(queue, nullptr); + Xhpmv(queue, nullptr); + Xsymv(queue, nullptr); + Xsbmv(queue, nullptr); + Xspmv(queue, nullptr); + Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); + Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); + Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); + Xger(queue, nullptr); + Xgeru(queue, nullptr); + Xgerc(queue, nullptr); + Xher(queue, nullptr); + Xhpr(queue, nullptr); + Xher2(queue, nullptr); + Xhpr2(queue, nullptr); + Xsyr(queue, nullptr); + Xspr(queue, nullptr); + Xsyr2(queue, 
nullptr); + Xspr2(queue, nullptr); + + // Runs all the level 3 set-up functions + Xgemm(queue, nullptr); Xgemm(queue, nullptr); + Xsymm(queue, nullptr); Xsymm(queue, nullptr); + Xhemm(queue, nullptr); + Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); + Xherk(queue, nullptr); + Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); + Xher2k(queue, nullptr); + Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); + + // Runs all the non-BLAS set-up functions + Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); + + } catch(const RuntimeErrorCode &e) { + if (e.status() != StatusCode::kNoDoublePrecision && + e.status() != StatusCode::kNoHalfPrecision) { + throw; + } + } +} + // Fills the cache with all binaries for a specific device // TODO: Add half-precision FP16 set-up calls StatusCode FillCache(const cl_device_id device) { @@ -2180,58 +2401,52 @@ StatusCode FillCache(const cl_device_id device) { auto context = Context(device_cpp); auto queue = Queue(context, device_cpp); - // Runs all the level 1 set-up functions - Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); - Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); - Xscal(queue, nullptr); Xscal(queue, nullptr); Xscal(queue, nullptr); Xscal(queue, nullptr); - Xcopy(queue, nullptr); Xcopy(queue, nullptr); Xcopy(queue, nullptr); Xcopy(queue, nullptr); - Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); - Xdot(queue, nullptr); Xdot(queue, nullptr); - Xdotu(queue, nullptr); Xdotu(queue, nullptr); - Xdotc(queue, nullptr); Xdotc(queue, nullptr); - Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); - Xasum(queue, nullptr); Xasum(queue, nullptr); Xasum(queue, nullptr); Xasum(queue, nullptr); - Xsum(queue, nullptr); Xsum(queue, nullptr); Xsum(queue, nullptr); Xsum(queue, nullptr); - Xamax(queue, nullptr); Xamax(queue, nullptr); Xamax(queue, nullptr); Xamax(queue, nullptr); - 
Xmax(queue, nullptr); Xmax(queue, nullptr); Xmax(queue, nullptr); Xmax(queue, nullptr); - Xmin(queue, nullptr); Xmin(queue, nullptr); Xmin(queue, nullptr); Xmin(queue, nullptr); + FillCacheForPrecision(queue); + FillCacheForPrecision(queue); - // Runs all the level 2 set-up functions - Xgemv(queue, nullptr); Xgemv(queue, nullptr); Xgemv(queue, nullptr); Xgemv(queue, nullptr); - Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); - Xhemv(queue, nullptr); Xhemv(queue, nullptr); - Xhbmv(queue, nullptr); Xhbmv(queue, nullptr); - Xhpmv(queue, nullptr); Xhpmv(queue, nullptr); - Xsymv(queue, nullptr); Xsymv(queue, nullptr); - Xsbmv(queue, nullptr); Xsbmv(queue, nullptr); - Xspmv(queue, nullptr); Xspmv(queue, nullptr); - Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); - Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); - Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); - Xger(queue, nullptr); Xger(queue, nullptr); - Xgeru(queue, nullptr); Xgeru(queue, nullptr); - Xgerc(queue, nullptr); Xgerc(queue, nullptr); - Xher(queue, nullptr); Xher(queue, nullptr); - Xhpr(queue, nullptr); Xhpr(queue, nullptr); - Xher2(queue, nullptr); Xher2(queue, nullptr); - Xhpr2(queue, nullptr); Xhpr2(queue, nullptr); - Xsyr(queue, nullptr); Xsyr(queue, nullptr); - Xspr(queue, nullptr); Xspr(queue, nullptr); - Xsyr2(queue, nullptr); Xsyr2(queue, nullptr); - Xspr2(queue, nullptr); Xspr2(queue, nullptr); + } catch (...) 
{ return DispatchException(); } + return StatusCode::kSuccess; +} - // Runs all the level 3 set-up functions - Xgemm(queue, nullptr); Xgemm(queue, nullptr); Xgemm(queue, nullptr); Xgemm(queue, nullptr); - Xsymm(queue, nullptr); Xsymm(queue, nullptr); Xsymm(queue, nullptr); Xsymm(queue, nullptr); - Xhemm(queue, nullptr); Xhemm(queue, nullptr); - Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); - Xherk(queue, nullptr); Xherk(queue, nullptr); - Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); - Xher2k(queue, nullptr); Xher2k(queue, nullptr); - Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); +// ================================================================================================= - // Runs all the level 3 set-up functions - Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); +// Overrides the tuning parameters for this device-precision-kernel combination +StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map ¶meters) { + try { + + // Retrieves the device name + const auto device_cpp = Device(device); + const auto device_name = device_cpp.Name(); + + // Retrieves the current database values to verify whether the new ones are complete + auto in_cache = false; + const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision, device_name, kernel_name }, &in_cache); + if (!in_cache) { return StatusCode::kInvalidOverrideKernel; } + for (const auto ¤t_param : current_database.GetParameterNames()) { + if (parameters.find(current_param) == parameters.end()) { + return StatusCode::kMissingOverrideParameter; + } + } + + // Clears the existing program & binary cache for routines with the target kernel + const auto routine_names = 
Routine::routines_by_kernel.at(kernel_name); + for (const auto &routine_name : routine_names) { + ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, precision, routine_name}); + BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name}); + } + + // Creates a small custom database based on the provided parameters + const auto database_device = Database::DatabaseDevice{"default", parameters}; + const auto database_vendor = Database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_device}}; + const auto database_entry = Database::DatabaseEntry{kernel_name, precision, {database_vendor}}; + const auto database_entries = std::vector{database_entry}; + const auto database = Database(device_cpp, kernel_name, precision, database_entries); + + // Removes the old database entry and stores the new one in the cache + DatabaseCache::Instance().Remove(DatabaseKey{ precision, device_name, kernel_name }); + DatabaseCache::Instance().Store(DatabaseKey{ precision, device_name, kernel_name }, Database(database)); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 59e4cd16..b6a64749 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -12,12 +12,14 @@ // ================================================================================================= #include +#include +#include "utilities/utilities.hpp" #include "clblast_c.h" #include "clblast.h" -#include "utilities/utilities.hpp" // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; @@ -3349,27 +3351,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid ); } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } } -CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - try { - return static_cast( - clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event) - ); - } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } -} // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -3467,6 +3448,270 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } +// AXPY +CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} + +// GEMM +CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const float *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + } + try { + return static_cast( + clblast::GemmBatched(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alphas_cpp.data(), + a_buffer, a_offsets, a_ld, + b_buffer, b_offsets, b_ld, + betas_cpp.data(), + c_buffer, c_offsets, c_ld, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const double *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + } + try { + return static_cast( + clblast::GemmBatched(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alphas_cpp.data(), + a_buffer, a_offsets, a_ld, + b_buffer, b_offsets, b_ld, + betas_cpp.data(), + c_buffer, c_offsets, c_ld, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_float2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]}); + betas_cpp.push_back(float2{betas[batch].s[0], betas[batch].s[1]}); + } + try { + return static_cast( + clblast::GemmBatched(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alphas_cpp.data(), + a_buffer, a_offsets, a_ld, + b_buffer, b_offsets, b_ld, + betas_cpp.data(), + c_buffer, c_offsets, c_ld, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_double2 *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]}); + betas_cpp.push_back(double2{betas[batch].s[0], betas[batch].s[1]}); + } + try { + return static_cast( + clblast::GemmBatched(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alphas_cpp.data(), + a_buffer, a_offsets, a_ld, + b_buffer, b_offsets, b_ld, + betas_cpp.data(), + c_buffer, c_offsets, c_ld, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const cl_half *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + } + try { + return static_cast( + clblast::GemmBatched(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alphas_cpp.data(), + a_buffer, a_offsets, a_ld, + b_buffer, b_offsets, b_ld, + betas_cpp.data(), + c_buffer, c_offsets, c_ld, + batch_count, + queue, event) + ); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} + // ================================================================================================= // Clears the cache of stored binaries @@ -3484,3 +3729,23 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) { } // ================================================================================================= + +// Overrides the tuning parameters for this device-precision-kernel combination +CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, + const CLBlastPrecision precision, const size_t num_parameters, + const char** parameters_names, const size_t* parameters_values) { + try { + const auto kernel_name_cpp = std::string(kernel_name); + const auto precision_cpp = static_cast(precision); + auto parameters = std::unordered_map(); + for (auto i = size_t{0}; i < num_parameters; ++i) { + const auto parameter_name = std::string(parameters_names[i]); + const auto parameter_value = parameters_values[i]; + parameters[parameter_name] = parameter_value; + } + const auto status = clblast::OverrideParameters(device, kernel_name_cpp, precision_cpp, parameters); + return static_cast(status); + } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } +} + +// ================================================================================================= diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 0383f53a..e0b8cbe9 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -164,6 +164,10 @@ class Platform { platform_ = platforms[platform_id]; } + // Methods to retrieve platform information + std::string Name() const { return GetInfoString(CL_PLATFORM_NAME); } + std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); } + // Returns the number of devices on this platform size_t NumDevices() const { auto result = cl_uint{0}; @@ -175,6 +179,17 @@ class Platform { const cl_platform_id& operator()() const { return platform_; } private: cl_platform_id platform_; + + // Private helper functions + std::string GetInfoString(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr)); + result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters + return result; + } }; // Retrieves a vector with all platforms @@ -333,7 +348,10 @@ class Context { // Regular constructor with memory management explicit Context(const Device &device): - context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) { + context_(new cl_context, [](cl_context* c) { + if (*c) { CheckErrorDtor(clReleaseContext(*c)); } + delete c; + }) { auto status = CL_SUCCESS; const cl_device_id dev = device(); *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); @@ -355,33 +373,37 @@ using ContextPointer = cl_context*; // Enumeration of build statuses of the run-time compilation process enum class BuildStatus { kSuccess, kError, kInvalid }; -// C++11 version of 'cl_program'. 
Additionally holds the program's source code. +// C++11 version of 'cl_program'. class Program { public: - // Note that there is no constructor based on the regular OpenCL data-type because of extra state + Program() = default; // Source-based constructor with memory management - explicit Program(const Context &context, std::string source): - program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), - length_(source.length()), - source_(std::move(source)), - source_ptr_(&source_[0]) { + explicit Program(const Context &context, const std::string &source): + program_(new cl_program, [](cl_program* p) { + if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } + delete p; + }) { + const char *source_ptr = &source[0]; + size_t length = source.length(); auto status = CL_SUCCESS; - *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); + *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); CLError::Check(status, "clCreateProgramWithSource"); } // Binary-based constructor with memory management - explicit Program(const Device &device, const Context &context, const std::string& binary): - program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), - length_(binary.length()), - source_(binary), - source_ptr_(&source_[0]) { + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(new cl_program, [](cl_program* p) { + if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } + delete p; + }) { + const char *binary_ptr = &binary[0]; + size_t length = binary.length(); auto status1 = CL_SUCCESS; auto status2 = CL_SUCCESS; const cl_device_id dev = device(); - *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, - reinterpret_cast(&source_ptr_), + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, + reinterpret_cast(&binary_ptr), &status1, &status2); CLError::Check(status1, 
"clCreateProgramWithBinary (binary status)"); CLError::Check(status2, "clCreateProgramWithBinary"); @@ -421,9 +443,6 @@ class Program { const cl_program& operator()() const { return *program_; } private: std::shared_ptr program_; - size_t length_; - std::string source_; // Note: the source can also be a binary or IR - const char* source_ptr_; }; // ================================================================================================= @@ -440,8 +459,10 @@ class Queue { // Regular constructor with memory management explicit Queue(const Context &context, const Device &device): - queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s)); - delete s; }) { + queue_(new cl_command_queue, [](cl_command_queue* s) { + if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); } + delete s; + }) { auto status = CL_SUCCESS; *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); CLError::Check(status, "clCreateCommandQueue"); @@ -594,9 +615,6 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { - throw LogicError("Buffer: writing to a read-only buffer"); - } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } @@ -665,7 +683,10 @@ class Kernel { // Regular constructor with memory management explicit Kernel(const Program &program, const std::string &name): - kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) { + kernel_(new cl_kernel, [](cl_kernel* k) { + if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } + delete k; + }) { auto status = CL_SUCCESS; *kernel_ = clCreateKernel(program(), name.c_str(), &status); CLError::Check(status, "clCreateKernel"); diff --git a/src/database/apple_cpu_fallback.hpp 
b/src/database/apple_cpu_fallback.hpp new file mode 100644 index 00000000..89ac8f71 --- /dev/null +++ b/src/database/apple_cpu_fallback.hpp @@ -0,0 +1,70 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides overrides for Apple's OpenCL CPU implementation. It is a special case compared +// to all other implementations, as it only supports a 1-dimensional work-group size. In addition, +// that work-group size is limited to 1024 (in theory) or much lower (kernel resource dependent). +// Thus, instead of supporting this corner-case in the whole regular flow (starting from the tuner), +// we provide this file with some manual overrides. +// +// Note: These overrides are to make the Apple CPU work and not crash, they are not in any way +// optimized parameters. For decent speed don't use Apple's OpenCL CPU implementation. 
+// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry XaxpyApple = { + "Xaxpy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW",8}, {"WGS",1}, {"WPT",4} } } } } } +}; +const Database::DatabaseEntry XdotApple = { + "Xdot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WGS2",1} } } } } } +}; +const Database::DatabaseEntry XgemvApple = { + "Xgemv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WPT1",4}, {"UNROLL1", 1} } } } } } +}; +const Database::DatabaseEntry XgemvFastApple = { + "XgemvFast", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW2",1}, {"WGS2",1}, {"WPT2",1} } } } } } +}; +const Database::DatabaseEntry XgemvFastRotApple = { + "XgemvFastRot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW3",1}, {"WGS3",1}, {"WPT3",1} } } } } } +}; +const Database::DatabaseEntry XgerApple = { + "Xger", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } } } } } +}; +const Database::DatabaseEntry XtrsvApple = { + "Xtrsv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRSV_BLOCK_SIZE",32} } } } } } +}; +const Database::DatabaseEntry XgemmApple = { + "Xgemm", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWG",1}, {"KWI",1}, {"MDIMA",1}, {"MDIMC",1}, {"MWG",1}, {"NDIMB",1}, {"NDIMC",1}, {"NWG",1}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } } } } } +}; +const Database::DatabaseEntry XgemmDirectApple = { + "XgemmDirect", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWID",1}, {"MDIMAD",1}, {"MDIMCD",1}, {"NDIMBD",1}, {"NDIMCD",1}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, 
{"WGD",1} } } } } } +}; +const Database::DatabaseEntry CopyApple = { + "Copy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"COPY_DIMX",1}, {"COPY_DIMY",1}, {"COPY_VW",1}, {"COPY_WPT",1} } } } } } +}; +const Database::DatabaseEntry PadApple = { + "Pad", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PAD_DIMX",1}, {"PAD_DIMY",1}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } } } } } +}; +const Database::DatabaseEntry TransposeApple = { + "Transpose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRA_DIM",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } } } } } +}; +const Database::DatabaseEntry PadtransposeApple = { + "Padtranspose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",1}, {"PADTRA_WPT",1} } } } } } +}; +const Database::DatabaseEntry InvertApple = { + "Invert", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"INTERNAL_BLOCK_SIZE",16} } } } } } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/database/database.cpp b/src/database/database.cpp index cf548d46..404be804 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -11,6 +11,8 @@ // // ================================================================================================= +#include + #include "utilities/utilities.hpp" #include "database/database.hpp" @@ -20,35 +22,47 @@ #include "database/kernels/xgemv_fast.hpp" #include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" +#include "database/kernels/xtrsv.hpp" #include "database/kernels/xgemm.hpp" #include "database/kernels/xgemm_direct.hpp" #include "database/kernels/copy.hpp" #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernels/invert.hpp" +#include 
"database/apple_cpu_fallback.hpp" #include "database/kernel_selection.hpp" namespace clblast { // ================================================================================================= -// Initializes the database -const std::vector Database::database = { - &database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble, - &database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble, - &database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble, - &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble, - &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble, - &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble, - &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble, - &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble, - &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble, - &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble, - &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble, - &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble, - 
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble +// Initializes the databases +const std::vector Database::database = std::vector{ + database::XaxpyHalf, database::XaxpySingle, database::XaxpyDouble, database::XaxpyComplexSingle, database::XaxpyComplexDouble, + database::XdotHalf, database::XdotSingle, database::XdotDouble, database::XdotComplexSingle, database::XdotComplexDouble, + database::XgemvHalf, database::XgemvSingle, database::XgemvDouble, database::XgemvComplexSingle, database::XgemvComplexDouble, + database::XgemvFastHalf, database::XgemvFastSingle, database::XgemvFastDouble, database::XgemvFastComplexSingle, database::XgemvFastComplexDouble, + database::XgemvFastRotHalf, database::XgemvFastRotSingle, database::XgemvFastRotDouble, database::XgemvFastRotComplexSingle, database::XgemvFastRotComplexDouble, + database::XgerHalf, database::XgerSingle, database::XgerDouble, database::XgerComplexSingle, database::XgerComplexDouble, + database::XtrsvHalf, database::XtrsvSingle, database::XtrsvDouble, database::XtrsvComplexSingle, database::XtrsvComplexDouble, + database::XgemmHalf, database::XgemmSingle, database::XgemmDouble, database::XgemmComplexSingle, database::XgemmComplexDouble, + database::XgemmDirectHalf, database::XgemmDirectSingle, database::XgemmDirectDouble, database::XgemmDirectComplexSingle, database::XgemmDirectComplexDouble, + database::CopyHalf, database::CopySingle, database::CopyDouble, database::CopyComplexSingle, database::CopyComplexDouble, + database::PadHalf, database::PadSingle, database::PadDouble, database::PadComplexSingle, database::PadComplexDouble, + database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble, + database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, 
database::PadtransposeComplexSingle, database::PadtransposeComplexDouble, + database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble, + database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble +}; +const std::vector Database::apple_cpu_fallback = std::vector{ + database::XaxpyApple, database::XdotApple, + database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple, + database::XgemmApple, database::XgemmDirectApple, + database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple, + database::InvertApple }; -// The OpenCL device vendors +// The default values const std::string Database::kDeviceVendorAll = "default"; // Alternative names for some OpenCL vendors @@ -63,12 +77,11 @@ const std::unordered_map Database::kVendorNames{ // Constructor, computing device properties and populating the parameter-vector from the database. // This takes an optional overlay database in case of custom tuning or custom kernels. 
-Database::Database(const Queue &queue, const std::vector &kernels, - const Precision precision, const std::vector &overlay): - parameters_{} { +Database::Database(const Device &device, const std::string &kernel_name, + const Precision precision, const std::vector &overlay): + parameters_(std::make_shared()) { // Finds information of the current device - auto device = queue.GetDevice(); auto device_type = device.Type(); auto device_vendor = device.Vendor(); auto device_name = device.Name(); @@ -80,20 +93,31 @@ Database::Database(const Queue &queue, const std::vector &kernels, } } - // Iterates over all kernels to include, and retrieves the parameters for each of them - for (auto &kernel: kernels) { - auto search_result = ParametersPtr{}; + // Sets the databases to search through + auto databases = std::list>{overlay, database}; - for (auto &db: { database, overlay}) { - search_result = Search(kernel, device_type, device_vendor, device_name, precision, db); - if (search_result) { - parameters_.insert(search_result->begin(), search_result->end()); - break; + // Special case: modifies the database if the device is a CPU with Apple OpenCL + #if defined(__APPLE__) || defined(__MACOSX) + if (device.Type() == "CPU") { + auto extensions = device.Capabilities(); + const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") == std::string::npos) ? 
false : true; + if (is_apple) { + databases.push_front(apple_cpu_fallback); } } + #endif - if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } + // Searches potentially multiple databases + auto search_result = ParametersPtr{}; + for (auto &db: databases) { + search_result = Search(kernel_name, device_type, device_vendor, device_name, precision, db); + if (search_result) { + parameters_->insert(search_result->begin(), search_result->end()); + break; + } } + + if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } } // ================================================================================================= @@ -101,12 +125,21 @@ Database::Database(const Queue &queue, const std::vector &kernels, // Returns a list of OpenCL pre-processor defines in string form std::string Database::GetDefines() const { std::string defines{}; - for (auto &parameter: parameters_) { + for (auto &parameter: *parameters_) { defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; } return defines; } +// Retrieves the names of all the parameters +std::vector Database::GetParameterNames() const { + auto parameter_names = std::vector(); + for (auto &parameter: *parameters_) { + parameter_names.push_back(parameter.first); + } + return parameter_names; +} + // ================================================================================================= // Searches a particular database for the right kernel and precision @@ -115,15 +148,16 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel, const std::string &this_type, const std::string &this_vendor, const std::string &this_device, const Precision this_precision, - const std::vector &this_database) const { + const std::vector &this_database) const { // Selects the right kernel for (auto &db: this_database) { - if (db->kernel == this_kernel && db->precision == this_precision) { + if ((db.kernel == this_kernel) && + (db.precision == this_precision || db.precision == Precision::kAny)) { //
Searches for the right vendor and device type, or selects the default if unavailable. This // assumes that the default vendor / device type is last in the database. - for (auto &vendor: db->vendors) { + for (auto &vendor: db.vendors) { if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) && (vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) { diff --git a/src/database/database.hpp b/src/database/database.hpp index 7c05a20b..82fbc252 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -70,27 +70,61 @@ class Database { static const std::unordered_map kVendorNames; // The database consists of separate database entries, stored together in a vector - static const std::vector database; + static const std::vector database; + + // Database for a special case: Apple CPUs support limited number of threads + static const std::vector apple_cpu_fallback; + + Database() = default; // The constructor with a user-provided database overlay (potentially an empty vector) - explicit Database(const Queue &queue, const std::vector &routines, - const Precision precision, const std::vector &overlay); + explicit Database(const Device &device, const std::string &kernel_name, + const Precision precision, const std::vector &overlay); // Accessor of values by key - size_t operator[](const std::string key) const { return parameters_.find(key)->second; } + size_t operator[](const std::string &key) const { return parameters_->find(key)->second; } + bool exists(const std::string &key) const { return (parameters_->count(key) == 1); } // Obtain a list of OpenCL pre-processor defines based on the parameters std::string GetDefines() const; + // Retrieves the names of all the parameters + std::vector GetParameterNames() const; + private: // Search method for a specified database, returning pointer (possibly a nullptr) ParametersPtr Search(const std::string &this_kernel, const std::string &this_type, const std::string &this_vendor, const 
std::string &this_device, const Precision this_precision, - const std::vector &db) const; + const std::vector &db) const; // Found parameters suitable for this device/kernel - Parameters parameters_; + std::shared_ptr parameters_; +}; + +// ================================================================================================= + +// Multiple databases together in a map +class Databases { + public: + + explicit Databases(const std::vector &kernel_names): kernel_names_(kernel_names) { } + + // Database accessor + Database& operator()(const std::string &kernel_name) { return databases_[kernel_name]; } + + // Retrieves a parameter from the database + size_t operator[](const std::string &key) const { + for (const auto &kernel_name : kernel_names_) { + const auto &kernel_db = databases_.find(kernel_name)->second; + if (kernel_db.exists(key)) { return kernel_db[key]; } + } + throw RuntimeErrorCode(StatusCode::kDatabaseError); + } + + private: + const std::vector kernel_names_; + std::unordered_map databases_; }; // ================================================================================================= diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp index f91b7e8f..0ef6d9a0 100644 --- a/src/database/kernel_selection.hpp +++ b/src/database/kernel_selection.hpp @@ -22,13 +22,12 @@ const Database::DatabaseEntry KernelSelectionHalf = { "KernelSelection", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } }, } }, { // Default @@ -45,13 +44,12 @@ const Database::DatabaseEntry KernelSelectionSingle = { "KernelSelection", Precision::kSingle, { { 
// Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } }, } }, { // Default @@ -68,13 +66,12 @@ const Database::DatabaseEntry KernelSelectionComplexSingle = { "KernelSelection", Precision::kComplexSingle, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } }, } }, { // Default @@ -91,13 +88,12 @@ const Database::DatabaseEntry KernelSelectionDouble = { "KernelSelection", Precision::kDouble, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } }, } }, { // Default @@ -114,13 +110,12 @@ const Database::DatabaseEntry KernelSelectionComplexDouble = { "KernelSelection", Precision::kComplexDouble, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, - { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - 
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } }, } }, { // Default diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 1bc63691..89d5a3a6 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry CopyHalf = { "Copy", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry CopyHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, } }, } @@ -39,12 +45,15 @@ const Database::DatabaseEntry CopySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "ATI Radeon HD 6750M", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, 
{"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -55,10 +64,12 @@ const Database::DatabaseEntry CopySingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, } }, { // Intel GPUs @@ -83,6 +94,7 @@ const Database::DatabaseEntry CopySingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, @@ -92,9 +104,10 @@ const Database::DatabaseEntry CopySingle = { { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, 
{"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // Default @@ -112,18 +125,23 @@ const Database::DatabaseEntry CopyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "ATI Radeon HD 6750M", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { 
"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } @@ -150,6 +168,7 @@ const Database::DatabaseEntry CopyComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -157,14 +176,15 @@ const Database::DatabaseEntry CopyComplexSingle = { { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, 
{"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, } @@ -177,12 +197,13 @@ const Database::DatabaseEntry CopyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, } }, { // ARM GPUs @@ -193,10 +214,12 @@ const Database::DatabaseEntry CopyDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -209,6 +232,7 
@@ const Database::DatabaseEntry CopyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, @@ -218,14 +242,15 @@ const Database::DatabaseEntry CopyDouble = { { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, } @@ -238,6 +263,7 @@ const Database::DatabaseEntry CopyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Hawaii", { 
{"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -254,10 +280,12 @@ const Database::DatabaseEntry CopyComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -270,6 +298,7 @@ const Database::DatabaseEntry CopyComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } }, + { "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -279,6 +308,7 @@ const Database::DatabaseEntry CopyComplexDouble = { { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN 
Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/invert.hpp b/src/database/kernels/invert.hpp new file mode 100644 index 00000000..2717f182 --- /dev/null +++ b/src/database/kernels/invert.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// Tuning parameters for the diagonal matrix inversion kernels +// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry InvertHalf = { + "Invert", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertSingle = { + "Invert", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexSingle = { + "Invert", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertDouble = { + "Invert", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexDouble = { + "Invert", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/pad.hpp 
b/src/database/kernels/pad.hpp index 310d3a78..c3bce0e3 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry PadHalf = { "Pad", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, @@ -39,12 +45,15 @@ const Database::DatabaseEntry PadSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "ATI Radeon HD 6750M", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Turks", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -55,8 +64,10 @@ const Database::DatabaseEntry PadSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 
2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } @@ -83,6 +94,7 @@ const Database::DatabaseEntry PadSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, @@ -92,9 +104,10 @@ const Database::DatabaseEntry PadSingle = { { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, } }, { // Default @@ -112,12 +125,15 @@ 
const Database::DatabaseEntry PadComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "ATI Radeon HD 6750M", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Turks", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -128,10 +144,12 @@ const Database::DatabaseEntry PadComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs @@ -156,6 +174,7 
@@ const Database::DatabaseEntry PadComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -165,6 +184,7 @@ const Database::DatabaseEntry PadComplexSingle = { { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ -172,7 +192,7 @@ const Database::DatabaseEntry PadComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, } @@ -185,12 +205,13 @@ const Database::DatabaseEntry PadDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, 
{"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -201,8 +222,10 @@ const Database::DatabaseEntry PadDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } @@ -217,6 +240,7 @@ const Database::DatabaseEntry PadDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, @@ 
-226,6 +250,7 @@ const Database::DatabaseEntry PadDouble = { { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -246,6 +271,7 @@ const Database::DatabaseEntry PadComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -262,10 +288,12 @@ const Database::DatabaseEntry PadComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { 
{"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -278,6 +306,7 @@ const Database::DatabaseEntry PadComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, + { "GeForce GTX 1080", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -287,6 +316,7 @@ const Database::DatabaseEntry PadComplexDouble = { { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 8ef09e85..94bac195 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry PadtransposeHalf = { "Padtranspose", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } 
}, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, @@ -39,11 +45,14 @@ const Database::DatabaseEntry PadtransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Ellesmere", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Turks", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } }, @@ -55,8 +64,10 @@ const Database::DatabaseEntry PadtransposeSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } @@ -83,6 +94,7 @@ const Database::DatabaseEntry PadtransposeSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID 
K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -92,6 +104,7 @@ const Database::DatabaseEntry PadtransposeSingle = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, @@ -112,11 +125,14 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Turks", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { 
{"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, @@ -128,10 +144,12 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } }, { // Intel GPUs @@ -156,6 +174,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -165,6 +184,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, 
{"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -185,6 +205,7 @@ const Database::DatabaseEntry PadtransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, @@ -201,8 +222,10 @@ const Database::DatabaseEntry PadtransposeDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } @@ -217,6 +240,7 @@ const Database::DatabaseEntry PadtransposeDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, 
{"PADTRA_WPT",1} } }, @@ -226,6 +250,7 @@ const Database::DatabaseEntry PadtransposeDouble = { { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -233,7 +258,7 @@ const Database::DatabaseEntry PadtransposeDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, } @@ -246,6 +271,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, @@ -262,10 +288,12 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, 
{"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -278,6 +306,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, @@ -287,6 +316,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = { { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 23fecb49..8567c725 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry TransposeHalf = { "Transpose", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"TRA_DIM",4}, 
{"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry TransposeHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, } @@ -39,12 +45,15 @@ const Database::DatabaseEntry TransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "ATI Radeon HD 6750M", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -55,8 +64,10 @@ const Database::DatabaseEntry TransposeSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, 
{"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } @@ -83,6 +94,7 @@ const Database::DatabaseEntry TransposeSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "GeForce GTX 1080", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -92,6 +104,7 @@ const Database::DatabaseEntry TransposeSingle = { { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, @@ -112,12 +125,15 @@ const Database::DatabaseEntry TransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { 
{"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "ATI Radeon HD 6750M", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Turks", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -128,8 +144,10 @@ const Database::DatabaseEntry TransposeComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } @@ -150,6 +168,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, 
{"TRA_WPT",1} } }, + { "GeForce GTX 1080", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -159,6 +178,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, @@ -166,7 +186,7 @@ const Database::DatabaseEntry TransposeComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -179,6 +199,7 @@ const Database::DatabaseEntry TransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -195,10 +216,12 @@ const 
Database::DatabaseEntry TransposeDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel accelerators @@ -211,6 +234,7 @@ const Database::DatabaseEntry TransposeDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -220,6 +244,7 @@ const Database::DatabaseEntry TransposeDouble = { { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K20m", { 
{"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, @@ -240,6 +265,7 @@ const Database::DatabaseEntry TransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -256,16 +282,19 @@ const Database::DatabaseEntry TransposeComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} 
} }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -275,6 +304,7 @@ const Database::DatabaseEntry TransposeComplexDouble = { { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 52845e96..f31e2f6e 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XaxpyHalf = { "Xaxpy", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, + { "default", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry XaxpyHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",256}, {"WPT",4} } }, } }, } @@ -39,12 +45,15 @@ const Database::DatabaseEntry XaxpySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, 
{"WPT",1} } }, + { "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, + { "Ellesmere", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, + { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // ARM GPUs @@ -55,10 +64,12 @@ const Database::DatabaseEntry XaxpySingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, } }, { // Intel GPUs @@ -83,6 +94,7 @@ const Database::DatabaseEntry XaxpySingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -92,9 +104,10 @@ const Database::DatabaseEntry XaxpySingle = { { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "TITAN X (Pascal)", { 
{"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, } }, { // Default @@ -112,11 +125,14 @@ const Database::DatabaseEntry XaxpyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, + { "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Ellesmere", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, + { "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, @@ -128,8 +144,10 @@ const Database::DatabaseEntry XaxpyComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, } @@ -156,6 +174,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", 
{ {"VW",1}, {"WGS",256}, {"WPT",1} } }, @@ -165,6 +184,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = { { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, @@ -185,6 +205,7 @@ const Database::DatabaseEntry XaxpyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Ellesmere", { {"VW",2}, {"WGS",64}, {"WPT",4} } }, { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -201,10 +222,12 @@ const Database::DatabaseEntry XaxpyDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, { // Intel accelerators @@ -217,6 +240,7 @@ const Database::DatabaseEntry XaxpyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, 
{"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -226,14 +250,15 @@ const Database::DatabaseEntry XaxpyDouble = { { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, } @@ -246,6 +271,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Ellesmere", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, @@ -262,8 +288,10 @@ const Database::DatabaseEntry XaxpyComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, } @@ -278,6 +306,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, 
{"WGS",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -287,6 +316,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, { "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "TITAN X (Pascal)", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -294,7 +324,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, } }, } diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 8b07c539..2bc7eeba 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XdotHalf = { "Xdot", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, @@ -39,17 +45,22 @@ const Database::DatabaseEntry XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",32} } }, + { "Ellesmere", { {"WGS1",128}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, 
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } }, + { "Turks", { {"WGS1",128}, {"WGS2",64} } }, { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Intel GPUs @@ -67,6 +78,7 @@ const Database::DatabaseEntry XdotSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } }, + { "GeForce GTX 1080", { {"WGS1",512}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, @@ -75,13 +87,14 @@ const Database::DatabaseEntry XdotSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",1024}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",256}, {"WGS2",256} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, } @@ -94,17 +107,22 @@ const Database::DatabaseEntry XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",256} } }, + { "Ellesmere", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, { "Tonga", { 
{"WGS1",256}, {"WGS2",64} } }, - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "Turks", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",64} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel GPUs @@ -122,6 +140,7 @@ const Database::DatabaseEntry XdotComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, @@ -130,13 +149,14 @@ const Database::DatabaseEntry XdotComplexSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -149,6 +169,7 @@ const Database::DatabaseEntry XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Ellesmere", { {"WGS1",128}, {"WGS2",64} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, @@ -158,14 +179,17 @@ const Database::DatabaseEntry XdotDouble = { }, { // Intel 
CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WGS2",128} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",512}, {"WGS2",64} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",128} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, @@ -174,8 +198,9 @@ const Database::DatabaseEntry XdotDouble = { { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",128} } }, } }, { // Default @@ -193,6 +218,7 @@ const Database::DatabaseEntry XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Ellesmere", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, @@ -202,14 +228,17 @@ const Database::DatabaseEntry XdotComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",128} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WGS2",32} } }, + { "default", 
{ {"WGS1",128}, {"WGS2",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, @@ -218,13 +247,14 @@ const Database::DatabaseEntry XdotComplexDouble = { { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, + { "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",64} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 66ac8a9f..49e86edc 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemmHalf = { "Xgemm", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, @@ -38,12 +44,15 @@ const Database::DatabaseEntry XgemmSingle 
= { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, + { "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",8} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "Turks", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, 
{"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // ARM GPUs @@ -54,10 +63,12 @@ const Database::DatabaseEntry XgemmSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // Intel GPUs @@ -82,6 +93,7 @@ const Database::DatabaseEntry XgemmSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, 
{"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, @@ -91,9 +103,10 @@ const Database::DatabaseEntry XgemmSingle = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, 
{"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, } }, { // Default @@ -111,12 +124,15 @@ const Database::DatabaseEntry XgemmComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, 
{"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, + { "Turks", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, { // ARM GPUs @@ -127,10 +143,12 @@ const Database::DatabaseEntry XgemmComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, 
{"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, } }, { // Intel GPUs @@ -155,6 +173,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 1080", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, @@ -164,6 +183,7 @@ const Database::DatabaseEntry XgemmComplexSingle = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, 
{"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -184,6 +204,7 @@ const Database::DatabaseEntry XgemmDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, 
{"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, @@ -200,10 +221,12 @@ const Database::DatabaseEntry XgemmDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, - { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, } }, { // Intel accelerators @@ -216,6 +239,7 @@ const Database::DatabaseEntry 
XgemmDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, @@ -225,14 +249,15 @@ const Database::DatabaseEntry XgemmDouble = { { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, 
{"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, } }, } @@ -245,12 +270,13 @@ const Database::DatabaseEntry XgemmComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Oland", { {"KWG",16}, 
{"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -261,10 +287,12 @@ const Database::DatabaseEntry XgemmComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, 
{"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, { // Intel accelerators @@ -277,6 +305,7 @@ const Database::DatabaseEntry XgemmComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, 
{"VWN",1} } }, @@ -285,14 +314,15 @@ const Database::DatabaseEntry XgemmComplexDouble = { { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, 
{"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index 4413cf1b..29a3fb19 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, @@ -25,7 +31,7 @@ const Database::DatabaseEntry XgemmDirectHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, } @@ -38,8 +44,18 @@ const Database::DatabaseEntry XgemmDirectSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "ATI Radeon HD 6750M", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "Ellesmere", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, 
{"WGD",32} } }, { "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",8}, {"WGD",64} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",64} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, } }, { // Intel GPUs @@ -51,9 +67,11 @@ const Database::DatabaseEntry XgemmDirectSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, + { "TITAN X (Pascal)", { {"KWID",8}, 
{"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, } }, { // Default @@ -71,10 +89,19 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "ATI Radeon HD 6750M", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, @@ 
-84,14 +111,16 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, } @@ -104,20 +133,30 @@ const Database::DatabaseEntry XgemmDirectDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "Ellesmere", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, 
{"WGD",16} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, } }, } @@ -130,20 +169,30 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, 
{"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "Ellesmere", { {"KWID",16}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } }, + } + }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, { "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } }, + { "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { 
"default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 5f25f210..d56f9acc 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvHalf = { "Xgemv", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, @@ -39,18 +45,23 @@ const Database::DatabaseEntry XgemvSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } }, + { "ATI Radeon HD 6750M", { {"WGS1",32}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",256}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, { "Tonga", { {"WGS1",128}, {"WPT1",2} } }, + { "Turks", { {"WGS1",32}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, @@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvSingle = { { "Intel(R) HD Graphics IvyBridge M GT2", { 
{"WGS1",256}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WPT1",1} } }, { "Iris", { {"WGS1",64}, {"WPT1",2} } }, - { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, + { "Iris Pro", { {"WGS1",128}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, @@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } }, @@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvSingle = { { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, { "default", { {"WGS1",256}, {"WPT1",1} } }, @@ -105,19 +118,24 @@ const Database::DatabaseEntry XgemvComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "ATI Radeon HD 6750M", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, { "Oland", { {"WGS1",64}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",64}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",64}, {"WPT1",1} } }, { "Tonga", { {"WGS1",32}, {"WPT1",1} } }, + { "Turks", { {"WGS1",64}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} 
} }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",2} } }, } }, { // Intel GPUs @@ -142,6 +160,7 @@ const Database::DatabaseEntry XgemvComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } }, @@ -149,6 +168,7 @@ const Database::DatabaseEntry XgemvComplexSingle = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, @@ -167,6 +187,7 @@ const Database::DatabaseEntry XgemvDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, @@ -177,8 +198,10 @@ const Database::DatabaseEntry XgemvDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, @@ -192,6 +215,7 @@ const Database::DatabaseEntry XgemvDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 
1080", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } }, @@ -201,6 +225,7 @@ const Database::DatabaseEntry XgemvDouble = { { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, @@ -221,6 +246,7 @@ const Database::DatabaseEntry XgemvComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Ellesmere", { {"WGS1",32}, {"WPT1",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, @@ -231,8 +257,10 @@ const Database::DatabaseEntry XgemvComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WPT1",4} } }, { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 994a220c..2ab6265e 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvFastHalf = { "XgemvFast", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, 
"Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, @@ -39,19 +45,24 @@ const Database::DatabaseEntry XgemvFastSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "ATI Radeon HD 6750M", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } }, + { "Turks", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",2}, {"WGS2",16}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel GPUs @@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvFastSingle = { { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "Iris Pro", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, { "default", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, } }, @@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvFastSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, { "GeForce GTX 1070", { 
{"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, @@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvFastSingle = { { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, @@ -105,18 +118,23 @@ const Database::DatabaseEntry XgemvFastComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "ATI Radeon HD 6750M", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, + { "Turks", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",4}, {"WGS2",16}, {"WPT2",4} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, } }, @@ -163,6 
+181,7 @@ const Database::DatabaseEntry XgemvFastDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -173,8 +192,10 @@ const Database::DatabaseEntry XgemvFastDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, } }, @@ -188,6 +209,7 @@ const Database::DatabaseEntry XgemvFastDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1080", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, @@ -197,6 +219,7 @@ const Database::DatabaseEntry XgemvFastDouble = { { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, @@ -217,6 +240,7 @@ const Database::DatabaseEntry 
XgemvFastComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Ellesmere", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -227,9 +251,11 @@ const Database::DatabaseEntry XgemvFastComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",4}, {"WGS2",32}, {"WPT2",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",2} } }, + { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index da8bcfeb..b234e27f 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgemvFastRotHalf = { "XgemvFastRot", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, @@ -38,14 +44,19 @@ const Database::DatabaseEntry XgemvFastRotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "Tonga", { {"VW3",8}, 
{"WGS3",128}, {"WPT3",32} } }, + { "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } }, { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel GPUs @@ -54,21 +65,23 @@ const Database::DatabaseEntry XgemvFastRotSingle = { { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, - { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, + { "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, } @@ -81,14 +94,19 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",32}, {"WPT3",8} } 
}, + { "Ellesmere", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, - { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Intel GPUs @@ -103,7 +121,7 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } @@ -116,21 +134,26 @@ const Database::DatabaseEntry XgemvFastRotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, { "GeForce GTX 
TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, { "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, } }, @@ -149,19 +172,22 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 5e2be6a9..5a61f0df 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -17,6 +17,12 @@ namespace database { const Database::DatabaseEntry XgerHalf = { "Xger", Precision::kHalf, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + } + }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, @@ -26,7 +32,7 @@ const Database::DatabaseEntry XgerHalf = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } }, + { 
"default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, } }, } @@ -39,12 +45,15 @@ const Database::DatabaseEntry XgerSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",4} } }, + { "Ellesmere", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } }, } }, { // ARM GPUs @@ -55,7 +64,9 @@ const Database::DatabaseEntry XgerSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, { "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } }, } @@ -75,6 +86,7 @@ const Database::DatabaseEntry XgerSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, @@ -82,6 +94,7 @@ const Database::DatabaseEntry XgerSingle = { { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "GeForce GTX TITAN", { 
{"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",512}, {"WGS2",2}, {"WPT",1} } }, { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, @@ -100,12 +113,15 @@ const Database::DatabaseEntry XgerComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -116,9 +132,11 @@ const Database::DatabaseEntry XgerComplexSingle = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } }, } }, { // Intel GPUs @@ -136,6 +154,7 @@ const Database::DatabaseEntry XgerComplexSingle = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, { 
"GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, @@ -143,6 +162,7 @@ const Database::DatabaseEntry XgerComplexSingle = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, } }, @@ -161,12 +181,13 @@ const Database::DatabaseEntry XgerDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, } }, { // ARM GPUs @@ -177,15 +198,18 @@ const Database::DatabaseEntry XgerDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, 
{"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, @@ -193,7 +217,8 @@ const Database::DatabaseEntry XgerDouble = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, { // Default @@ -211,6 +236,7 @@ const Database::DatabaseEntry XgerComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Ellesmere", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, @@ -227,7 +253,9 @@ const Database::DatabaseEntry XgerComplexDouble = { }, { // Intel CPUs kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",4}, {"WPT",4} } }, { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",512}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, } @@ -236,6 +264,7 @@ const Database::DatabaseEntry XgerComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } }, + { "GeForce GTX 1080", { {"WGS1",8}, {"WGS2",4}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX 670", { 
{"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, @@ -243,6 +272,7 @@ const Database::DatabaseEntry XgerComplexDouble = { { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, + { "TITAN X (Pascal)", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } }, diff --git a/src/database/kernels/xtrsv.hpp b/src/database/kernels/xtrsv.hpp new file mode 100644 index 00000000..0741569e --- /dev/null +++ b/src/database/kernels/xtrsv.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels. 
+// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry XtrsvHalf = { + "Xtrsv", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvSingle = { + "Xtrsv", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexSingle = { + "Xtrsv", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvDouble = { + "Xtrsv", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexDouble = { + "Xtrsv", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index b0817242..db4c8ec4 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -31,9 +31,7 @@ R"( // Enable support 
for double-precision #if PRECISION == 64 || PRECISION == 6464 - #if __OPENCL_VERSION__ <= CL_VERSION_1_1 - #pragma OPENCL EXTENSION cl_khr_fp64: enable - #endif + #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif // Half-precision @@ -71,7 +69,7 @@ R"( // Complex single-precision #elif PRECISION == 3232 - typedef struct cfloat {float x; float y;} real; + typedef float2 real; typedef struct cfloat2 {real x; real y;} real2; typedef struct cfloat4 {real x; real y; real z; real w;} real4; typedef struct cfloat8 {real s0; real s1; real s2; real s3; @@ -86,7 +84,7 @@ R"( // Complex double-precision #elif PRECISION == 6464 - typedef struct cdouble {double x; double y;} real; + typedef double2 real; typedef struct cdouble2 {real x; real y;} real2; typedef struct cdouble4 {real x; real y; real z; real w;} real4; typedef struct cdouble8 {real s0; real s1; real s2; real s3; @@ -162,6 +160,13 @@ R"( #define AbsoluteValue(value) value = fabs(value) #endif +// Negation (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define Negate(value) value.x = -(value.x); value.y = -(value.y) +#else + #define Negate(value) value = -(value) +#endif + // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y @@ -169,6 +174,13 @@ R"( #define Add(c, a, b) c = a + b #endif +// Subtracts two complex variables +#if PRECISION == 3232 || PRECISION == 6464 + #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y +#else + #define Subtract(c, a, b) c = a - b +#endif + // Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y @@ -193,6 +205,20 @@ R"( #endif #endif +// The scalar multiply-subtract function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b) +#else + #define MultiplySubtract(c, a, b) c -= a * b +#endif + +// The scalar division function: full 
division +#if PRECISION == 3232 || PRECISION == 6464 + #define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom +#else + #define DivideFull(c, a, b) c = a / b +#endif + // The scalar AXPBY function #if PRECISION == 3232 || PRECISION == 6464 #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index ece8476e..d30d4e55 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -9,7 +9,7 @@ // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. +// support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. // @@ -36,14 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } -// ================================================================================================= +// Faster version of the kernel without offsets and strided accesses but with if-statement. Also +// assumes that 'n' is dividable by 'VW' and 'WPT'. 
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFaster(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { + const real alpha = GetRealArg(arg_alpha); + + if (get_global_id(0) < n / (VW)) { + #pragma unroll + for (int w=0; w +// +// This file contains kernels to perform forward or backward substitution, as used in the TRSV routine +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRSV) + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void FillVector(const int n, const int inc, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int tid = get_global_id(0); + if (tid < n) { + dest[tid*inc + offset] = value; + } +} + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library.
+ +#ifndef TRSV_BLOCK_SIZE + #define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substitution +#endif + +// ================================================================================================= + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_forward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid = get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_backward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid =
get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = n - 1; i >= 0; --i) { + for (int j = i + 1; j < n; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 29480b25..93b89187 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,16 +24,14 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -void CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { - const real alpha = GetRealArg(arg_alpha); +inline void _CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loops over the work per thread in both dimensions #pragma unroll @@ -60,22 +58,36 @@ void CopyPadMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); + _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + // ================================================================================================= // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -void CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = GetRealArg(arg_alpha); +inline void _CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loops over the work per thread in both dimensions #pragma unroll @@ -105,6 +117,62 @@ void CopyMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); + _CopyMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, upper, lower, diagonal_imag_zero); +} + +// ================================================================================================= +#if defined(ROUTINE_GEMMBATCHED) + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const 
int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest, + const int do_conjugate) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + _CopyMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, 0, 0, 0); +} + +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl new file mode 100644 index 00000000..55f4a963 --- /dev/null +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -0,0 +1,431 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to invert squared diagonal blocks of a matrix. 
These kernels are based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= +// +// Let A be an block_size*block_size lower triangular matrix, and B its inverse. +// Then the block decomposition +// +// [ A11 0 ] * [ B11 0 ] = [ I 0 ] +// [ A21 A22 ] [ B21 B22 ] [ 0 I ] +// +// yields +// +// A11*B11 = I ==> B11 = A11^{-1}, +// A22*B22 = I ==> B22 = A22^{-1}, +// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11. +// +// The InvertDiagonalBlock kernel inverts A11 and A22. +// The TripleMatMul routines multiply: +// part 1: B21 = A21 * B11, +// part 2: B21 = -B22 * B21. +// +// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each +// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2 +// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block, +// here current_size*2=32, +// [ B11 0 ] +// which contains [ B21 B22 ]. +// Outer blocks are block_size x block_size. +// +// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on +// the right. This makes a single check easy to do. +// +// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed. +// +// We split this into part1 & part2 to synchronize all blocks and make sure +// that writes to B12 are observed by all blocks. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. 
+R"( + +// ================================================================================================= +#if defined(ROUTINE_INVERT) + +#define LOCALX 17 // 16 + 1 to avoid bank conflicts +#define LOCALY 16 + +// ================================================================================================= + +// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix +__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, + __global real* restrict dest, const int outer_block_size, + const int unit_diagonal, const int is_upper) +{ + const int thread_index = get_local_id(0); + const int block_index = get_group_id(0); + + // Sets the offset for this particular block in the source and destination matrices + const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; + const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; + const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block, + (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + + // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE + __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // Loads the source lower triangle into local memory. Any values in the upper triangle or + // outside of the matrix are set to zero + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + const bool condition = (is_upper) ? 
(thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) : + (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + if (condition) { + lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset]; + } + else { + SetToZero(lm[thread_index][j]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Inverts the diagonal + real inverted_diagonal; + SetToOne(inverted_diagonal); + if (unit_diagonal == 0) { + const real diagonal_value = lm[thread_index][thread_index]; + if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix + real constant_one; + SetToOne(constant_one); + DivideFull(inverted_diagonal, constant_one, diagonal_value); + } + } + lm[thread_index][thread_index] = inverted_diagonal; + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper-triangular + if (is_upper) { + + // Computes the elements 0:j-1 of the j-th column + for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) { + if (thread_index < j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = 0; k < j; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + real diagonal_value = lm[j][j]; + Negate(diagonal_value); + Multiply(lm[thread_index][j], diagonal_value, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Lower triangular + else { + + // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column + for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) { + if (thread_index > j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + real diagonal_value = lm[j][j]; + Negate(diagonal_value); + Multiply(lm[thread_index][j], diagonal_value, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Writes the result to global memory + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; + } +} + +// 
================================================================================================= + +// Triple matrix-multiplication kernel: C = A * B +inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int by = get_group_id(1) / num_pages; + const int page = get_group_id(1) % num_pages; + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int iby = by*16; + const int id = lidx + lidy*get_local_size(0); + const int row = page*current_size*2 + current_size + ibx + id; + int col = page*current_size*2 + current_size; + + // Sets the offsets for this specific thread + agm += ibx + id; + bgm += lidx + (iby + lidy)*ldb; + cgm += ibx + id + iby*ldc; + + // Initializes the result registers + real cpm[16]; + #pragma unroll + for (int j = 0; j < 16; ++j) { + SetToZero(cpm[j]); + } + + // Computes NT x 16 block of C, each thread computes one 1 x 16 row + for (int k = 0; k < current_size; k += 16) { + + // Loads a 16 x 16 block of B into local memory using NX x 4 threads + #pragma unroll + for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0) + #pragma unroll + for( int j=0; j < 16; j += 4 ) { // += get_local_size(1) + blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper triangular + if (upper) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2 || col++ < n) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + // Lower triangular + else { + if (row < n) { + + // Performs 16 x 16 
multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores NT x 16 results: each thread writes one 16 x 1 row + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2) { Negate(cpm[i]); } + cgm[0] = cpm[i]; + cgm += ldc; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) +inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = A12 * B22 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12 + bgm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size*block_size; // B12 + } + else { // lower triangular: B21 = A21 * B11 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21 + bgm = dest; // B11 + cgm = dest + current_size; // B21 + } + + // Runs the generic 
C = A * B matrix multiplication
+  const int ldb = block_size;
+  const int ldc = block_size;
+  TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
+
+// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
+inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
+                              __global real* dest, int current_size, int num_pages, const int block_size) {
+
+  // Emulates a 3D grid: NX * (NY * num_pages)
+  const int page = get_group_id(1) % num_pages;
+
+  // Computes the destination block offset:
+  // - go to the (page / pages_per_block) outer block_size * block_size block
+  // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
+  const int pages_per_block = block_size / (current_size*2);
+  dest += (page / pages_per_block) * block_size * block_size +
+          (page % pages_per_block) * (current_size*2*block_size + current_size*2);
+
+  // Using the GEMM notation: C = A*B
+  __global const real* agm;
+  __global const real* bgm;
+  __global real* cgm;
+  if (upper) { // upper triangular: B12 = -B11 * B12
+    agm = dest; // B11
+    cgm = dest + current_size*block_size; // B12
+    bgm = cgm; // B12, okay to overwrite
+  }
+
+  else { // lower triangular: B21 = -B22 * B21
+    agm = dest + current_size*block_size + current_size; // B22
+    cgm = dest + current_size; // B21
+    bgm = cgm; // B21, okay to overwrite
+  }
+
+  // Runs the generic C = A * B matrix multiplication
+  const int lda = block_size;
+  const int ldb = block_size;
+  const int ldc = block_size;
+  TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
+
+// =================================================================================================
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int 
a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + 
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + 
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl index bf14ab12..5ba8cf29 100644 --- a/src/kernels/level3/level3.opencl +++ b/src/kernels/level3/level3.opencl @@ -73,6 +73,22 @@ R"( #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts #endif +// ================================================================================================= +#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM) + +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) +void FillMatrix(const int m, const int n, const int ld, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int id_one = get_global_id(0); + const int id_two = get_global_id(1); + if (id_one < m && id_two < n) { + dest[id_two*ld + id_one + offset] = value; + } +} + +#endif + // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index ba0b7062..fb60ce75 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,19 +24,15 @@ R"( // Transposes a matrix from source to destination. 
The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -void TransposePadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { - const real alpha = GetRealArg(arg_alpha); - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; +inline void _TransposePadMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loop over the work per thread #pragma unroll @@ -56,7 +52,9 @@ void TransposePadMatrix(const int src_one, const int src_two, if (id_src_two < src_two && id_src_one < src_one) { value = src[id_src_two*src_ld + id_src_one + src_offset]; } - tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; + const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two; + tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } @@ -75,7 +73,9 @@ void TransposePadMatrix(const int src_one, const int src_two, // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two; + real value = 
tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } @@ -83,25 +83,38 @@ void TransposePadMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + // ================================================================================================= // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. 
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -void TransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = GetRealArg(arg_alpha); - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; +inline void _TransposeMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loop over the work per thread #pragma unroll @@ -117,7 +130,9 @@ void TransposeMatrix(const int src_one, const int src_two, // Loads data into the local memory if the thread IDs are within bounds of the source matrix. 
if ((id_src_one < src_one) && (id_src_two < src_two)) { real value = src[id_src_two*src_ld + id_src_one + src_offset]; - tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; + const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two; + tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } } @@ -145,7 +160,9 @@ void TransposeMatrix(const int src_one, const int src_two, // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two; + real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } @@ -154,6 +171,65 @@ void TransposeMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, upper, lower, diagonal_imag_zero); +} + +// 
================================================================================================= +#if defined(ROUTINE_GEMMBATCHED) + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest, + const int do_conjugate) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, 0, 0, 0); +} + +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl new file mode 100644 
index 00000000..c7bf10d5 --- /dev/null +++ b/src/kernels/level3/xgemm_batched.opencl @@ -0,0 +1,70 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the batched version of the non-direct GEMM kernel. See part 1 for information +// about the non-batched version of the kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main entry point of the kernel. This is the regular full version. 
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, + const __constant real_arg* arg_betas, + const __global realM* restrict agm, const int a_one, const int a_two, + const __global realN* restrict bgm, const int b_one, const int b_two, + __global realM* cgm, const int c_one, const int c_two) { + const int batch = get_group_id(2); + const real alpha = GetRealArg(arg_alphas[batch]); + const real beta = GetRealArg(arg_betas[batch]); + + // Sets the offsets + const int a_offset = batch * a_one * a_two; + const int b_offset = batch * b_one * b_two; + const int c_offset = batch * c_one * c_two; + const __global realM* restrict agm_ = &agm[a_offset / VWM]; + const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; + __global realM* restrict cgm_ = &cgm[c_offset / VWM]; + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm_, cpm, kSizeM, alpha, beta); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_batched.opencl 
b/src/kernels/level3/xgemm_direct_batched.opencl new file mode 100644 index 00000000..fa582cff --- /dev/null +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -0,0 +1,110 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the batched version of the direct GEMM kernels. See part 1 for information +// about the non-batched version of the kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = 
c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int 
c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// 
================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index d77cbf65..fc09307e 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; // Loads the data from global memory into the local memory - const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset]; + const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)]; #if VWMD == 1 alm[kg*(WGD + PADA) + mg] = avec; #elif VWMD == 2 @@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg; // Loads the data from global memory into the local memory - const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset]; + const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)]; #if VWND == 1 blm[kg*(WGD + PADB) + ng] = bvec; #elif VWND == 2 diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index a9350e00..c04cdeb8 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { // Loads data: off-chip --> local (matrix A and B) - if (a_ld % VWMD == 0) { + if (a_ld % VWMD == 0 && a_offset % VWMD == 0) { GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); } else { GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); } - if (b_ld % VWND == 0) { + if (b_ld % VWND == 0 && b_offset % VWND == 0) { GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); } 
else { diff --git a/src/routine.cpp b/src/routine.cpp index acafb0d2..cb39c7ee 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -21,22 +21,75 @@ namespace clblast { // ================================================================================================= +// For each kernel this map contains a list of routines it is used in +const std::vector Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"}; +const std::vector Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"}; +const std::vector Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"}; +const std::vector Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV", "TRSV"}; +const std::vector Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"}; +const std::vector Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"}; +const std::vector Routine::routines_trsm = {"TRSM"}; +const std::unordered_map> Routine::routines_by_kernel = { + {"Xaxpy", routines_axpy}, + {"Xdot", routines_dot}, + {"Xgemv", routines_gemv}, + {"XgemvFast", routines_gemv}, + {"XgemvFastRot", routines_gemv}, + {"Xtrsv", routines_gemv}, + {"Xger", routines_ger}, + {"Copy", routines_gemm_syrk}, + {"Pad", routines_gemm_syrk}, + {"Transpose", routines_gemm_syrk}, + {"Padtranspose", routines_gemm_syrk}, + {"Xgemm", routines_gemm_syrk}, + {"XgemmDirect", routines_gemm}, + {"KernelSelection", routines_gemm}, + {"Invert", routines_trsm}, +}; +// ================================================================================================= + // The constructor does all heavy work, errors are returned as exceptions Routine::Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision, - const std::vector &userDatabase, + const std::vector &kernel_names, const Precision 
precision, + const std::vector &userDatabase, std::initializer_list source): precision_(precision), routine_name_(name), + kernel_names_(kernel_names), queue_(queue), event_(event), context_(queue_.GetContext()), device_(queue_.GetDevice()), device_name_(device_.Name()), - db_(queue_, routines, precision_, userDatabase) { + db_(kernel_names) { + + InitDatabase(userDatabase); + InitProgram(source); +} + +void Routine::InitDatabase(const std::vector &userDatabase) { + for (const auto &kernel_name : kernel_names_) { + + // Queries the cache to see whether or not the kernel parameter database is already there + bool has_db; + db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, kernel_name }, + &has_db); + if (has_db) { continue; } + + // Builds the parameter database for this device and routine set and stores it in the cache + db_(kernel_name) = Database(device_, kernel_name, precision_, userDatabase); + DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, kernel_name }, + Database{ db_(kernel_name) }); + } +} + +void Routine::InitProgram(std::initializer_list source) { // Queries the cache to see whether or not the program (context-specific) is already there - if (ProgramIsInCache(context_, precision_, routine_name_)) { return; } + bool has_program; + program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), precision_, routine_name_ }, + &has_program); + if (has_program) { return; } // Sets the build options from an environmental variable (if set) auto options = std::vector(); @@ -47,33 +100,36 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, // Queries the cache to see whether or not the binary (device-specific) is already there. 
If it // is, a program is created and stored in the cache - if (BinaryIsInCache(device_name_, precision_, routine_name_)) { - auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); - auto program = Program(device_, context_, binary); - program.Build(device_, options); - StoreProgramToCache(program, context_, precision_, routine_name_); + bool has_binary; + auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name_ }, + &has_binary); + if (has_binary) { + program_ = Program(device_, context_, binary); + program_.Build(device_, options); + ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ }, + Program{ program_ }); + return; } // Otherwise, the kernel will be compiled and program will be built. Both the binary and the // program will be added to the cache. // Inspects whether or not cl_khr_fp64 is supported in case of double precision - const auto extensions = device_.Capabilities(); - if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { - if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { - throw RuntimeErrorCode(StatusCode::kNoDoublePrecision); - } + if ((precision_ == Precision::kDouble && !PrecisionSupported(device_)) || + (precision_ == Precision::kComplexDouble && !PrecisionSupported(device_))) { + throw RuntimeErrorCode(StatusCode::kNoDoublePrecision); } // As above, but for cl_khr_fp16 (half precision) - if (precision_ == Precision::kHalf) { - if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { - throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); - } + if (precision_ == Precision::kHalf && !PrecisionSupported(device_)) { + throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); } // Collects the parameters for this device in the form of defines, and adds the precision - auto source_string = db_.GetDefines(); + auto source_string = std::string{""}; + for (const auto &kernel_name : kernel_names_) { + 
source_string += db_(kernel_name).GetDefines(); + } source_string += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; // Adds the name of the routine as a define @@ -114,21 +170,23 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, #endif // Compiles the kernel - auto program = Program(context_, source_string); + program_ = Program(context_, source_string); try { - program.Build(device_, options); + program_.Build(device_, options); } catch (const CLError &e) { if (e.status() == CL_BUILD_PROGRAM_FAILURE) { fprintf(stdout, "OpenCL compiler error/warning: %s\n", - program.GetBuildInfo(device_).c_str()); + program_.GetBuildInfo(device_).c_str()); } throw; } // Store the compiled binary and program in the cache - const auto binary = program.GetIR(); - StoreBinaryToCache(binary, device_name_, precision_, routine_name_); - StoreProgramToCache(program, context_, precision_, routine_name_); + BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name_ }, + program_.GetIR()); + + ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ }, + Program{ program_ }); // Prints the elapsed compilation time in case of debugging in verbose mode #ifdef VERBOSE diff --git a/src/routine.hpp b/src/routine.hpp index 2d8b2415..903ccdb1 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -18,6 +18,7 @@ #include #include +#include #include "utilities/utilities.hpp" #include "cache.hpp" @@ -35,18 +36,39 @@ class Routine { // Base class constructor. The user database is an optional extra database to override the // built-in database. // All heavy preparation work is done inside this constructor. + // NOTE: the caller must provide the same userDatabase for each combination of device, precision + // and routine list, otherwise the caching logic will break. 
explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector &routines, const Precision precision, - const std::vector &userDatabase, + const std::vector &userDatabase, std::initializer_list source); + // List of kernel-routine look-ups + static const std::vector routines_axpy; + static const std::vector routines_dot; + static const std::vector routines_ger; + static const std::vector routines_gemv; + static const std::vector routines_gemm; + static const std::vector routines_gemm_syrk; + static const std::vector routines_trsm; + static const std::unordered_map> routines_by_kernel; + + private: + + // Initializes program_, fetching cached program or building one + void InitProgram(std::initializer_list source); + + // Initializes db_, fetching cached database or building one + void InitDatabase(const std::vector &userDatabase); + protected: // Non-static variable for the precision const Precision precision_; - // The routine's name + // The routine's name and the corresponding kernels const std::string routine_name_; + const std::vector kernel_names_; // The OpenCL objects, accessible only from derived classes Queue queue_; @@ -57,8 +79,11 @@ class Routine { // OpenCL device properties const std::string device_name_; + // Compiled program (either retrieved from cache or compiled in slow path) + Program program_; + // Connection to the database for all the device-specific parameters - const Database db_; + Databases db_; }; // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 53ca6355..28a43da5 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -19,8 +19,8 @@ #include #include -#include "clblast.h" #include "clpp11.hpp" +#include "clblast.h" #include "database/database.hpp" namespace clblast { @@ -33,11 +33,52 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // 
================================================================================================= +// Sets all elements of a matrix to a constant value +template +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Databases &, + EventPointer event, const std::vector &waitForEvents, + const size_t m, const size_t n, const size_t ld, const size_t offset, + const Buffer &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast(m)); + kernel.SetArgument(1, static_cast(n)); + kernel.SetArgument(2, static_cast(ld)); + kernel.SetArgument(3, static_cast(offset)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, GetRealArg(constant_value)); + auto local = std::vector{8, 8}; + auto global = std::vector{Ceil(m, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// Sets all elements of a vector to a constant value +template +void FillVector(Queue &queue, const Device &device, + const Program &program, const Databases &, + EventPointer event, const std::vector &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, static_cast(inc)); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector{64}; + auto global = std::vector{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. 
template void PadCopyTransposeMatrix(Queue &queue, const Device &device, - const Database &db, + const Databases &db, EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, @@ -155,6 +196,70 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } } +// Batched version of the above +template +void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, + const Databases &db, + EventPointer event, const std::vector &waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const Buffer &src_offsets, + const Buffer &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const Buffer &dest_offsets, + const Buffer &dest, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const size_t batch_count) { + + // Determines the right kernel + auto kernel_name = std::string{}; + if (do_transpose) { + kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched"; + } + else { + kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched"; + } + + // Retrieves the kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(src_one)); + kernel.SetArgument(1, static_cast(src_two)); + kernel.SetArgument(2, static_cast(src_ld)); + kernel.SetArgument(3, src_offsets()); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast(dest_one)); + kernel.SetArgument(6, static_cast(dest_two)); + kernel.SetArgument(7, static_cast(dest_ld)); + kernel.SetArgument(8, dest_offsets()); + kernel.SetArgument(9, dest()); + if (do_pad) { + kernel.SetArgument(10, static_cast(do_conjugate)); + } + + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database. 
+ if (do_transpose) { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + batch_count + }; + const auto local = std::vector{db["PADTRA_TILE"], db["PADTRA_TILE"], 1}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]), + batch_count + }; + const auto local = std::vector{db["PAD_DIMX"], db["PAD_DIMY"], 1}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } +} + // ================================================================================================= } // namespace clblast diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index e9efa1a7..40a66517 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -43,9 +43,8 @@ void Xamax::DoAmax(const size_t n, TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); + auto kernel1 = Kernel(program_, "Xamax"); + auto kernel2 = Kernel(program_, "XamaxEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index a242a5fa..b93b271c 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -43,9 +43,8 @@ void Xasum::DoAsum(const size_t n, TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, 
"XasumEpilogue"); + auto kernel1 = Kernel(program_, "Xasum"); + auto kernel2 = Kernel(program_, "XasumEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5436c5b7..0e588d99 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -44,19 +44,21 @@ void Xaxpy::DoAxpy(const size_t n, const T alpha, TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WPT"]*db_["VW"]); + const auto use_fastest_kernel = use_faster_kernel && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; + const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" : + (use_faster_kernel) ? 
"XaxpyFaster" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments - if (use_fast_kernel) { + if (use_faster_kernel || use_fastest_kernel) { kernel.SetArgument(0, static_cast(n)); kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); @@ -74,13 +76,18 @@ void Xaxpy::DoAxpy(const size_t n, const T alpha, } // Launches the kernel - if (use_fast_kernel) { + if (use_fastest_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; RunKernel(kernel, queue_, device_, global, local, event_); } + else if (use_faster_kernel) { + auto global = std::vector{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])}; + auto local = std::vector{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; RunKernel(kernel, queue_, device_, global, local, event_); diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index d86200c0..62889764 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -52,8 +52,7 @@ void Xcopy::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? 
"XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index 9d718913..9f9c0590 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -46,9 +46,8 @@ void Xdot::DoDot(const size_t n, TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); + auto kernel1 = Kernel(program_, "Xdot"); + auto kernel2 = Kernel(program_, "XdotEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 373820a4..aa341aff 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -43,9 +43,8 @@ void Xnrm2::DoNrm2(const size_t n, TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + auto kernel1 = Kernel(program_, "Xnrm2"); + auto kernel2 = Kernel(program_, "Xnrm2Epilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index 0521b1e5..9bc096e5 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -49,8 +49,7 @@ void Xscal::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? 
"XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index c9b97dc9..f046575f 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -52,8 +52,7 @@ void Xswap::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 7b4c2e8f..b7e8081b 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,9 +22,10 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue(), {}, { + Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } @@ -69,14 +70,14 @@ void Xgemv::MatVec(const Layout layout, const Transpose a_transpose, if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrix has an alternative layout (row or column-major). - auto a_altlayout = (layout == Layout::kRowMajor); + const auto a_altlayout = (layout == Layout::kRowMajor); auto a_one = (a_altlayout) ? 
n : m; - auto a_two = (a_altlayout) ? m : n; + const auto a_two = (a_altlayout) ? m : n; // Swap m and n if the matrix is transposed - auto a_transposed = (a_transpose != Transpose::kNo); - auto m_real = (a_transposed) ? n : m; - auto n_real = (a_transposed) ? m : n; + const auto a_transposed = (a_transpose != Transpose::kNo); + const auto m_real = (a_transposed) ? n : m; + const auto n_real = (a_transposed) ? m : n; // Special adjustments for banded matrices if (kl != 0 || ku != 0) { @@ -84,10 +85,10 @@ void Xgemv::MatVec(const Layout layout, const Transpose a_transpose, } // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) - auto a_rotated = a_transposed ^ a_altlayout; + const auto a_rotated = a_transposed ^ a_altlayout; // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); + const auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity if (packed) { TestMatrixAP(n, a_buffer, a_offset); } @@ -106,8 +107,8 @@ void Xgemv::MatVec(const Layout layout, const Transpose a_transpose, IsMultiple(a_ld, db_["VW3"]); // If possible, run the fast-version (rotated or non-rotated) of the kernel - auto kernel_name = "Xgemv"; - auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); + auto kernel_name = std::string{"Xgemv"}; + const auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); auto global_size = m_ceiled / db_["WPT1"]; auto local_size = db_["WGS1"]; if (fast_kernel) { @@ -122,8 +123,7 @@ void Xgemv::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_real)); diff --git 
a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index d16ebd11..9ec156a1 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -53,8 +53,7 @@ void Xger::DoGer(const Layout layout, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xger"); + auto kernel = Kernel(program_, "Xger"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(a_one)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6c334e63..ba12a3ef 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -67,8 +67,7 @@ void Xher::DoHer(const Layout layout, const Triangle triangle, const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xher"); + auto kernel = Kernel(program_, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 11e2c871..a420e693 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -54,8 +54,7 @@ void Xher2::DoHer2(const Layout layout, const Triangle triangle, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xher2"); + auto kernel = Kernel(program_, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp index f4a58ed2..117d26e0 100644 --- a/src/routines/level2/xtbmv.cpp +++ b/src/routines/level2/xtbmv.cpp @@ -52,9 +52,9 @@ void Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, 
auto fast_kernels = false; try { MatVec(layout, a_transpose, - n, n, static_cast(1), + n, n, ConstantOne(), a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast(0), + scratch_buffer, x_offset, x_inc, ConstantZero(), x_buffer, x_offset, x_inc, fast_kernels, fast_kernels, parameter, false, k, 0); diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp index c0d26699..00282378 100644 --- a/src/routines/level2/xtpmv.cpp +++ b/src/routines/level2/xtpmv.cpp @@ -52,9 +52,9 @@ void Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, auto fast_kernels = false; try { MatVec(layout, a_transpose, - n, n, static_cast(1), + n, n, ConstantOne(), ap_buffer, ap_offset, n, - scratch_buffer, x_offset, x_inc, static_cast(0), + scratch_buffer, x_offset, x_inc, ConstantZero(), x_buffer, x_offset, x_inc, fast_kernels, fast_kernels, parameter, true, 0, 0); diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp index 5fff9b31..80e29009 100644 --- a/src/routines/level2/xtrmv.cpp +++ b/src/routines/level2/xtrmv.cpp @@ -52,9 +52,9 @@ void Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, auto fast_kernels = false; try { MatVec(layout, a_transpose, - n, n, static_cast(1), + n, n, ConstantOne(), a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast(0), + scratch_buffer, x_offset, x_inc, ConstantZero(), x_buffer, x_offset, x_inc, fast_kernels, fast_kernels, parameter, false, 0, 0); diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..d5d009ff --- /dev/null +++ b/src/routines/level2/xtrsv.cpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsv class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xtrsv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrsv::Xtrsv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +template +void Xtrsv::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 
1 : 0; + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program_, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, static_cast(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast(b_offset)); + kernel.SetArgument(6, static_cast(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast(x_offset)); + kernel.SetArgument(9, static_cast(x_inc)); + kernel.SetArgument(10, static_cast(is_transposed)); + kernel.SetArgument(11, static_cast(is_unit_diagonal)); + kernel.SetArgument(12, static_cast(do_conjugate)); + + // Launches the kernel + const auto local = std::vector{db_["TRSV_BLOCK_SIZE"]}; + const auto global = std::vector{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + +// ================================================================================================= + +// The main routine +template +void Xtrsv::DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the matrix and vector + TestMatrixA(n, n, a_buffer, a_offset, a_ld); + TestVectorX(n, b_buffer, b_offset, b_inc); + + // Creates a copy of B to avoid overwriting input while 
computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto x_buffer = Buffer(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero()); + fill_vector_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) || + (layout != Layout::kColMajor && a_transpose != Transpose::kNo)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) { + const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i); + + // Sets the next column position + col = (is_upper) ? col - block_size : i; + + // Sets the offsets for upper or lower triangular + const auto extra_offset_a = (is_transposed) ? + (is_upper ? col + (col+block_size)*a_ld : col) : + (is_upper ? col+block_size + col*a_ld : col*a_ld); + const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i; + const auto gemv_n = (a_transpose == Transpose::kNo) ? 
i : block_size; + DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } + + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..67e626a1 --- /dev/null +++ b/src/routines/level2/xtrsv.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSV_H_ +#define CLBLAST_ROUTINES_XTRSV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrsv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::queue_; + using Xgemv::context_; + using Xgemv::device_; + using Xgemv::db_; + using Xgemv::program_; + using Xgemv::DoGemv; + + // Constructor + Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); + + // Templated-precision implementation of the routine + void DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t offset_x, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSV_H_ +#endif diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 4f70dc7a..658b22d0 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -33,10 +33,11 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_symmetric.opencl" 
#include "../../kernels/level3/convert_triangular.opencl" #include "../../kernels/level3/convert_hermitian.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" - , // separated in two parts to prevent C1091 in MSVC 2013 + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" @@ -103,19 +104,19 @@ void Xgemm::DoGemm(const Layout layout, // Selects which version of GEMM to run const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); if (do_gemm_direct) { // for small sizes (single kernel) - return GemmDirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); } else { // for larger sizes (pre/post-processing plus a very fast kernel) - return GemmIndirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, - a_one, a_two, a_want_rotated, - b_one, b_two, b_want_rotated, - c_one, c_two, c_want_rotated); + GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated); } } @@ -126,16 +127,16 @@ void Xgemm::DoGemm(const Layout layout, // 
overhead of these extra kernels might not be ideal for certain devices/arguments. template void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate, - const size_t a_one, const size_t a_two, const bool a_want_rotated, - const size_t b_one, const size_t b_two, const bool b_want_rotated, - const size_t c_one, const size_t c_two, const bool c_want_rotated) { + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated) { // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -150,9 +151,6 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? 
m_ceiled : n_ceiled; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; @@ -178,7 +176,7 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne(), program, + ConstantOne(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -189,7 +187,7 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne(), program, + ConstantOne(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -200,13 +198,13 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary - auto kernel = Kernel(program, "Xgemm"); + auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_ceiled)); @@ -236,7 +234,7 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne(), program, + 
ConstantOne(), program_, false, c_do_transpose, false); } } @@ -247,21 +245,18 @@ void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. template void Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate) { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate) { // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? 
"XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); + auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m)); diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index e5b1502a..8629f3de 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -58,8 +58,7 @@ void Xhemm::DoHemm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast(k)); diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 2385706e..7c011915 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -30,6 +30,7 @@ class Xhemm: public Xgemm { using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; + using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ee3bb8b8..2aed2781 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -81,9 +81,6 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false && ab_conjugate == false; @@ -116,7 +113,7 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); } @@ -125,7 +122,7 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); } @@ -134,7 +131,7 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); } @@ -143,7 +140,7 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); } @@ -154,12 +151,12 @@ void Xher2k::DoHer2k(const 
Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); @@ -201,7 +198,7 @@ void Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, + ConstantOne(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index ae8e9324..d982859e 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -79,9 +79,6 @@ void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false && a_conjugate == false; @@ -109,7 +106,7 @@ void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, + ConstantOne(), program_, true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -118,7 +115,7 @@ void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne(), program, + ConstantOne(), program_, true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -129,12 +126,12 @@ void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); @@ -163,7 +160,7 @@ void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - 
ConstantOne(), program, + ConstantOne(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index d7f771d1..969edfc8 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -30,12 +30,12 @@ Xsymm::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template void Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } @@ -58,8 +58,7 @@ void Xsymm::DoSymm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast(k)); diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index ee965364..7a584560 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -32,6 +32,7 @@ class Xsymm: public Xgemm { using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; + using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; 
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..7900eb74 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne(), program, + ConstantOne(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = 
Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); @@ -152,7 +149,7 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran eventWaitList.push_back(eventKernel1); // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast(1); + auto one = ConstantOne(); kernel.SetArgument(3, GetRealArg(one)); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); @@ -168,7 +165,7 @@ void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, + ConstantOne(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, + ConstantOne(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, + ConstantOne(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index ed810e72..26ef2a5e 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -70,8 +70,7 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the triangular 
matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast(k)); @@ -102,7 +101,7 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian alpha, temp_triangular, 0, k, b_buffer_copy, b_offset, b_ld, - static_cast(0.0), + ConstantZero(), b_buffer, b_offset, b_ld); } @@ -114,7 +113,7 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian alpha, b_buffer_copy, b_offset, b_ld, temp_triangular, 0, k, - static_cast(0.0), + ConstantZero(), b_buffer, b_offset, b_ld); } catch (BLASError &e) { // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 967bf132..e77b7214 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -31,6 +31,7 @@ class Xtrmm: public Xgemm { using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; + using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..4378ca94 --- /dev/null +++ b/src/routines/level3/xtrsm.cpp @@ -0,0 +1,227 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the triangular matrix solver (A * X = B) TRSM class. 
This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra and the OpenCL implementation in clBLAS. +// +// ================================================================================================= + +#include "routines/level3/xtrsm.hpp" +#include "routines/levelx/xinvert.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrsm::Xtrsm(Queue &queue, EventPointer event, const std::string &name): + Xgemm(queue, event, name) { +} + +// ================================================================================================= + +// The entry point: transforming into col-major (if needed) and then running the col-major version +template +void Xtrsm::DoTrsm(const Layout layout, Side side, Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + size_t m, size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Converts row-major to a col-major problem: + // The idea is that + // B = A*X + // can be computed as + // B' = (A*X)' = X'*A' + // Since changing the order is basically a transpose on each matrix, the formula becomes: + // B = X*A + // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped + if (layout == Layout::kRowMajor) { + std::swap(m, n); + side = (side == Side::kLeft) ? Side::kRight : Side::kLeft; + triangle = (triangle == Triangle::kLower) ? 
Triangle::kUpper : Triangle::kLower; + } + + // Runs the col-major version of TRSM + TrsmColMajor(side, triangle, a_transpose, diagonal, + m, n, alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld); +} + +// ================================================================================================= + +// The main routine +template +void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Settings + constexpr auto block_size = size_t{32}; // tuneable + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + const auto k = (side == Side::kLeft) ? 
m : n; + + // Checks for validity of the triangular A matrix + TestMatrixA(k, k, a_buffer, a_offset, a_ld); + + // Checks for validity of the input B matrix + TestMatrixB(m, n, b_buffer, b_offset, b_ld); + + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_size = b_ld * (n - 1) + m + b_offset; + const auto x_one = m; + const auto x_two = n; + const auto x_size = b_size; + const auto x_ld = b_ld; + const auto x_offset = b_offset; + auto x_buffer = Buffer(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Temporary buffer for the inverse of the A matrix + const auto a_inv_size = Ceil(k, block_size) * block_size; + auto a_inv_buffer = Buffer(context_, a_inv_size); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, + x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero()); + fill_matrix_event.WaitForCompletion(); + + // Inverts the diagonal blocks + auto diagonal_invert_event = Event(); + auto inverter = Xinvert(queue_, diagonal_invert_event.pointer()); + inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal, + k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); + diagonal_invert_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); + + // Left side + if (side == Side::kLeft) { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + for (auto i = size_t{0}; i < m; i += block_size) { + const auto gemm_alpha = (i == 0) ? 
alpha : ConstantOne(); + const auto current_block_size = std::min(m - i, block_size); + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, b_offset + i, b_ld, ConstantZero(), + x_buffer, x_offset + i, x_ld); + if (i + block_size >= m) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + m - i - block_size, n, block_size, ConstantNegOne(), + a_buffer, this_a_offset, a_ld, + x_buffer, x_offset + i, x_ld, ConstantOne(), + b_buffer, b_offset + i + block_size, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size); + const auto i_start = static_cast(m) - static_cast(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, b_offset + i, b_ld, ConstantZero(), + x_buffer, x_offset + i, x_ld); + if (i - static_cast(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i; + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + i, n, block_size, ConstantNegOne(), + a_buffer, this_a_offset, a_ld, + x_buffer, x_offset + i, x_ld, ConstantOne(), + b_buffer, b_offset, b_ld); + } + } + } + + // Right side + else { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + const auto current_block_size = (n % block_size == 0) ? 
block_size : (n % block_size); + const auto i_start = static_cast(n) - static_cast(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, b_offset + i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero(), + x_buffer, x_offset + i * x_ld, x_ld); + if (i - static_cast(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, i, current_block_size, ConstantNegOne(), + x_buffer, x_offset + i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne(), + b_buffer, b_offset, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + for (auto i = size_t{0}; i < n; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne(); + const auto current_block_size = std::min(n - i, block_size); + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, b_offset + i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero(), + x_buffer, x_offset + i * x_ld, x_ld); + if (i + block_size >= n) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? 
i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, n - i - block_size, block_size, ConstantNegOne(), + x_buffer, x_offset + i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne(), + b_buffer, b_offset + (i + block_size) * b_ld, b_ld); + } + } + } + + // Retrieves the results + x_buffer.CopyTo(queue_, b_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..5b42398e --- /dev/null +++ b/src/routines/level3/xtrsm.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsm routine. The implementation is based on ??? (TODO). +// Therefore, this class inherits from the Xgemm class. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSM_H_ +#define CLBLAST_ROUTINES_XTRSM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrsm: public Xgemm { + public: + + // Uses methods and variables the Xgemm routine + using Xgemm::queue_; + using Xgemm::context_; + using Xgemm::device_; + using Xgemm::db_; + using Xgemm::program_; + using Xgemm::DoGemm; + + // Constructor + Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM"); + + // Templated-precision implementation of the routine + void DoTrsm(const Layout layout, Side side, Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + size_t m, size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); + + // Implementation of the column-major version + void TrsmColMajor(const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSM_H_ +#endif diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..6a4269be --- /dev/null +++ b/src/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,95 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the XaxpyBatched class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/levelx/xaxpybatched.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +XaxpyBatched::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue(), {}, { + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + }) { +} + +// ================================================================================================= + +// The main routine +template +void XaxpyBatched::DoAxpyBatched(const size_t n, const std::vector &alphas, + const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, + const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, + const size_t batch_count) { + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || + (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the vectors for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestVectorX(n, x_buffer, x_offsets[batch], x_inc); + TestVectorY(n, y_buffer, y_offsets[batch], y_inc); + } + + // Upload the arguments to the device + std::vector x_offsets_int(x_offsets.begin(), 
x_offsets.end()); + std::vector y_offsets_int(y_offsets.begin(), y_offsets.end()); + auto x_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto y_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + x_offsets_device.Write(queue_, batch_count, x_offsets_int); + y_offsets_device.Write(queue_, batch_count, y_offsets_int); + alphas_device.Write(queue_, batch_count, alphas); + + // Retrieves the Xaxpy kernel from the compiled binary + auto kernel = Kernel(program_, "XaxpyBatched"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alphas_device()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, x_offsets_device()); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, y_offsets_device()); + kernel.SetArgument(7, static_cast(y_inc)); + + // Launches the kernel + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"], batch_count}; + auto local = std::vector{db_["WGS"], 1}; + RunKernel(kernel, queue_, device_, global, local, event_); +} + +// ================================================================================================= + +// Compiles the templated class +template class XaxpyBatched; +template class XaxpyBatched; +template class XaxpyBatched; +template class XaxpyBatched; +template class XaxpyBatched; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp new file mode 100644 index 00000000..513792ea --- /dev/null +++ b/src/routines/levelx/xaxpybatched.hpp @@ -0,0 +1,43 @@ + +// ================================================================================================= +// This file is part of the CLBlast 
project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#define CLBLAST_ROUTINES_XAXPYBATCHED_H_ + +#include + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class XaxpyBatched: public Routine { + public: + + // Constructor + XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); + + // Templated-precision implementation of the routine + void DoAxpyBatched(const size_t n, const std::vector &alphas, + const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, + const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, + const size_t batch_count); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#endif diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp new file mode 100644 index 00000000..0fea1922 --- /dev/null +++ b/src/routines/levelx/xgemmbatched.cpp @@ -0,0 +1,350 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the XgemmBatched class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/levelx/xgemmbatched.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +XgemmBatched::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + PrecisionValue(), {}, { + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 + #include "../../kernels/level3/xgemm_direct_part1.opencl" + #include "../../kernels/level3/xgemm_direct_part2.opencl" + #include "../../kernels/level3/xgemm_direct_part3.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 + #include "../../kernels/level3/xgemm_batched.opencl" + #include "../../kernels/level3/xgemm_direct_batched.opencl" + }) { +} + +// ================================================================================================= + +// The main routine +template +void XgemmBatched::DoGemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const std::vector &alphas, + const Buffer 
& a_buffer, const std::vector &a_offsets, const size_t a_ld, + const Buffer & b_buffer, const std::vector &b_offsets, const size_t b_ld, + const std::vector &betas, + const Buffer & c_buffer, const std::vector &c_offsets, const size_t c_ld, + const size_t batch_count) { + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || (betas.size() != batch_count) || + (a_offsets.size() != batch_count) || (b_offsets.size() != batch_count) || (c_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes whether or not the matrices are transposed in memory. See GEMM routine for details. + const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); + const auto c_rotated = (layout == Layout::kRowMajor); + static const auto a_want_rotated = false; + static const auto b_want_rotated = true; + static const auto c_want_rotated = false; + const auto a_do_transpose = a_rotated != a_want_rotated; + const auto b_do_transpose = b_rotated != b_want_rotated; + const auto c_do_transpose = c_rotated != c_want_rotated; + + // In case of complex data-types, the transpose can also become a conjugate transpose + const auto a_conjugate = (a_transpose == Transpose::kConjugate); + const auto b_conjugate = (b_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the 3 matrices taking into account whether the + // matrices are rotated or not + const auto a_one = (a_rotated) ? k : m; + const auto a_two = (a_rotated) ? m : k; + const auto b_one = (b_rotated) ? 
n : k; + const auto b_two = (b_rotated) ? k : n; + const auto c_one = (c_rotated) ? n : m; + const auto c_two = (c_rotated) ? m : n; + + // Tests the matrices for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestMatrixA(a_one, a_two, a_buffer, a_offsets[batch], a_ld); + TestMatrixB(b_one, b_two, b_buffer, b_offsets[batch], b_ld); + TestMatrixC(c_one, c_two, c_buffer, c_offsets[batch], c_ld); + } + + // Upload the scalar arguments to the device + auto alphas_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto betas_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + alphas_device.Write(queue_, batch_count, alphas); + betas_device.Write(queue_, batch_count, betas); + + // Converts the offset to integers + std::vector a_offsets_int(a_offsets.begin(), a_offsets.end()); + std::vector b_offsets_int(b_offsets.begin(), b_offsets.end()); + std::vector c_offsets_int(c_offsets.begin(), c_offsets.end()); + + // Selects which version of the batched GEMM to run + const auto do_gemm_direct = true; + if (do_gemm_direct) { // single generic kernel + BatchedGemmDirect(m, n, k, alphas_device, + a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld, + betas_device, c_buffer, c_offsets_int, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + batch_count); + } + else { // pre/post-processing plus a very fast kernel + BatchedGemmIndirect(m, n, k, alphas_device, + a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld, + betas_device, c_buffer, c_offsets_int, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated, + batch_count); + } +} + + +// ================================================================================================= + +// The indirect version of batched GEMM. This uses the faster but non-general kernel. 
It has specific +// requirements, but several pre and post-processing kernels take care of those. However, the +// overhead of these extra kernels might not be ideal for certain devices/arguments. +template +void XgemmBatched::BatchedGemmIndirect(const size_t m, const size_t n, const size_t k, + const Buffer &alphas, + const Buffer &a_buffer, const std::vector &a_offsets, const size_t a_ld, + const Buffer &b_buffer, const std::vector &b_offsets, const size_t b_ld, + const Buffer &betas, + const Buffer &c_buffer, const std::vector &c_offsets, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated, + const size_t batch_count) { + // Calculates the ceiled versions of m, n, and k + const auto m_ceiled = Ceil(Ceil(m, db_["MWG"]), db_["VWM"]); + const auto n_ceiled = Ceil(Ceil(n, db_["NWG"]), db_["VWN"]); + const auto k_ceiled = Ceil(Ceil(k, db_["KWG"]), db_["VWM"]); + + // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account + // whether the matrices need to be rotated or not for the kernel. + const auto a_one_i = (a_want_rotated) ? k_ceiled : m_ceiled; + const auto a_two_i = (a_want_rotated) ? m_ceiled : k_ceiled; + const auto b_one_i = (b_want_rotated) ? n_ceiled : k_ceiled; + const auto b_two_i = (b_want_rotated) ? k_ceiled : n_ceiled; + const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; + const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; + + // Sets the "internal" offsets, i.e. 
the perfect offsets + auto a_offsets_i = std::vector(batch_count); + auto b_offsets_i = std::vector(batch_count); + auto c_offsets_i = std::vector(batch_count); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + a_offsets_i[batch] = batch * a_one_i * a_two_i; + b_offsets_i[batch] = batch * b_one_i * b_two_i; + c_offsets_i[batch] = batch * c_one_i * c_two_i; + } + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offsets == a_offsets_i && + a_do_transpose == false && a_conjugate == false; + auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offsets == b_offsets_i && + b_do_transpose == false && b_conjugate == false; + auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offsets == c_offsets_i && + c_do_transpose == false; + + // Creates the temporary matrices + const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, batch_count * a_one_i * a_two_i); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, batch_count * b_one_i * b_two_i); + const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, batch_count * c_one_i * c_two_i); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. 
+ if (!a_no_temp) { + auto a_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto a_offsets_i_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + a_offsets_device.Write(queue_, batch_count, a_offsets); + a_offsets_i_device.Write(queue_, batch_count, a_offsets_i); + auto eventProcessA = Event(); + PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offsets_device, a_buffer, + a_one_i, a_two_i, a_one_i, a_offsets_i_device, a_temp, + program_, true, a_do_transpose, a_conjugate, batch_count); + eventWaitList.push_back(eventProcessA); + } + + // As above, but now for matrix B + if (!b_no_temp) { + auto b_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto b_offsets_i_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + b_offsets_device.Write(queue_, batch_count, b_offsets); + b_offsets_i_device.Write(queue_, batch_count, b_offsets_i); + auto eventProcessB = Event(); + PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offsets_device, b_buffer, + b_one_i, b_two_i, b_one_i, b_offsets_i_device, b_temp, + program_, true, b_do_transpose, b_conjugate, batch_count); + eventWaitList.push_back(eventProcessB); + } + + // As above, but now for matrix C + auto c_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto c_offsets_i_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + if (!c_no_temp) { + c_offsets_device.Write(queue_, batch_count, c_offsets); + c_offsets_i_device.Write(queue_, batch_count, c_offsets_i); + auto eventProcessC = Event(); + PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offsets_device, c_buffer, + c_one_i, c_two_i, c_one_i, c_offsets_i_device, c_temp, + program_, true, c_do_transpose, false, batch_count); + 
eventWaitList.push_back(eventProcessC); + } + + // Retrieves the Xgemm kernel from the compiled binary + auto kernel = Kernel(program_, "XgemmBatched"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_ceiled)); + kernel.SetArgument(1, static_cast(n_ceiled)); + kernel.SetArgument(2, static_cast(k_ceiled)); + kernel.SetArgument(3, alphas()); + kernel.SetArgument(4, betas()); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, static_cast(a_one_i)); + kernel.SetArgument(7, static_cast(a_two_i)); + kernel.SetArgument(8, b_temp()); + kernel.SetArgument(9, static_cast(b_one_i)); + kernel.SetArgument(10, static_cast(b_two_i)); + kernel.SetArgument(11, c_temp()); + kernel.SetArgument(12, static_cast(c_one_i)); + kernel.SetArgument(13, static_cast(c_two_i)); + + // Computes the global and local thread sizes + const auto global = std::vector{ + (c_one_i * db_["MDIMC"]) / db_["MWG"], + (c_two_i * db_["NDIMC"]) / db_["NWG"], + batch_count + }; + const auto local = std::vector{db_["MDIMC"], db_["NDIMC"], 1}; + + // Launches the kernel + auto eventKernel = Event(); + auto eventPointer = eventKernel.pointer(); + RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); + + // Runs the post-processing kernel if needed + if (!c_no_temp) { + eventWaitList.push_back(eventKernel); + PadCopyTransposeMatrixBatched(queue_, device_, db_, event_, eventWaitList, + c_one_i, c_two_i, c_one_i, c_offsets_i_device, c_temp, + c_one, c_two, c_ld, c_offsets_device, c_buffer, + program_, false, c_do_transpose, false, batch_count); + } +} + +// ================================================================================================= + +// The direct version of batched GEMM, requiring just one kernel, no pre or post-processing kernels. 
+template <typename T>
+void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const size_t k,
+                                        const Buffer<T> &alphas,
+                                        const Buffer<T> &a_buffer, const std::vector<int> &a_offsets, const size_t a_ld,
+                                        const Buffer<T> &b_buffer, const std::vector<int> &b_offsets, const size_t b_ld,
+                                        const Buffer<T> &betas,
+                                        const Buffer<T> &c_buffer, const std::vector<int> &c_offsets, const size_t c_ld,
+                                        const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                        const bool a_conjugate, const bool b_conjugate,
+                                        const size_t batch_count) {
+
+  // Uploads the offsets to the device
+  auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+  auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+  auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+  a_offsets_device.Write(queue_, batch_count, a_offsets);
+  b_offsets_device.Write(queue_, batch_count, b_offsets);
+  c_offsets_device.Write(queue_, batch_count, c_offsets);
+
+  // Retrieves the proper XgemmDirect kernel from the compiled binary
+  const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectBatchedTT" : "XgemmDirectBatchedTN") :
+                                       (b_do_transpose ? 
"XgemmDirectBatchedNT" : "XgemmDirectBatchedNN"); + auto kernel = Kernel(program_, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m)); + kernel.SetArgument(1, static_cast(n)); + kernel.SetArgument(2, static_cast(k)); + kernel.SetArgument(3, alphas()); + kernel.SetArgument(4, betas()); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, a_offsets_device()); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, b_offsets_device()); + kernel.SetArgument(10, static_cast(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, c_offsets_device()); + kernel.SetArgument(13, static_cast(c_ld)); + kernel.SetArgument(14, static_cast(c_do_transpose)); + kernel.SetArgument(15, static_cast(a_conjugate)); + kernel.SetArgument(16, static_cast(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"], + batch_count + }; + const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"], 1}; + + // Launches the kernel + RunKernel(kernel, queue_, device_, global, local, event_); +} + +// ================================================================================================= + +// Compiles the templated class +template class XgemmBatched; +template class XgemmBatched; +template class XgemmBatched; +template class XgemmBatched; +template class XgemmBatched; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xgemmbatched.hpp b/src/routines/levelx/xgemmbatched.hpp new file mode 100644 index 00000000..6136dd5f --- /dev/null +++ b/src/routines/levelx/xgemmbatched.hpp @@ -0,0 +1,72 @@ + +// 
=================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XgemmBatched routine. This is a non-blas batched version of GEMM.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMMBATCHED_H_
+#define CLBLAST_ROUTINES_XGEMMBATCHED_H_
+
+#include <vector>
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class XgemmBatched: public Routine {
+ public:
+
+  // Constructor
+  XgemmBatched(Queue &queue, EventPointer event, const std::string &name = "GEMMBATCHED");
+
+  // Templated-precision implementation of the routine
+  void DoGemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                     const size_t m, const size_t n, const size_t k,
+                     const std::vector<T> &alphas,
+                     const Buffer<T> & a_buffer, const std::vector<int> &a_offsets, const size_t a_ld,
+                     const Buffer<T> & b_buffer, const std::vector<int> &b_offsets, const size_t b_ld,
+                     const std::vector<T> &betas,
+                     const Buffer<T> & c_buffer, const std::vector<int> &c_offsets, const size_t c_ld,
+                     const size_t batch_count);
+
+  // Indirect version of batched GEMM (with pre and post-processing kernels)
+  void BatchedGemmIndirect(const size_t m, const size_t n, const size_t k,
+                           const Buffer<T> &alphas,
+                           const Buffer<T> &a_buffer, const std::vector<int> &a_offsets, const size_t a_ld,
+                           const Buffer<T> &b_buffer, const std::vector<int> &b_offsets, const size_t b_ld,
+                           const Buffer<T> &betas,
+                           const Buffer<T> &c_buffer, const std::vector<int> &c_offsets, const size_t c_ld,
+                           const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                           const bool a_conjugate, const bool b_conjugate,
+                           const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                           const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                           const size_t c_one, const size_t c_two, const bool c_want_rotated,
+                           const size_t batch_count);
+
+  // Direct version of batched GEMM (no pre and post-processing kernels)
+  void BatchedGemmDirect(const size_t m, const size_t n, const size_t k,
+                         const Buffer<T> &alphas,
+                         const Buffer<T> &a_buffer, const std::vector<int> &a_offsets, const size_t a_ld,
+                         const Buffer<T> &b_buffer, const std::vector<int> &b_offsets, const size_t b_ld,
+                         const Buffer<T> &betas,
+                         const Buffer<T> &c_buffer, const std::vector<int> &c_offsets, const size_t c_ld,
+                         const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                         const bool a_conjugate, const bool b_conjugate,
+                         const size_t batch_count);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMMBATCHED_H_
+#endif
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
new file mode 100644
index 00000000..5c21d5ce
--- /dev/null
+++ b/src/routines/levelx/xinvert.cpp
@@ -0,0 +1,151 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the common code to perform (partial) matrix inverting. 
This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xinvert::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue(), {}, { + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template +void Xinvert::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest) { + + // Makes sure all dimensions are larger than zero + if ((block_size == 0) || (n == 0)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == Diagonal::kUnit) ? 
true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kUnknownError); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Fills the output buffer with zeros + auto event_wait_list = std::vector(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, + block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program_, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, static_cast(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast(block_size)); + kernel.SetArgument(6, static_cast(unit_diagonal)); + kernel.SetArgument(7, static_cast(is_upper)); + const auto local = std::vector{internal_block_size}; + const auto global = std::vector{num_internal_blocks * internal_block_size}; + auto base_kernel_event = Event(); + auto base_kernel_event_pointer = (internal_block_size == block_size) ? 
event_ : base_kernel_event.pointer(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list); + if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector{local0, 4}; + const auto global = std::vector{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast(offset)); + kernel1.SetArgument(3, static_cast(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast(current_size)); + kernel1.SetArgument(6, static_cast(npages)); + kernel1.SetArgument(7, static_cast(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast(n)); + kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, static_cast(current_size)); 
+ kernel2.SetArgument(3, static_cast(npages)); + kernel2.SetArgument(4, static_cast(block_size)); + auto kernel2_event = Event(); + auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common code to perform (partial) matrix inverting. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index 875ca7d2..4ae8c056 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -65,14 +65,11 @@ void Xomatcopy::DoOmatcopy(const Layout layout, const Transpose a_transpose, TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto emptyEventList = std::vector(); PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); + alpha, program_, false, transpose, conjugate); } // ================================================================================================= diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 7a434513..d3d12bed 100644 --- 
a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -46,8 +46,9 @@ class TuneCopy { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -104,6 +105,7 @@ class TuneCopy { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 94d9c303..0e157364 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -46,8 +46,9 @@ class TunePad { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -112,6 +113,7 @@ class TunePad { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/transpose_fast.cpp 
b/src/tuning/kernels/transpose_fast.cpp index e16ab235..a1ce4220 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -46,8 +46,9 @@ class TuneTranspose { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -109,6 +110,7 @@ class TuneTranspose { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index c01298bf..490580b5 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -46,8 +46,9 @@ class TunePadTranspose { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -116,6 +117,7 @@ class TunePadTranspose { } // namespace clblast // Shortcuts to the clblast namespace +using half = 
clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 824ab29e..a13e54f2 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -27,7 +27,7 @@ class TuneXaxpy { // The representative kernel and the source code static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFast"; } + static std::string KernelName() { return "XaxpyFastest"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -42,7 +42,7 @@ class TuneXaxpy { // Tests for valid arguments static void TestValidArguments(const Arguments &args) { if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); + throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } } @@ -50,8 +50,9 @@ class TuneXaxpy { static size_t DefaultM() { return 1; } // N/A for this kernel static size_t DefaultN() { return 4096*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } @@ -107,6 +108,7 @@ class TuneXaxpy { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index f871d42a..b85c8521 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -46,8 +46,9 @@ class TuneXdot { static 
size_t DefaultM() { return 1; } // N/A for this kernel static size_t DefaultN() { return 2*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } @@ -113,6 +114,7 @@ class TuneXdot { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index f55eadd8..4ea9b465 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -51,6 +51,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return (V==1) ? 
1.0 : 512.0; } // test all or sample randomly static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging @@ -172,6 +173,7 @@ class TuneXgemm { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; @@ -191,7 +193,7 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); + //StartVariation<2>(argc, argv); return 0; } diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index ee5bcb7e..e7a35b93 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -51,6 +51,7 @@ class TuneXgemmDirect { static size_t DefaultM() { return 256; } static size_t DefaultN() { return 256; } static size_t DefaultK() { return 256; } + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return (V==1) ? 
1.0 : 32.0; } // test all or sample randomly static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging @@ -171,6 +172,7 @@ class TuneXgemmDirect { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 97a45225..9e9a6fe1 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -49,8 +49,9 @@ class TuneXgemv { static size_t DefaultM() { return 2048; } static size_t DefaultN() { return 2048; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } @@ -153,6 +154,7 @@ class TuneXgemv { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 5057492f..c82a29b6 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -46,8 +46,9 @@ class TuneXger { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel + static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run 
every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.m; } @@ -112,6 +113,7 @@ class TuneXger { } // namespace clblast // Shortcuts to the clblast namespace +using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 1dd76894..25504430 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -17,6 +17,7 @@ #include #include +#include #include @@ -46,6 +47,7 @@ void Tuner(int argc, char* argv[]) { if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar()); } if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, C::DefaultFraction()); } + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, C::DefaultBatchCount()); } } const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, C::DefaultNumRuns()); @@ -77,12 +79,14 @@ void Tuner(int argc, char* argv[]) { auto b_mat = std::vector(C::GetSizeB(args)); auto c_mat = std::vector(C::GetSizeC(args)); auto temp = std::vector(C::GetSizeTemp(args)); - PopulateVector(x_vec, kSeed); - PopulateVector(y_vec, kSeed); - PopulateVector(a_mat, kSeed); - PopulateVector(b_mat, kSeed); - PopulateVector(c_mat, kSeed); - PopulateVector(temp, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_vec, mt, dist); + PopulateVector(y_vec, mt, dist); + PopulateVector(a_mat, mt, dist); + PopulateVector(b_mat, mt, dist); + PopulateVector(c_mat, mt, dist); + PopulateVector(temp, mt, dist); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); @@ -155,6 +159,7 @@ void Tuner(int argc, char* 
argv[]) { if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } + if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } } tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); } diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp index f3c7b9a3..0d0033b6 100644 --- a/src/utilities/clblast_exceptions.hpp +++ b/src/utilities/clblast_exceptions.hpp @@ -16,8 +16,8 @@ #ifndef CLBLAST_EXCEPTIONS_H_ #define CLBLAST_EXCEPTIONS_H_ -#include "clblast.h" #include "clpp11.hpp" +#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 5e445bb9..95b70cd5 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -18,57 +18,80 @@ #include #include #include +#include namespace clblast { // ================================================================================================= // Returns a scalar with a default value -template -T GetScalar() { - return static_cast(2.0); -} +template T GetScalar() { return static_cast(2.0); } template float GetScalar(); template double GetScalar(); +template <> half GetScalar() { return FloatToHalf(2.0f); } +template <> float2 GetScalar() { return {2.0f, 0.5f}; } +template <> double2 GetScalar() { return {2.0, 0.5}; } -// Specialized version of the above for half-precision -template <> -half GetScalar() { - return FloatToHalf(2.0f); -} - -// Specialized versions of the above for complex data-types -template <> -float2 GetScalar() { - return {2.0f, 0.5f}; -} -template <> -double2 GetScalar() { - return {2.0, 0.5}; -} +// Returns a scalar of value 0 +template T ConstantZero() 
{ return static_cast(0.0); } +template float ConstantZero(); +template double ConstantZero(); +template <> half ConstantZero() { return FloatToHalf(0.0f); } +template <> float2 ConstantZero() { return {0.0f, 0.0f}; } +template <> double2 ConstantZero() { return {0.0, 0.0}; } // Returns a scalar of value 1 -template -T ConstantOne() { - return static_cast(1.0); -} +template T ConstantOne() { return static_cast(1.0); } template float ConstantOne(); template double ConstantOne(); +template <> half ConstantOne() { return FloatToHalf(1.0f); } +template <> float2 ConstantOne() { return {1.0f, 0.0f}; } +template <> double2 ConstantOne() { return {1.0, 0.0}; } -// Specialized version of the above for half-precision -template <> -half ConstantOne() { - return FloatToHalf(1.0f); +// Returns a scalar of value -1 +template T ConstantNegOne() { return static_cast(-1.0); } +template float ConstantNegOne(); +template double ConstantNegOne(); +template <> half ConstantNegOne() { return FloatToHalf(-1.0f); } +template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; } +template <> double2 ConstantNegOne() { return {-1.0, 0.0}; } + +// Returns a scalar of some value +template T Constant(const double val) { return static_cast(val); } +template float Constant(const double); +template double Constant(const double); +template <> half Constant(const double val) { return FloatToHalf(static_cast(val)); } +template <> float2 Constant(const double val) { return {static_cast(val), 0.0f}; } +template <> double2 Constant(const double val) { return {val, 0.0}; } + +// Returns a small scalar value just larger than 0 +template T SmallConstant() { return static_cast(1e-4); } +template float SmallConstant(); +template double SmallConstant(); +template <> half SmallConstant() { return FloatToHalf(1e-4f); } +template <> float2 SmallConstant() { return {1e-4f, 0.0f}; } +template <> double2 SmallConstant() { return {1e-4, 0.0}; } + +// Returns the absolute value of a scalar (modulus in case of a 
complex number) +template typename BaseType::Type AbsoluteValue(const T value) { return std::fabs(value); } +template float AbsoluteValue(const float); +template double AbsoluteValue(const double); +template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); } +template <> float AbsoluteValue(const float2 value) { + if (value.real() == 0.0f && value.imag() == 0.0f) { return 0.0f; } + return std::sqrt(value.real() * value.real() + value.imag() * value.imag()); +} +template <> double AbsoluteValue(const double2 value) { + if (value.real() == 0.0 && value.imag() == 0.0) { return 0.0; } + return std::sqrt(value.real() * value.real() + value.imag() * value.imag()); } -// Specialized versions of the above for complex data-types -template <> -float2 ConstantOne() { - return {1.0f, 0.0f}; -} -template <> -double2 ConstantOne() { - return {1.0, 0.0}; -} +// Returns whether a scalar is close to zero +template bool IsCloseToZero(const T value) { return (value > -SmallConstant()) && (value < SmallConstant()); } +template bool IsCloseToZero(const float); +template bool IsCloseToZero(const double); +template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); } +template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); } +template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); } // ================================================================================================= @@ -79,23 +102,27 @@ std::string ToString(T value) { } template std::string ToString(int value); template std::string ToString(size_t value); -template std::string ToString(float value); -template std::string ToString(double value); +template <> +std::string ToString(float value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} +template <> 
+std::string ToString(double value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} // If not possible directly: special cases for complex data-types template <> std::string ToString(float2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } template <> std::string ToString(double2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } // If not possible directly: special case for half-precision @@ -149,6 +176,7 @@ std::string ToString(Precision value) { case Precision::kDouble: return ToString(static_cast(value))+" (double)"; case Precision::kComplexSingle: return ToString(static_cast(value))+" (complex-single)"; case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; + case Precision::kAny: return ToString(static_cast(value))+" (any)"; } } template <> @@ -299,46 +327,81 @@ unsigned int GetRandomSeed() { // Create a random number generator and populates a vector with samples from a random distribution template -void PopulateVector(std::vector &vector, const unsigned int seed) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element = dist(mt); } +void PopulateVector(std::vector &vector, std::mt19937 &mt, std::uniform_real_distribution &dist) { + for (auto &element: vector) { element = static_cast(dist(mt)); } } -template void PopulateVector(std::vector&, const unsigned int); -template void 
PopulateVector(std::vector&, const unsigned int); +template void PopulateVector(std::vector&, std::mt19937&, std::uniform_real_distribution&); +template void PopulateVector(std::vector&, std::mt19937&, std::uniform_real_distribution&); // Specialized versions of the above for complex data-types template <> -void PopulateVector(std::vector &vector, const unsigned int seed) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +void PopulateVector(std::vector &vector, std::mt19937 &mt, std::uniform_real_distribution &dist) { + for (auto &element: vector) { + element.real(static_cast(dist(mt))); + element.imag(static_cast(dist(mt))); + } } template <> -void PopulateVector(std::vector &vector, const unsigned int seed) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution dist(lower_limit, upper_limit); +void PopulateVector(std::vector &vector, std::mt19937 &mt, std::uniform_real_distribution &dist) { for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } // Specialized versions of the above for half-precision template <> -void PopulateVector(std::vector &vector, const unsigned int seed) { - const auto lower_limit = static_cast(kTestDataLowerLimit); - const auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element = FloatToHalf(dist(mt)); } +void PopulateVector(std::vector &vector, std::mt19937 &mt, std::uniform_real_distribution &dist) { + for (auto &element: vector) { element = FloatToHalf(static_cast(dist(mt))); } } // 
================================================================================================= +template +void DeviceToHost(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, + Queue &queue, const std::vector &names) { + for (auto &name: names) { + if (name == kBufVecX) {buffers_host.x_vec = std::vector(args.x_size, static_cast(0)); buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec); } + else if (name == kBufVecY) { buffers_host.y_vec = std::vector(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec); } + else if (name == kBufMatA) { buffers_host.a_mat = std::vector(args.a_size, static_cast(0)); buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat); } + else if (name == kBufMatB) { buffers_host.b_mat = std::vector(args.b_size, static_cast(0)); buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat); } + else if (name == kBufMatC) { buffers_host.c_mat = std::vector(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); } + else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); } + else if (name == kBufScalar) { buffers_host.scalar = std::vector(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +template +void HostToDevice(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, + Queue &queue, const std::vector &names) { + for (auto &name: names) { + if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); } + else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); } + else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); } + else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); } + else if (name 
== kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); } + else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); } + else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +// Compiles the above functions +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); +template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); + +// ================================================================================================= + // Conversion between half and single-precision std::vector HalfToFloatBuffer(const std::vector& source) { auto result = 
std::vector(source.size()); @@ -405,6 +468,7 @@ size_t GetBytes(const Precision precision) { case Precision::kDouble: return 8; case Precision::kComplexSingle: return 8; case Precision::kComplexDouble: return 16; + case Precision::kAny: return -1; } } diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 20587bd4..09394cf3 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -20,16 +20,20 @@ #include #include #include +#include +#include "clpp11.hpp" #include "clblast.h" #include "clblast_half.h" -#include "clpp11.hpp" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp" namespace clblast { // ================================================================================================= +// Shorthands for half-precision +using half = cl_half; // based on the OpenCL type, which is actually an 'unsigned short' + // Shorthands for complex data-types using float2 = std::complex; using double2 = std::complex; @@ -72,6 +76,7 @@ constexpr auto kArgAsumOffset = "offasum"; constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; +constexpr auto kArgBatchCount = "batch_num"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; @@ -79,6 +84,7 @@ constexpr auto kArgFraction = "fraction"; // The client-specific arguments in string form constexpr auto kArgCompareclblas = "clblas"; constexpr auto kArgComparecblas = "cblas"; +constexpr auto kArgComparecublas = "cublas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = "runs"; @@ -96,15 +102,39 @@ constexpr auto kArgHelp = "h"; constexpr auto kArgQuiet = "q"; constexpr auto kArgNoAbbreviations = "no_abbrv"; +// The buffer names +constexpr auto kBufVecX = "X"; +constexpr auto kBufVecY = "Y"; +constexpr auto kBufMatA = "A"; +constexpr auto kBufMatB = "B"; +constexpr auto kBufMatC = "C"; +constexpr auto kBufMatAP 
= "AP"; +constexpr auto kBufScalar = "Scalar"; + +// ================================================================================================= + +// Converts a regular or complex type to it's base type (e.g. float2 to float) +template struct BaseType { using Type = T; }; +template <> struct BaseType { using Type = float; }; +template <> struct BaseType { using Type = double; }; + // ================================================================================================= // Returns a scalar with a default value -template -T GetScalar(); +template T GetScalar(); -// Returns a scalar of value 1 -template -T ConstantOne(); +// Fixed value scalars +template T ConstantZero(); +template T ConstantOne(); +template T ConstantNegOne(); +template T Constant(const double val); +template T SmallConstant(); + +// Returns the absolute value of a scalar (modulus in case of complex numbers) +template typename BaseType::Type AbsoluteValue(const T value); + +// Returns whether a scalar is close to zero +template bool IsCloseToZero(const T value); // ================================================================================================= @@ -140,6 +170,16 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne(); T beta = ConstantOne(); + // Batch-specific arguments + size_t batch_count = 1; + std::vector x_offsets = {0}; + std::vector y_offsets = {0}; + std::vector a_offsets = {0}; + std::vector b_offsets = {0}; + std::vector c_offsets = {0}; + std::vector alphas = {ConstantOne()}; + std::vector betas = {ConstantOne()}; + // Sizes size_t x_size = 1; size_t y_size = 1; size_t a_size = 1; @@ -152,9 +192,13 @@ struct Arguments { // Client-specific arguments int compare_clblas = 1; int compare_cblas = 1; + int compare_cublas = 1; size_t step = 1; size_t num_steps = 0; size_t num_runs = 10; + #ifdef CLBLAST_REF_CUBLAS + void* cublas_handle; // cublasHandle_t + #endif // Common arguments size_t platform_id = 0; size_t device_id = 0; @@ -175,6 +219,16 
@@ struct Buffers { Buffer ap_mat; Buffer scalar; }; +template +struct BuffersHost { + std::vector x_vec; + std::vector y_vec; + std::vector a_mat; + std::vector b_mat; + std::vector c_mat; + std::vector ap_mat; + std::vector scalar; +}; // ================================================================================================= @@ -219,7 +273,19 @@ constexpr auto kTestDataUpperLimit = 2.0; // Populates a vector with random data template -void PopulateVector(std::vector &vector, const unsigned int seed); +void PopulateVector(std::vector &vector, std::mt19937 &mt, std::uniform_real_distribution &dist); + +// ================================================================================================= + +// Copies buffers from the OpenCL device to the host +template +void DeviceToHost(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, + Queue &queue, const std::vector &names); + +// Copies buffers from the host to the OpenCL device +template +void HostToDevice(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, + Queue &queue, const std::vector &names); // ================================================================================================= diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp new file mode 100644 index 00000000..4283c039 --- /dev/null +++ b/test/correctness/misc/override_parameters.cpp @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file contains the tests for the OverrideParameters function +// +// ================================================================================================= + +#include +#include +#include +#include + +#include "utilities/utilities.hpp" +#include "test/routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +template +size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::string &routine_name) { + auto arguments = RetrieveCommandLineArguments(argc, argv); + auto errors = size_t{0}; + auto passed = size_t{0}; + auto example_routine = TestXgemm(); + constexpr auto kSeed = 42; // fixed seed for reproducibility + + // Determines the test settings + const auto kernel_name = std::string{"Xgemm"}; + const auto precision = PrecisionValue(); + const auto valid_settings = std::vector>{ + { {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, + { {"KWG",32}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",32}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, + }; + const auto invalid_settings = std::vector>{ + { {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0} }, + }; + + // Retrieves the arguments + auto help = std::string{"Options given/available:\n"}; + const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); + const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); + auto args = Arguments{}; + args.m = GetArgument(arguments, help, kArgM, size_t{256}); + args.n = GetArgument(arguments, help, kArgN, size_t{256}); + args.k = GetArgument(arguments, 
help, kArgK, size_t{256}); + args.a_ld = GetArgument(arguments, help, kArgALeadDim, args.k); + args.b_ld = GetArgument(arguments, help, kArgBLeadDim, args.n); + args.c_ld = GetArgument(arguments, help, kArgCLeadDim, args.n); + args.a_offset = GetArgument(arguments, help, kArgAOffset, size_t{0}); + args.b_offset = GetArgument(arguments, help, kArgBOffset, size_t{0}); + args.c_offset = GetArgument(arguments, help, kArgCOffset, size_t{0}); + args.layout = GetArgument(arguments, help, kArgLayout, Layout::kRowMajor); + args.a_transpose = GetArgument(arguments, help, kArgATransp, Transpose::kNo); + args.b_transpose = GetArgument(arguments, help, kArgBTransp, Transpose::kNo); + args.alpha = GetArgument(arguments, help, kArgAlpha, GetScalar()); + args.beta = GetArgument(arguments, help, kArgBeta, GetScalar()); + + // Prints the help message (command-line arguments) + if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } + + // Initializes OpenCL + const auto platform = Platform(platform_id); + const auto device = Device(platform, device_id); + const auto context = Context(device); + auto queue = Queue(context, device); + + // Populate host matrices with some example data + auto host_a = std::vector(args.m * args.k); + auto host_b = std::vector(args.n * args.k); + auto host_c = std::vector(args.m * args.n); + std::mt19937 mt(kSeed); + std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(host_a, mt, dist); + PopulateVector(host_b, mt, dist); + PopulateVector(host_c, mt, dist); + + // Copy the matrices to the device + auto device_a = Buffer(context, host_a.size()); + auto device_b = Buffer(context, host_b.size()); + auto device_c = Buffer(context, host_c.size()); + device_a.Write(queue, host_a.size(), host_a); + device_b.Write(queue, host_b.size(), host_b); + device_c.Write(queue, host_c.size(), host_c); + auto dummy = Buffer(context, 1); + auto buffers = Buffers{dummy, dummy, device_a, device_b, device_c, dummy, dummy}; + + // 
Loops over the valid combinations: run before and run afterwards + fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str()); + for (const auto &override_setting : valid_settings) { + const auto status_before = example_routine.RunRoutine(args, buffers, queue); + if (status_before != StatusCode::kSuccess) { errors++; continue; } + + // Overrides the parameters + const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); + if (status != StatusCode::kSuccess) { errors++; continue; } // error shouldn't occur + + const auto status_after = example_routine.RunRoutine(args, buffers, queue); + if (status_after != StatusCode::kSuccess) { errors++; continue; } + passed++; + } + + // Loops over the invalid combinations: run before and run afterwards + for (const auto &override_setting : invalid_settings) { + const auto status_before = example_routine.RunRoutine(args, buffers, queue); + if (status_before != StatusCode::kSuccess) { errors++; continue; } + + // Overrides the parameters + const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); + if (status == StatusCode::kSuccess) { errors++; continue; } // error should occur + + const auto status_after = example_routine.RunRoutine(args, buffers, queue); + if (status_after != StatusCode::kSuccess) { errors++; continue; } + passed++; + } + + // Prints and returns the statistics + fprintf(stdout, " %zu test(s) passed\n", passed); + fprintf(stdout, " %zu test(s) failed\n", errors); + fprintf(stdout, "\n"); + return errors; +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunOverrideTests(argc, argv, false, "SGEMM"); + errors += clblast::RunOverrideTests(argc, argv, true, "CGEMM"); + if (errors > 0) { return 1; } else { return 
0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xamax.cpp b/test/correctness/routines/level1/xamax.cpp index 607637e8..f92b6c4a 100644 --- a/test/correctness/routines/level1/xamax.cpp +++ b/test/correctness/routines/level1/xamax.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xamax.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); errors += clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); - errors += clblast::RunTests, half, half>(argc, argv, true, "iHAMAX"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "iCAMAX"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "iZAMAX"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "iHAMAX"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xasum.cpp b/test/correctness/routines/level1/xasum.cpp index e22e42a6..5f65cca9 100644 --- a/test/correctness/routines/level1/xasum.cpp +++ b/test/correctness/routines/level1/xasum.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xasum.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SASUM"); 
errors += clblast::RunTests, double, double>(argc, argv, true, "DASUM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HASUM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "ScASUM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "DzASUM"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HASUM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xaxpy.cpp b/test/correctness/routines/level1/xaxpy.cpp index 064172fa..f9f0d756 100644 --- a/test/correctness/routines/level1/xaxpy.cpp +++ b/test/correctness/routines/level1/xaxpy.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xaxpy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SAXPY"); errors += clblast::RunTests, double, double>(argc, argv, true, "DAXPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CAXPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZAXPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HAXPY"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CAXPY"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPY"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HAXPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xcopy.cpp b/test/correctness/routines/level1/xcopy.cpp index 
e6f2581b..af750fdc 100644 --- a/test/correctness/routines/level1/xcopy.cpp +++ b/test/correctness/routines/level1/xcopy.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SCOPY"); errors += clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CCOPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZCOPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HCOPY"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CCOPY"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZCOPY"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HCOPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xdot.cpp b/test/correctness/routines/level1/xdot.cpp index 080250cb..8f5a8cbd 100644 --- a/test/correctness/routines/level1/xdot.cpp +++ b/test/correctness/routines/level1/xdot.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SDOT"); errors += clblast::RunTests, double, double>(argc, argv, true, "DDOT"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HDOT"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HDOT"); if 
(errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xdotc.cpp b/test/correctness/routines/level1/xdotc.cpp index 2a7bbeca..59eedddc 100644 --- a/test/correctness/routines/level1/xdotc.cpp +++ b/test/correctness/routines/level1/xdotc.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTC"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTC"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CDOTC"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTC"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xdotu.cpp b/test/correctness/routines/level1/xdotu.cpp index 1047d021..4392326d 100644 --- a/test/correctness/routines/level1/xdotu.cpp +++ b/test/correctness/routines/level1/xdotu.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotu.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTU"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTU"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CDOTU"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTU"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xnrm2.cpp 
b/test/correctness/routines/level1/xnrm2.cpp index 142fa7ba..6f07bad2 100644 --- a/test/correctness/routines/level1/xnrm2.cpp +++ b/test/correctness/routines/level1/xnrm2.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xnrm2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); errors += clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScNRM2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzNRM2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HNRM2"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "ScNRM2"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "DzNRM2"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HNRM2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xrot.cpp b/test/correctness/routines/level1/xrot.cpp index 5af358eb..d5eb6516 100644 --- a/test/correctness/routines/level1/xrot.cpp +++ b/test/correctness/routines/level1/xrot.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotg.cpp b/test/correctness/routines/level1/xrotg.cpp index ad23a554..ec544eab 100644 --- a/test/correctness/routines/level1/xrotg.cpp +++ b/test/correctness/routines/level1/xrotg.cpp @@ -12,10 +12,6 @@ 
#include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotm.cpp b/test/correctness/routines/level1/xrotm.cpp index 4f7e8f15..7f2d7ce6 100644 --- a/test/correctness/routines/level1/xrotm.cpp +++ b/test/correctness/routines/level1/xrotm.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotmg.cpp b/test/correctness/routines/level1/xrotmg.cpp index ca89bc12..4ef6e67d 100644 --- a/test/correctness/routines/level1/xrotmg.cpp +++ b/test/correctness/routines/level1/xrotmg.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotmg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xscal.cpp b/test/correctness/routines/level1/xscal.cpp index 939524be..5a360678 100644 --- a/test/correctness/routines/level1/xscal.cpp +++ b/test/correctness/routines/level1/xscal.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xscal.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += 
clblast::RunTests, float, float>(argc, argv, false, "SSCAL"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSCAL"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSCAL"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSCAL"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSCAL"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSCAL"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSCAL"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xswap.cpp b/test/correctness/routines/level1/xswap.cpp index 446f3d65..08f08a78 100644 --- a/test/correctness/routines/level1/xswap.cpp +++ b/test/correctness/routines/level1/xswap.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xswap.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSWAP"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSWAP"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSWAP"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSWAP"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSWAP"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSWAP"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSWAP"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSWAP"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgbmv.cpp 
b/test/correctness/routines/level2/xgbmv.cpp index 8c49bc65..b7936802 100644 --- a/test/correctness/routines/level2/xgbmv.cpp +++ b/test/correctness/routines/level2/xgbmv.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SGBMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGBMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGBMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGBMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgemv.cpp b/test/correctness/routines/level2/xgemv.cpp index 902ae777..2bf0bf59 100644 --- a/test/correctness/routines/level2/xgemv.cpp +++ b/test/correctness/routines/level2/xgemv.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMV"); - errors += 
clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGEMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGEMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xger.cpp b/test/correctness/routines/level2/xger.cpp index ce61bbcb..999615b7 100644 --- a/test/correctness/routines/level2/xger.cpp +++ b/test/correctness/routines/level2/xger.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xger.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SGER"); errors += clblast::RunTests, double, double>(argc, argv, true, "DGER"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGER"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGER"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgerc.cpp b/test/correctness/routines/level2/xgerc.cpp index b747f20d..42f6bb45 100644 --- a/test/correctness/routines/level2/xgerc.cpp +++ b/test/correctness/routines/level2/xgerc.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgerc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERC"); - 
errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERC"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CGERC"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGERC"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgeru.cpp b/test/correctness/routines/level2/xgeru.cpp index f80c1e2b..f167eff5 100644 --- a/test/correctness/routines/level2/xgeru.cpp +++ b/test/correctness/routines/level2/xgeru.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgeru.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERU"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERU"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CGERU"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGERU"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhbmv.cpp b/test/correctness/routines/level2/xhbmv.cpp index a4885c01..168d9474 100644 --- a/test/correctness/routines/level2/xhbmv.cpp +++ b/test/correctness/routines/level2/xhbmv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHBMV"); + errors += 
clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHBMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhemv.cpp b/test/correctness/routines/level2/xhemv.cpp index 4318ffee..eabdf67d 100644 --- a/test/correctness/routines/level2/xhemv.cpp +++ b/test/correctness/routines/level2/xhemv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHEMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xher.cpp b/test/correctness/routines/level2/xher.cpp index fe37bd76..a47a45ac 100644 --- a/test/correctness/routines/level2/xher.cpp +++ b/test/correctness/routines/level2/xher.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xher.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER"); + errors += clblast::RunTests, clblast::float2, float>(argc, argv, false, "CHER"); + errors += clblast::RunTests, 
clblast::double2, double>(argc, argv, true, "ZHER"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xher2.cpp b/test/correctness/routines/level2/xher2.cpp index 0b4af4d0..544ab16d 100644 --- a/test/correctness/routines/level2/xher2.cpp +++ b/test/correctness/routines/level2/xher2.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xher2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHER2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHER2"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHER2"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHER2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhpmv.cpp b/test/correctness/routines/level2/xhpmv.cpp index dd77df71..30d23b8f 100644 --- a/test/correctness/routines/level2/xhpmv.cpp +++ b/test/correctness/routines/level2/xhpmv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHPMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHPMV"); if (errors > 0) { return 1; } else { return 0; } } 
diff --git a/test/correctness/routines/level2/xhpr.cpp b/test/correctness/routines/level2/xhpr.cpp index 5a3f615f..ed876857 100644 --- a/test/correctness/routines/level2/xhpr.cpp +++ b/test/correctness/routines/level2/xhpr.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHPR"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHPR"); + errors += clblast::RunTests, clblast::float2, float>(argc, argv, false, "CHPR"); + errors += clblast::RunTests, clblast::double2, double>(argc, argv, true, "ZHPR"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhpr2.cpp b/test/correctness/routines/level2/xhpr2.cpp index 8218b444..b3bd167a 100644 --- a/test/correctness/routines/level2/xhpr2.cpp +++ b/test/correctness/routines/level2/xhpr2.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPR2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPR2"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHPR2"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHPR2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xsbmv.cpp b/test/correctness/routines/level2/xsbmv.cpp index 7918cb21..e097e765 100644 
--- a/test/correctness/routines/level2/xsbmv.cpp +++ b/test/correctness/routines/level2/xsbmv.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSBMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xspmv.cpp b/test/correctness/routines/level2/xspmv.cpp index 78210660..ff42e5b9 100644 --- a/test/correctness/routines/level2/xspmv.cpp +++ b/test/correctness/routines/level2/xspmv.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xspmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSPMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xspr.cpp b/test/correctness/routines/level2/xspr.cpp index d05adf34..a478df55 100644 --- a/test/correctness/routines/level2/xspr.cpp +++ b/test/correctness/routines/level2/xspr.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include 
"test/routines/level2/xspr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSPR"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xspr2.cpp b/test/correctness/routines/level2/xspr2.cpp index caa46a09..8310f5d1 100644 --- a/test/correctness/routines/level2/xspr2.cpp +++ b/test/correctness/routines/level2/xspr2.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xspr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR2"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR2"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSPR2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xsymv.cpp b/test/correctness/routines/level2/xsymv.cpp index 978a5f8a..41c6ce00 100644 --- a/test/correctness/routines/level2/xsymv.cpp +++ b/test/correctness/routines/level2/xsymv.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsymv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast 
namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSYMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xsyr.cpp b/test/correctness/routines/level2/xsyr.cpp index 244dbfb4..9c949e09 100644 --- a/test/correctness/routines/level2/xsyr.cpp +++ b/test/correctness/routines/level2/xsyr.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsyr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYR"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xsyr2.cpp b/test/correctness/routines/level2/xsyr2.cpp index 422e67ad..c3629c26 100644 --- a/test/correctness/routines/level2/xsyr2.cpp +++ b/test/correctness/routines/level2/xsyr2.cpp @@ -12,16 +12,12 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsyr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); errors += clblast::RunTests, double, 
double>(argc, argv, true, "DSYR2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYR2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtbmv.cpp b/test/correctness/routines/level2/xtbmv.cpp index 491708ec..609a962a 100644 --- a/test/correctness/routines/level2/xtbmv.cpp +++ b/test/correctness/routines/level2/xtbmv.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STBMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTBMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTBMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTBMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HTBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtbsv.cpp b/test/correctness/routines/level2/xtbsv.cpp index 12b5dca5..5cfc6942 100644 --- a/test/correctness/routines/level2/xtbsv.cpp +++ b/test/correctness/routines/level2/xtbsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtbsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int 
main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STBSV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTBSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBSV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTBSV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTBSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtpmv.cpp b/test/correctness/routines/level2/xtpmv.cpp index b89f0adc..3bf2d7aa 100644 --- a/test/correctness/routines/level2/xtpmv.cpp +++ b/test/correctness/routines/level2/xtpmv.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STPMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTPMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTPMV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTPMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HTPMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtpsv.cpp b/test/correctness/routines/level2/xtpsv.cpp index 6e6e7c85..28c9fe39 100644 --- 
a/test/correctness/routines/level2/xtpsv.cpp +++ b/test/correctness/routines/level2/xtpsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtpsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STPSV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTPSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPSV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTPSV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTPSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtrmv.cpp b/test/correctness/routines/level2/xtrmv.cpp index 819f5cad..11cffa5d 100644 --- a/test/correctness/routines/level2/xtrmv.cpp +++ b/test/correctness/routines/level2/xtrmv.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtrmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STRMV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTRMV"); + errors += 
clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMV"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HTRMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtrsv.cpp b/test/correctness/routines/level2/xtrsv.cpp index 78e33807..b35d7fc7 100644 --- a/test/correctness/routines/level2/xtrsv.cpp +++ b/test/correctness/routines/level2/xtrsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtrsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STRSV"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSV"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTRSV"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp index 54d41719..5de73554 100644 --- a/test/correctness/routines/level3/xgemm.cpp +++ b/test/correctness/routines/level3/xgemm.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xgemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMM"); errors += clblast::RunTests, double, double>(argc, argv, 
true, "DGEMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xhemm.cpp b/test/correctness/routines/level3/xhemm.cpp index 76c970a7..cbd277e2 100644 --- a/test/correctness/routines/level3/xhemm.cpp +++ b/test/correctness/routines/level3/xhemm.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xhemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CHEMM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xher2k.cpp b/test/correctness/routines/level3/xher2k.cpp index c653265e..e21a429c 100644 --- a/test/correctness/routines/level3/xher2k.cpp +++ b/test/correctness/routines/level3/xher2k.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xher2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the 
clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER2K"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER2K"); + errors += clblast::RunTests, clblast::float2, float>(argc, argv, false, "CHER2K"); + errors += clblast::RunTests, clblast::double2, double>(argc, argv, true, "ZHER2K"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xherk.cpp b/test/correctness/routines/level3/xherk.cpp index 09ea9e4d..5665147e 100644 --- a/test/correctness/routines/level3/xherk.cpp +++ b/test/correctness/routines/level3/xherk.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xherk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHERK"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHERK"); + errors += clblast::RunTests, clblast::float2, float>(argc, argv, false, "CHERK"); + errors += clblast::RunTests, clblast::double2, double>(argc, argv, true, "ZHERK"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsymm.cpp b/test/correctness/routines/level3/xsymm.cpp index 3cb3515a..13d1b685 100644 --- a/test/correctness/routines/level3/xsymm.cpp +++ b/test/correctness/routines/level3/xsymm.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsymm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, 
"SSYMM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSYMM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSYMM"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsyr2k.cpp b/test/correctness/routines/level3/xsyr2k.cpp index 617af04d..7f7c6471 100644 --- a/test/correctness/routines/level3/xsyr2k.cpp +++ b/test/correctness/routines/level3/xsyr2k.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsyr2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2K"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR2K"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYR2K"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYR2K"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2K"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSYR2K"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSYR2K"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYR2K"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsyrk.cpp b/test/correctness/routines/level3/xsyrk.cpp 
index 2014b8d0..6ae6733c 100644 --- a/test/correctness/routines/level3/xsyrk.cpp +++ b/test/correctness/routines/level3/xsyrk.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsyrk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSYRK"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSYRK"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYRK"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYRK"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYRK"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSYRK"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSYRK"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSYRK"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xtrmm.cpp b/test/correctness/routines/level3/xtrmm.cpp index 32640d52..2d42b541 100644 --- a/test/correctness/routines/level3/xtrmm.cpp +++ b/test/correctness/routines/level3/xtrmm.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xtrmm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STRMM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, 
"ZTRMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTRMM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMM"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HTRMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp index 6119bd17..dcc20060 100644 --- a/test/correctness/routines/level3/xtrsm.cpp +++ b/test/correctness/routines/level3/xtrsm.cpp @@ -12,18 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xtrsm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "STRSM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRSM"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CTRSM"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/levelx/xaxpybatched.cpp b/test/correctness/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..0b56b1c2 --- /dev/null +++ b/test/correctness/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xaxpybatched.hpp" + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SAXPYBATCHED"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DAXPYBATCHED"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CAXPYBATCHED"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPYBATCHED"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HAXPYBATCHED"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/levelx/xgemmbatched.cpp b/test/correctness/routines/levelx/xgemmbatched.cpp new file mode 100644 index 00000000..cb6751c8 --- /dev/null +++ b/test/correctness/routines/levelx/xgemmbatched.cpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xgemmbatched.hpp" + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMMBATCHED"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMMBATCHED"); + errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CGEMMBATCHED"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMMBATCHED"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HGEMMBATCHED"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/levelx/xinvert.cpp b/test/correctness/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..0ccc0829 --- /dev/null +++ b/test/correctness/routines/levelx/xinvert.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SINVERT"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DINVERT"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CINVERT"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZINVERT"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HINVERT"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/levelx/xomatcopy.cpp b/test/correctness/routines/levelx/xomatcopy.cpp index e034bc18..5eeabc0c 100644 --- a/test/correctness/routines/levelx/xomatcopy.cpp +++ b/test/correctness/routines/levelx/xomatcopy.cpp @@ -12,18 +12,14 @@ #include "test/correctness/testblas.hpp" #include "test/routines/levelx/xomatcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SOMATCOPY"); errors += clblast::RunTests, double, double>(argc, argv, true, "DOMATCOPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "COMATCOPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZOMATCOPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HOMATCOPY"); + errors += clblast::RunTests, 
clblast::float2, clblast::float2>(argc, argv, true, "COMATCOPY"); + errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZOMATCOPY"); + errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HOMATCOPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 5fddb37b..7bc9c869 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -13,18 +13,23 @@ #include #include +#include +#include "utilities/utilities.hpp" #include "test/correctness/testblas.hpp" namespace clblast { // ================================================================================================= +template const int TestBlas::kSeed = 42; // fixed seed for reproducibility + // Test settings for the regular test. Append to these lists in case more tests are required. -template const std::vector TestBlas::kVectorDims = { 7, 93, 4096 }; +template const std::vector TestBlas::kVectorDims = { 7, 93, 144, 4096 }; template const std::vector TestBlas::kIncrements = { 1, 2, 7 }; template const std::vector TestBlas::kMatrixDims = { 7, 64 }; -template const std::vector TestBlas::kMatrixVectorDims = { 61, 512 }; +template const std::vector TestBlas::kMatrixVectorDims = { 61, 256 }; template const std::vector TestBlas::kBandSizes = { 4, 19 }; +template const std::vector TestBlas::kBatchCounts = { 1, 3 }; // Test settings for the invalid tests template const std::vector TestBlas::kInvalidIncrements = { 0, 1 }; @@ -51,6 +56,7 @@ template <> const std::vector TestBlas::kTransposes = template TestBlas::TestBlas(const std::vector &arguments, const bool silent, const std::string &name, const std::vector &options, + const DataPrepare prepare_data, const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const ResultGet get_result, const ResultIndex get_index, @@ -59,16 +65,19 @@ TestBlas::TestBlas(const std::vector &arguments, 
const bool si kOffsets(GetOffsets()), kAlphaValues(GetExampleScalars(full_test_)), kBetaValues(GetExampleScalars(full_test_)), + prepare_data_(prepare_data), run_routine_(run_routine), + run_reference1_(run_reference1), + run_reference2_(run_reference2), get_result_(get_result), get_index_(get_index), get_id1_(get_id1), get_id2_(get_id2) { - // Sets the reference to test against - if (compare_clblas_) { run_reference_ = run_reference1; } - else if (compare_cblas_) { run_reference_ = run_reference2; } - else { throw std::runtime_error("Invalid configuration: no reference to test against"); } + // Sanity check + if (!compare_clblas_ && !compare_cblas_) { + throw std::runtime_error("Invalid configuration: no reference to test against"); + } // Computes the maximum sizes. This allows for a single set of input/output buffers. const auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end()); @@ -77,22 +86,25 @@ TestBlas::TestBlas(const std::vector &arguments, const bool si const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end()); const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end()); // Creates test input data - x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); - scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); - PopulateVector(x_source_, kSeed); - 
PopulateVector(y_source_, kSeed); - PopulateVector(a_source_, kSeed); - PopulateVector(b_source_, kSeed); - PopulateVector(c_source_, kSeed); - PopulateVector(ap_source_, kSeed); - PopulateVector(scalar_source_, kSeed); + x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); + scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset); + std::mt19937 mt(kSeed); + std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_source_, mt, dist); + PopulateVector(y_source_, mt, dist); + PopulateVector(a_source_, mt, dist); + PopulateVector(b_source_, mt, dist); + PopulateVector(c_source_, mt, dist); + PopulateVector(ap_source_, mt, dist); + PopulateVector(scalar_source_, mt, dist); } // =============================================================================================== @@ -112,6 +124,11 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st std::cout << std::flush; } + // Optionally prepares the input data + prepare_data_(args, queue_, kSeed, + x_source_, y_source_, a_source_, b_source_, c_source_, + ap_source_, scalar_source_); + // Set-up for the CLBlast run auto x_vec2 = Buffer(context_, args.x_size); auto y_vec2 = Buffer(context_, args.y_size); @@ -138,7 +155,10 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st // Don't continue with CBLAS if there are incorrect parameters if (compare_cblas_ && status2 != 
StatusCode::kSuccess) { - if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } + if (verbose_) { + fprintf(stdout, " -> %d -> ", static_cast(status2)); + std::cout << std::flush; + } TestErrorCodes(status2, status2, args); continue; } @@ -166,7 +186,9 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } - const auto status1 = run_reference_(args, buffers1, queue_); + auto status1 = StatusCode::kSuccess; + if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } + else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } @@ -179,23 +201,41 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto result1 = get_result_(args, buffers1, queue_); auto result2 = get_result_(args, buffers2, queue_); + // Computes the L2 error + auto l2error = 0.0; + const auto kErrorMarginL2 = getL2ErrorMargin(); + for (auto id1=size_t{0}; id1(get_id1_(args) * get_id2_(args)); + // Checks for differences in the output auto errors = size_t{0}; for (auto id1=size_t{0}; id1= kErrorMarginL2) { errors++; } if (verbose_) { if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); + if (l2error < kErrorMarginL2) { + fprintf(stdout, " - error suppressed by a low total L2 error\n"); + } } } } } - if (verbose_ && errors > 0) { fprintf(stdout, "\n "); } + + // Report the results + if (verbose_ && errors > 0) { + fprintf(stdout, "\n Combined average L2 error: %.2e\n ", l2error); + } // Tests the error count (should be zero) TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); @@ -269,7 
+309,9 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } - const auto status1 = run_reference_(args, buffers1, queue_); + auto status1 = StatusCode::kSuccess; + if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } + else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index 27fd84c3..560ff4d3 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -30,7 +30,7 @@ namespace clblast { template class TestBlas: public Tester { public: - static constexpr auto kSeed = 42; // fixed seed for reproducibility + static const int kSeed; // Uses several variables from the Tester class using Tester::context_; @@ -56,6 +56,7 @@ class TestBlas: public Tester { static const std::vector kMatrixDims; static const std::vector kMatrixVectorDims; static const std::vector kBandSizes; + static const std::vector kBatchCounts; const std::vector kOffsets; const std::vector kAlphaValues; const std::vector kBetaValues; @@ -74,6 +75,10 @@ class TestBlas: public Tester { static const std::vector kTransposes; // Data-type dependent, see .cc-file // Shorthand for the routine-specific functions passed to the tester + using DataPrepare = std::function&, Queue&, const int, + std::vector&, std::vector&, + std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&)>; using Routine = std::function&, Buffers&, Queue&)>; using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; @@ -82,6 +87,7 @@ class TestBlas: public Tester { // Constructor, initializes the base class tester and input data TestBlas(const std::vector &arguments, const bool silent, const 
std::string &name, const std::vector &options, + const DataPrepare prepare_data, const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const ResultGet get_result, const ResultIndex get_index, @@ -103,32 +109,48 @@ class TestBlas: public Tester { std::vector scalar_source_; // The routine-specific functions passed to the tester - Routine run_routine_; - Routine run_reference_; - ResultGet get_result_; - ResultIndex get_index_; - ResultIterator get_id1_; - ResultIterator get_id2_; + const DataPrepare prepare_data_; + const Routine run_routine_; + const Routine run_reference1_; + const Routine run_reference2_; + const ResultGet get_result_; + const ResultIndex get_index_; + const ResultIterator get_id1_; + const ResultIterator get_id2_; }; // ================================================================================================= +// Bogus reference function, in case a comparison library is not available +template +static StatusCode ReferenceNotAvailable(const Arguments &, BufferType &, Queue &) { + return StatusCode::kNotImplemented; +} + // The interface to the correctness tester. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". 
template size_t RunTests(int argc, char *argv[], const bool silent, const std::string &name) { auto command_line_args = RetrieveCommandLineArguments(argc, argv); - // Sets the reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - const auto reference_routine1 = C::RunReference1; // clBLAS - const auto reference_routine2 = C::RunReference2; // CBLAS - #elif CLBLAST_REF_CLBLAS - const auto reference_routine1 = C::RunReference1; // clBLAS - const auto reference_routine2 = C::RunReference1; // not used, dummy - #elif CLBLAST_REF_CBLAS - const auto reference_routine1 = C::RunReference2; // not used, dummy - const auto reference_routine2 = C::RunReference2; // CBLAS + // Sets the clBLAS reference to test against + #ifdef CLBLAST_REF_CLBLAS + auto reference_routine1 = C::RunReference1; // clBLAS when available + #else + auto reference_routine1 = ReferenceNotAvailable>; + #endif + + // Sets the CBLAS reference to test against + #ifdef CLBLAST_REF_CBLAS + auto reference_routine2 = [](const Arguments &args, Buffers &buffers, Queue &queue) -> StatusCode { + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, C::BuffersIn()); + C::RunReference2(args, buffers_host, queue); + HostToDevice(args, buffers, buffers_host, queue, C::BuffersOut()); + return StatusCode::kSuccess; + }; + #else + auto reference_routine2 = ReferenceNotAvailable>; #endif // Non-BLAS routines cannot be fully tested @@ -141,7 +163,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na // Creates a tester auto options = C::GetOptions(); TestBlas tester{command_line_args, silent, name, options, - C::RunRoutine, reference_routine1, reference_routine2, + C::PrepareData, C::RunRoutine, reference_routine1, reference_routine2, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine @@ -177,6 +199,7 @@ size_t RunTests(int argc, char *argv[], 
const bool silent, const std::string &na auto imax_offsets = std::vector{args.imax_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; + auto batch_counts = std::vector{args.batch_count}; auto x_sizes = std::vector{args.x_size}; auto y_sizes = std::vector{args.y_size}; auto a_sizes = std::vector{args.a_size}; @@ -220,6 +243,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } + if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; } if (option == kArgXOffset) { x_sizes = tester.kVecSizes; } if (option == kArgYOffset) { y_sizes = tester.kVecSizes; } @@ -262,8 +286,10 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset; for (auto &alpha: alphas) { r_args.alpha = alpha; for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index c449b09d..d1f3cbb2 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -22,22 +22,52 @@ namespace clblast { // ================================================================================================= -// Eror margings (relative and absolute) +// Relative error margins template float getRelativeErrorMargin() { return 0.005f; // 0.5% is considered acceptable for float/double-precision } +template float getRelativeErrorMargin(); // as the above default +template float getRelativeErrorMargin(); // as the above default +template float getRelativeErrorMargin(); // as the above default 
+template float getRelativeErrorMargin(); // as the above default template <> float getRelativeErrorMargin() { return 0.080f; // 8% (!) error is considered acceptable for half-precision } + +// Absolute error margins template float getAbsoluteErrorMargin() { return 0.001f; } +template float getAbsoluteErrorMargin(); // as the above default +template float getAbsoluteErrorMargin(); // as the above default +template float getAbsoluteErrorMargin(); // as the above default +template float getAbsoluteErrorMargin(); // as the above default template <> float getAbsoluteErrorMargin() { - return 0.10f; // especially small values are inaccurate for half-precision + return 0.15f; // especially small values are inaccurate for half-precision +} + +// L2 error margins +template +double getL2ErrorMargin() { + return 0.0f; // zero means don't look at the L2 error margin at all, use the other metrics +} +template double getL2ErrorMargin(); // as the above default +template double getL2ErrorMargin(); // as the above default +template double getL2ErrorMargin(); // as the above default +template double getL2ErrorMargin(); // as the above default +template <> +double getL2ErrorMargin() { + return 0.05; // half-precision results are considered OK as long as the L2 error is low enough +} + +// Error margin: numbers beyond this value are considered equal to inf or NaN +template +T getAlmostInfNumber() { + return static_cast(1e35); // used for correctness testing of TRSV and TRSM routines } // Maximum number of test results printed on a single line @@ -86,24 +116,44 @@ Tester::Tester(const std::vector &arguments, const bool silent tests_failed_{0} { options_ = options; + // Determines which reference is the default + #if defined(CLBLAST_REF_CBLAS) + auto default_cblas = 0; + #endif + #if defined(CLBLAST_REF_CLBLAS) + auto default_clblas = 0; + #endif + #if defined(CLBLAST_REF_CUBLAS) + auto default_cublas = 0; + #endif + #if defined(CLBLAST_REF_CBLAS) + default_cblas = 1; + #elif 
defined(CLBLAST_REF_CLBLAS) + default_clblas = 1; + #elif defined(CLBLAST_REF_CUBLAS) + default_cublas = 1; + #endif + // Determines which reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 0); - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #elif CLBLAST_REF_CLBLAS - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 1); - compare_cblas_ = 0; - #elif CLBLAST_REF_CBLAS - compare_clblas_ = 0; - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #else - compare_clblas_ = 0; - compare_cblas_ = 0; + compare_clblas_ = 0; + compare_cblas_ = 0; + compare_cublas_ = 0; + #if defined(CLBLAST_REF_CBLAS) + compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, default_cblas); + #endif + #if defined(CLBLAST_REF_CLBLAS) + compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, default_clblas); + #endif + #if defined(CLBLAST_REF_CUBLAS) + compare_cublas_ = GetArgument(arguments, help_, kArgComparecublas, default_cublas); #endif // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } + // Support for cuBLAS not available yet + if (compare_cublas_) { throw std::runtime_error("Cannot test against cuBLAS; not implemented yet"); } + // Can only test against a single reference (not two, not zero) if (compare_clblas_ && compare_cblas_) { throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); @@ -138,6 +188,9 @@ Tester::Tester(const std::vector &arguments, const bool silent kUnsupportedReference.c_str()); fprintf(stdout, "* Testing with error margins of %.1lf%% (relative) and %.3lf (absolute)\n", 100.0f * getRelativeErrorMargin(), getAbsoluteErrorMargin()); + if (getL2ErrorMargin() != 0.0f) { + fprintf(stdout, "* and a combined maximum allowed L2 error of %.2e\n", 
getL2ErrorMargin()); + } // Initializes clBLAS #ifdef CLBLAST_REF_CLBLAS @@ -248,8 +301,29 @@ template void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, const Arguments &args) { + // Either an OpenCL or CLBlast internal error occurred, fail the test immediately + // NOTE: the OpenCL error codes grow downwards without any declared lower bound, hence the magic + // number. The last error code is atm around -70, but -500 is chosen to be on the safe side. + if (clblast_status != StatusCode::kSuccess && + (clblast_status > static_cast(-500) /* matches OpenCL errors (see above) */ || + clblast_status < StatusCode::kNotImplemented) /* matches CLBlast internal errors */) { + PrintTestResult(kErrorStatus); + ReportError({StatusCode::kSuccess, clblast_status, kStatusError, args}); + if (verbose_) { + fprintf(stdout, "\n"); + PrintErrorLog({{StatusCode::kSuccess, clblast_status, kStatusError, args}}); + fprintf(stdout, " "); + } + } + + // Routine is not implemented + else if (clblast_status == StatusCode::kNotImplemented) { + PrintTestResult(kSkippedCompilation); + ReportSkipped(); + } + // Cannot compare error codes against a library other than clBLAS - if (compare_cblas_) { + else if (compare_cblas_) { PrintTestResult(kUnsupportedReference); ReportSkipped(); } @@ -267,13 +341,6 @@ void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCod ReportSkipped(); } - // Could not compile the CLBlast kernel properly - else if (clblast_status == StatusCode::kOpenCLBuildProgramFailure || - clblast_status == StatusCode::kNotImplemented) { - PrintTestResult(kSkippedCompilation); - ReportSkipped(); - } - // Error occurred else { PrintTestResult(kErrorStatus); @@ -318,6 +385,9 @@ std::string Tester::GetOptionsString(const Arguments &args) { if (o == kArgCOffset) { result += kArgCOffset + equals + ToString(args.c_offset) + " "; } if (o == kArgAPOffset) { result += kArgAPOffset + equals + ToString(args.ap_offset) + " "; } if (o == 
kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } + if (o == kArgAlpha) { result += kArgAlpha + equals + ToString(args.alpha) + " "; } + if (o == kArgBeta) { result += kArgBeta + equals + ToString(args.beta) + " "; } + if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; } } return result; } @@ -385,10 +455,12 @@ template void Tester::PrintErrorLog(const std::vector &error_log) { for (auto &entry: error_log) { if (entry.error_percentage != kStatusError) { - fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage); + fprintf(stdout, " Error rate %.2lf%%: ", entry.error_percentage); } else { - fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect); + fprintf(stdout, " Status code %d (expected %d): ", + static_cast(entry.status_found), + static_cast(entry.status_expect)); } fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str()); } @@ -410,6 +482,21 @@ bool TestSimilarityNear(const T val1, const T val2, if (val1 == val2) { return true; } + // Handles cases with both results NaN or inf + else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) { + return true; + } + // Also considers it OK if one of the results in NaN and the other is inf + // Note: for TRSV and TRSM routines + else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) { + return true; + } + // Also considers it OK if one of the values is super large and the other is inf or NaN + // Note: for TRSV and TRSM routines + else if ((std::abs(val1) > getAlmostInfNumber() && (std::isinf(val2) || std::isnan(val2))) || + (std::abs(val2) > getAlmostInfNumber() && (std::isinf(val1) || std::isnan(val1)))) { + return true; + } // The values are zero or very small: the relative error is less meaningful else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) { return (difference < error_margin_absolute); @@ 
-436,15 +523,21 @@ template bool TestSimilarity(const double, const double); // Specialisations for non-standard data-types template <> bool TestSimilarity(const float2 val1, const float2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); + const auto real = TestSimilarity(val1.real(), val2.real()); + const auto imag = TestSimilarity(val1.imag(), val2.imag()); + if (real && imag) { return true; } + // also OK if one is good and the combined is good (indicates a big diff between real & imag) + if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } + return false; // neither real nor imag is good, return false } template <> bool TestSimilarity(const double2 val1, const double2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); + const auto real = TestSimilarity(val1.real(), val2.real()); + const auto imag = TestSimilarity(val1.imag(), val2.imag()); + if (real && imag) { return true; } + // also OK if one is good and the combined is good (indicates a big diff between real & imag) + if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } + return false; // neither real nor imag is good, return false } template <> bool TestSimilarity(const half val1, const half val2) { @@ -456,6 +549,37 @@ bool TestSimilarity(const half val1, const half val2) { // ================================================================================================= +// Retrieves the squared difference, used for example for computing the L2 error +template +double SquaredDifference(const T val1, const T val2) { + const auto difference = (val1 - val2); + return static_cast(difference * difference); +} + +// Compiles the default case for standard data-types +template double SquaredDifference(const float, const float); 
+template double SquaredDifference(const double, const double); + +// Specialisations for non-standard data-types +template <> +double SquaredDifference(const float2 val1, const float2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const double2 val1, const double2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const half val1, const half val2) { + return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2)); +} + +// ================================================================================================= + // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various // routines. This function is specialised for the different data-types. 
template <> const std::vector GetExampleScalars(const bool full_test) { diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp index d8462cef..8cfa702f 100644 --- a/test/correctness/tester.hpp +++ b/test/correctness/tester.hpp @@ -22,14 +22,14 @@ #include #include +#include "utilities/utilities.hpp" + // The libraries #ifdef CLBLAST_REF_CLBLAS #include #endif #include "clblast.h" -#include "utilities/utilities.hpp" - namespace clblast { // ================================================================================================= @@ -113,6 +113,7 @@ class Tester { // Testing against reference implementations int compare_cblas_; int compare_clblas_; + int compare_cublas_; private: @@ -150,11 +151,20 @@ class Tester { // template specialization) // ================================================================================================= +// Error margins +template float getRelativeErrorMargin(); +template float getAbsoluteErrorMargin(); +template double getL2ErrorMargin(); + // Compares two floating point values and returns whether they are within an acceptable error // margin. This replaces GTest's EXPECT_NEAR(). template bool TestSimilarity(const T val1, const T val2); +// Retrieves the squared difference, used for example for computing the L2 error +template +double SquaredDifference(const T val1, const T val2); + // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various // routines. This function is specialised for the different data-types. 
template diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 2c45b35e..dc98ffbd 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -11,27 +11,36 @@ // // ================================================================================================= -#include "test/performance/client.hpp" - #include #include #include #include #include +#include + +#include "utilities/utilities.hpp" +#include "test/performance/client.hpp" namespace clblast { // ================================================================================================= +template const int Client::kSeed = 42; // fixed seed for reproducibility + // Constructor template Client::Client(const Routine run_routine, - const Routine run_reference1, const Routine run_reference2, - const std::vector &options, + const Reference1 run_reference1, const Reference2 run_reference2, + const Reference3 run_reference3, const std::vector &options, + const std::vector &buffers_in, + const std::vector &buffers_out, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), run_reference1_(run_reference1), run_reference2_(run_reference2), + run_reference3_(run_reference3), options_(options), + buffers_in_(buffers_in), + buffers_out_(buffers_out), get_flops_(get_flops), get_bytes_(get_bytes) { } @@ -89,6 +98,9 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le if (o == kArgAsumOffset) { args.asum_offset = GetArgument(command_line_args, help, kArgAsumOffset, size_t{0}); } if (o == kArgImaxOffset) { args.imax_offset = GetArgument(command_line_args, help, kArgImaxOffset, size_t{0}); } + // Batch arguments + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); } + // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, 
GetScalar()); } @@ -108,6 +120,11 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le #else args.compare_cblas = 0; #endif + #ifdef CLBLAST_REF_CUBLAS + args.compare_cublas = GetArgument(command_line_args, help, kArgComparecublas, 1); + #else + args.compare_cublas = 0; + #endif args.step = GetArgument(command_line_args, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(command_line_args, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10}); @@ -122,24 +139,26 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le // Comparison against a non-BLAS routine is not supported if (level == 4) { // level-4 == level-X - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for this non-BLAS routine\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } - // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + // Comparison against other BLAS libraries is not supported in case of half-precision if (args.precision == Precision::kHalf) { - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for half-precision\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } // Returns the arguments @@ -163,6 +182,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) #ifdef CLBLAST_REF_CLBLAS if (args.compare_clblas) 
{ clblasSetup(); } #endif + #ifdef CLBLAST_REF_CUBLAS + if (args.compare_cublas) { cublasSetup(args); } + #endif // Iterates over all "num_step" values jumping by "step" each time auto s = size_t{0}; @@ -179,13 +201,15 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) std::vector c_source(args.c_size); std::vector ap_source(args.ap_size); std::vector scalar_source(args.scalar_size); - PopulateVector(x_source, kSeed); - PopulateVector(y_source, kSeed); - PopulateVector(a_source, kSeed); - PopulateVector(b_source, kSeed); - PopulateVector(c_source, kSeed); - PopulateVector(ap_source, kSeed); - PopulateVector(scalar_source, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_source, mt, dist); + PopulateVector(y_source, mt, dist); + PopulateVector(a_source, mt, dist); + PopulateVector(b_source, mt, dist); + PopulateVector(c_source, mt, dist); + PopulateVector(ap_source, mt, dist); + PopulateVector(scalar_source, mt, dist); // Creates the matrices on the device auto x_vec = Buffer(context, args.x_size); @@ -213,9 +237,22 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) timings.push_back(std::pair("clBLAS", ms_clblas)); } if (args.compare_cblas) { - auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, buffers_in_); + auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS"); + HostToDevice(args, buffers, buffers_host, queue, buffers_out_); timings.push_back(std::pair("CPU BLAS", ms_cblas)); } + if (args.compare_cublas) { + auto buffers_host = BuffersHost(); + auto buffers_cuda = BuffersCUDA(); + DeviceToHost(args, buffers, buffers_host, queue, buffers_in_); + HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_); + auto ms_cublas = 
TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS"); + CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_); + HostToDevice(args, buffers, buffers_host, queue, buffers_out_); + timings.push_back(std::pair("cuBLAS", ms_cublas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -235,6 +272,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) #ifdef CLBLAST_REF_CLBLAS if (args.compare_clblas) { clblasTeardown(); } #endif + #ifdef CLBLAST_REF_CUBLAS + if (args.compare_cublas) { cublasTeardown(args); } + #endif } // ================================================================================================= @@ -243,9 +283,10 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // timing is performed using the milliseconds chrono functions. The function returns the minimum // value found in the vector of timing results. The return value is in milliseconds. template +template double Client::TimedExecution(const size_t num_runs, const Arguments &args, - Buffers &buffers, Queue &queue, - Routine run_blas, const std::string &library_name) { + BufferType &buffers, Queue &queue, + RoutineType run_blas, const std::string &library_name) { auto status = StatusCode::kSuccess; // Do an optional warm-up to omit compilation times and initialisations from the measurements @@ -290,6 +331,7 @@ void Client::PrintTableHeader(const Arguments& args) { fprintf(stdout, " | <-- CLBlast -->"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } + if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); } fprintf(stdout, " |\n"); } @@ -298,6 +340,7 @@ void Client::PrintTableHeader(const Arguments& args) { fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } if (args.compare_cblas) { 
fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } + if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); } fprintf(stdout, "\n"); } @@ -335,6 +378,7 @@ void Client::PrintTableRow(const Arguments& args, else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } + else if (o == kArgBatchCount){integers.push_back(args.batch_count); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 4554c67f..47a13017 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -25,14 +25,15 @@ #include #include +#include "utilities/utilities.hpp" + // The libraries to test #ifdef CLBLAST_REF_CLBLAS #include #endif +#include "test/wrapper_cuda.hpp" #include "clblast.h" -#include "utilities/utilities.hpp" - namespace clblast { // ================================================================================================= @@ -40,16 +41,20 @@ namespace clblast { template class Client { public: - static constexpr auto kSeed = 42; // fixed seed for reproducibility + static const int kSeed; // Shorthand for the routine-specific functions passed to the tester using Routine = std::function&, Buffers&, Queue&)>; + using Reference1 = std::function&, Buffers&, Queue&)>; + using Reference2 = std::function&, BuffersHost&, Queue&)>; + using Reference3 = std::function&, BuffersCUDA&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; // The constructor - Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2, - const std::vector &options, + Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, + const Reference3 run_reference3, const std::vector &options, + const std::vector 
&buffers_in, const std::vector &buffers_out, const GetMetric get_flops, const GetMetric get_bytes); // Parses all command-line arguments, filling in the arguments structure. If no command-line @@ -66,8 +71,9 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments &args, Buffers &buffers, - Queue &queue, Routine run_blas, const std::string &library_name); + template + double TimedExecution(const size_t num_runs, const Arguments &args, BufferType &buffers, + Queue &queue, RoutineType run_blas, const std::string &library_name); // Prints the header of a performance-data table void PrintTableHeader(const Arguments& args); @@ -78,9 +84,12 @@ class Client { // The routine-specific functions passed to the tester const Routine run_routine_; - const Routine run_reference1_; - const Routine run_reference2_; + const Reference1 run_reference1_; + const Reference2 run_reference2_; + const Reference3 run_reference3_; const std::vector options_; + const std::vector buffers_in_; + const std::vector buffers_out_; const GetMetric get_flops_; const GetMetric get_bytes_; @@ -91,8 +100,8 @@ class Client { // ================================================================================================= // Bogus reference function, in case a comparison library is not available -template -static StatusCode ReferenceNotAvailable(const Arguments &, Buffers &, Queue &) { +template +static StatusCode ReferenceNotAvailable(const Arguments &, BufferType &, Queue &) { return StatusCode::kNotImplemented; } @@ -105,17 +114,22 @@ void RunClient(int argc, char *argv[]) { #ifdef CLBLAST_REF_CLBLAS auto reference1 = C::RunReference1; // clBLAS when available #else - auto reference1 = ReferenceNotAvailable; + auto reference1 = ReferenceNotAvailable>; #endif #ifdef CLBLAST_REF_CBLAS auto reference2 = C::RunReference2; // CBLAS when available #else - auto reference2 = 
ReferenceNotAvailable; + auto reference2 = ReferenceNotAvailable>; + #endif + #ifdef CLBLAST_REF_CUBLAS + auto reference3 = C::RunReference3; // cuBLAS when available + #else + auto reference3 = ReferenceNotAvailable>; #endif // Creates a new client - auto client = Client(C::RunRoutine, reference1, reference2, C::GetOptions(), - C::GetFlops, C::GetBytes); + auto client = Client(C::RunRoutine, reference1, reference2, reference3, C::GetOptions(), + C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults auto args = client.ParseArguments(argc, argv, C::BLASLevel(), diff --git a/test/performance/routines/level1/xamax.cpp b/test/performance/routines/level1/xamax.cpp index 5dc7b3d9..9a40b718 100644 --- a/test/performance/routines/level1/xamax.cpp +++ b/test/performance/routines/level1/xamax.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xamax.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); 
break; } return 0; } diff --git a/test/performance/routines/level1/xasum.cpp b/test/performance/routines/level1/xasum.cpp index bf5b2fa9..b7a0e053 100644 --- a/test/performance/routines/level1/xasum.cpp +++ b/test/performance/routines/level1/xasum.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xasum.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xaxpy.cpp b/test/performance/routines/level1/xaxpy.cpp index faccc089..86c209c2 100644 --- a/test/performance/routines/level1/xaxpy.cpp +++ b/test/performance/routines/level1/xaxpy.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xaxpy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = 
clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xcopy.cpp b/test/performance/routines/level1/xcopy.cpp index 8aa536af..759bc34e 100644 --- a/test/performance/routines/level1/xcopy.cpp +++ b/test/performance/routines/level1/xcopy.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, 
clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xdot.cpp b/test/performance/routines/level1/xdot.cpp index 9a570e1e..8fc59c84 100644 --- a/test/performance/routines/level1/xdot.cpp +++ b/test/performance/routines/level1/xdot.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xdotc.cpp b/test/performance/routines/level1/xdotc.cpp index 426b81ae..308bcdab 100644 --- a/test/performance/routines/level1/xdotc.cpp +++ b/test/performance/routines/level1/xdotc.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdotc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw 
std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xdotu.cpp b/test/performance/routines/level1/xdotu.cpp index 4fbe167d..fc54a8a5 100644 --- a/test/performance/routines/level1/xdotu.cpp +++ b/test/performance/routines/level1/xdotu.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdotu.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xnrm2.cpp b/test/performance/routines/level1/xnrm2.cpp index 6a1cdcc7..3bd74377 100644 --- a/test/performance/routines/level1/xnrm2.cpp +++ b/test/performance/routines/level1/xnrm2.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xnrm2.hpp" -// Shortcuts to the clblast namespace 
-using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xrot.cpp b/test/performance/routines/level1/xrot.cpp index 2b94ca39..f010e04a 100644 --- a/test/performance/routines/level1/xrot.cpp +++ b/test/performance/routines/level1/xrot.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotg.cpp b/test/performance/routines/level1/xrotg.cpp index ee6fc44b..4c8d33cf 100644 --- a/test/performance/routines/level1/xrotg.cpp +++ b/test/performance/routines/level1/xrotg.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotg.hpp" -// Shortcuts to the clblast namespace 
-using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotm.cpp b/test/performance/routines/level1/xrotm.cpp index e8d73311..bc2111b3 100644 --- a/test/performance/routines/level1/xrotm.cpp +++ b/test/performance/routines/level1/xrotm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotmg.cpp b/test/performance/routines/level1/xrotmg.cpp index a5266b14..fb568243 100644 --- a/test/performance/routines/level1/xrotmg.cpp +++ b/test/performance/routines/level1/xrotmg.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotmg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xscal.cpp b/test/performance/routines/level1/xscal.cpp index 6fefc5d0..0dd78879 100644 --- a/test/performance/routines/level1/xscal.cpp +++ b/test/performance/routines/level1/xscal.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xscal.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const 
auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xswap.cpp b/test/performance/routines/level1/xswap.cpp index b728b8f4..475c1431 100644 --- a/test/performance/routines/level1/xswap.cpp +++ b/test/performance/routines/level1/xswap.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xswap.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + 
clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgbmv.cpp b/test/performance/routines/level2/xgbmv.cpp index 6a4b01f8..2246d260 100644 --- a/test/performance/routines/level2/xgbmv.cpp +++ b/test/performance/routines/level2/xgbmv.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgemv.cpp b/test/performance/routines/level2/xgemv.cpp index 335d5ef1..c00c0219 100644 --- a/test/performance/routines/level2/xgemv.cpp +++ b/test/performance/routines/level2/xgemv.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgemv.hpp" -// 
Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xger.cpp b/test/performance/routines/level2/xger.cpp index 50fdb9e6..89347020 100644 --- a/test/performance/routines/level2/xger.cpp +++ b/test/performance/routines/level2/xger.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xger.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case 
clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgerc.cpp b/test/performance/routines/level2/xgerc.cpp index 67c72285..0423cdd5 100644 --- a/test/performance/routines/level2/xgerc.cpp +++ b/test/performance/routines/level2/xgerc.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgerc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgeru.cpp b/test/performance/routines/level2/xgeru.cpp index 6e845bb8..c0fbb2d5 100644 --- a/test/performance/routines/level2/xgeru.cpp +++ b/test/performance/routines/level2/xgeru.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgeru.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case 
clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhbmv.cpp b/test/performance/routines/level2/xhbmv.cpp index 600317c1..d59cba26 100644 --- a/test/performance/routines/level2/xhbmv.cpp +++ b/test/performance/routines/level2/xhbmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhemv.cpp b/test/performance/routines/level2/xhemv.cpp index 7700cf7b..1664b6cd 100644 --- a/test/performance/routines/level2/xhemv.cpp +++ b/test/performance/routines/level2/xhemv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhemv.hpp" -// 
Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xher.cpp b/test/performance/routines/level2/xher.cpp index e7276aee..434f486c 100644 --- a/test/performance/routines/level2/xher.cpp +++ b/test/performance/routines/level2/xher.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xher.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; + clblast::RunClient, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; + clblast::RunClient, 
clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xher2.cpp b/test/performance/routines/level2/xher2.cpp index b4c53206..cce40a9e 100644 --- a/test/performance/routines/level2/xher2.cpp +++ b/test/performance/routines/level2/xher2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xher2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpmv.cpp b/test/performance/routines/level2/xhpmv.cpp index d9683d2e..d88791fe 100644 --- a/test/performance/routines/level2/xhpmv.cpp +++ b/test/performance/routines/level2/xhpmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw 
std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpr.cpp b/test/performance/routines/level2/xhpr.cpp index c4ffaf81..a92a3134 100644 --- a/test/performance/routines/level2/xhpr.cpp +++ b/test/performance/routines/level2/xhpr.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhpr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; + clblast::RunClient, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; + clblast::RunClient, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpr2.cpp b/test/performance/routines/level2/xhpr2.cpp index 3e5d4004..f34de29b 100644 --- a/test/performance/routines/level2/xhpr2.cpp +++ b/test/performance/routines/level2/xhpr2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include 
"test/routines/level2/xhpr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xsbmv.cpp b/test/performance/routines/level2/xsbmv.cpp index 9c0ab3b6..e20125c3 100644 --- a/test/performance/routines/level2/xsbmv.cpp +++ b/test/performance/routines/level2/xsbmv.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspmv.cpp b/test/performance/routines/level2/xspmv.cpp index 
6cc4e3ba..540dc76e 100644 --- a/test/performance/routines/level2/xspmv.cpp +++ b/test/performance/routines/level2/xspmv.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr.cpp b/test/performance/routines/level2/xspr.cpp index dc45ba6d..2fee42ee 100644 --- a/test/performance/routines/level2/xspr.cpp +++ b/test/performance/routines/level2/xspr.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr2.cpp b/test/performance/routines/level2/xspr2.cpp index 3c9a769f..38366ab7 100644 --- 
a/test/performance/routines/level2/xspr2.cpp +++ b/test/performance/routines/level2/xspr2.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsymv.cpp b/test/performance/routines/level2/xsymv.cpp index aaa98c8b..bc60af36 100644 --- a/test/performance/routines/level2/xsymv.cpp +++ b/test/performance/routines/level2/xsymv.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsymv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr.cpp b/test/performance/routines/level2/xsyr.cpp index d710bf63..beb34776 100644 --- a/test/performance/routines/level2/xsyr.cpp +++ 
b/test/performance/routines/level2/xsyr.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsyr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr2.cpp b/test/performance/routines/level2/xsyr2.cpp index 39b46b6a..847804d6 100644 --- a/test/performance/routines/level2/xsyr2.cpp +++ b/test/performance/routines/level2/xsyr2.cpp @@ -12,16 +12,12 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsyr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtbmv.cpp b/test/performance/routines/level2/xtbmv.cpp index 5fb3ea14..64cdc10c 100644 --- a/test/performance/routines/level2/xtbmv.cpp +++ b/test/performance/routines/level2/xtbmv.cpp @@ -12,24 +12,20 
@@ #include "test/performance/client.hpp" #include "test/routines/level2/xtbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtbsv.cpp b/test/performance/routines/level2/xtbsv.cpp index 7b88917c..4d37e76d 100644 --- a/test/performance/routines/level2/xtbsv.cpp +++ b/test/performance/routines/level2/xtbsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtbsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + 
clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtpmv.cpp b/test/performance/routines/level2/xtpmv.cpp index 907749a7..005085a9 100644 --- a/test/performance/routines/level2/xtpmv.cpp +++ b/test/performance/routines/level2/xtpmv.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtpsv.cpp b/test/performance/routines/level2/xtpsv.cpp index 0dab8ff6..b01a9f05 100644 --- a/test/performance/routines/level2/xtpsv.cpp +++ b/test/performance/routines/level2/xtpsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtpsv.hpp" -// 
Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtrmv.cpp b/test/performance/routines/level2/xtrmv.cpp index c2c6f232..27dc7390 100644 --- a/test/performance/routines/level2/xtrmv.cpp +++ b/test/performance/routines/level2/xtrmv.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtrmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case 
clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtrsv.cpp b/test/performance/routines/level2/xtrsv.cpp index 49e477f7..02255e71 100644 --- a/test/performance/routines/level2/xtrsv.cpp +++ b/test/performance/routines/level2/xtrsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtrsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xgemm.cpp b/test/performance/routines/level3/xgemm.cpp index deb2493f..5b3426f5 100644 --- a/test/performance/routines/level3/xgemm.cpp +++ b/test/performance/routines/level3/xgemm.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xgemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { 
case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xhemm.cpp b/test/performance/routines/level3/xhemm.cpp index 975c672f..6c3687a9 100644 --- a/test/performance/routines/level3/xhemm.cpp +++ b/test/performance/routines/level3/xhemm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xhemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xher2k.cpp 
b/test/performance/routines/level3/xher2k.cpp index d579d4f9..9d3385f7 100644 --- a/test/performance/routines/level3/xher2k.cpp +++ b/test/performance/routines/level3/xher2k.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xher2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; + clblast::RunClient, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; + clblast::RunClient, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xherk.cpp b/test/performance/routines/level3/xherk.cpp index 94411e5a..ae6e774e 100644 --- a/test/performance/routines/level3/xherk.cpp +++ b/test/performance/routines/level3/xherk.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xherk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case 
clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; + clblast::RunClient, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; + clblast::RunClient, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsymm.cpp b/test/performance/routines/level3/xsymm.cpp index 04ae8eb0..9efc2cca 100644 --- a/test/performance/routines/level3/xsymm.cpp +++ b/test/performance/routines/level3/xsymm.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xsymm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsyr2k.cpp b/test/performance/routines/level3/xsyr2k.cpp index 7b8b6f4f..28cf2bcc 100644 --- a/test/performance/routines/level3/xsyr2k.cpp +++ b/test/performance/routines/level3/xsyr2k.cpp @@ -12,24 +12,20 @@ #include 
"test/performance/client.hpp" #include "test/routines/level3/xsyr2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsyrk.cpp b/test/performance/routines/level3/xsyrk.cpp index ea0fc33b..917cc9b8 100644 --- a/test/performance/routines/level3/xsyrk.cpp +++ b/test/performance/routines/level3/xsyrk.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xsyrk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case 
clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xtrmm.cpp b/test/performance/routines/level3/xtrmm.cpp index 7a29e111..fc435a1a 100644 --- a/test/performance/routines/level3/xtrmm.cpp +++ b/test/performance/routines/level3/xtrmm.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xtrmm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xtrsm.cpp 
b/test/performance/routines/level3/xtrsm.cpp index ef094891..f44265f2 100644 --- a/test/performance/routines/level3/xtrsm.cpp +++ b/test/performance/routines/level3/xtrsm.cpp @@ -12,24 +12,19 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xtrsm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/levelx/xaxpybatched.cpp b/test/performance/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..48b65353 --- /dev/null +++ b/test/performance/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xaxpybatched.hpp" + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/levelx/xgemmbatched.cpp b/test/performance/routines/levelx/xgemmbatched.cpp new file mode 100644 index 00000000..d55a8749 --- /dev/null +++ b/test/performance/routines/levelx/xgemmbatched.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xgemmbatched.hpp" + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/levelx/xinvert.cpp b/test/performance/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..87f36b1e --- /dev/null +++ b/test/performance/routines/levelx/xinvert.cpp @@ -0,0 +1,37 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/levelx/xomatcopy.cpp b/test/performance/routines/levelx/xomatcopy.cpp index 5821c3b8..366ce496 100644 --- a/test/performance/routines/levelx/xomatcopy.cpp +++ b/test/performance/routines/levelx/xomatcopy.cpp @@ -12,24 +12,20 @@ #include "test/performance/client.hpp" #include "test/routines/levelx/xomatcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + 
clblast::RunClient, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; + clblast::RunClient, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; + clblast::RunClient, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/routines/common.hpp b/test/routines/common.hpp new file mode 100644 index 00000000..9708288a --- /dev/null +++ b/test/routines/common.hpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common includes for the clients and tests +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_COMMON_H_ +#define CLBLAST_TEST_ROUTINES_COMMON_H_ + +#include +#include + +#include "utilities/utilities.hpp" + +#ifdef CLBLAST_REF_CLBLAS + #include "test/wrapper_clblas.hpp" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "test/wrapper_cblas.hpp" +#endif +#include "test/wrapper_cuda.hpp" +#ifdef CLBLAST_REF_CUBLAS + #include "test/wrapper_cublas.hpp" +#endif + +// ================================================================================================= + +// CLBLAST_TEST_ROUTINES_COMMON_H_ +#endif diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index f98bdb06..04bdaa3d 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_ #define CLBLAST_TEST_ROUTINES_XAMAX_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXamax { kArgXInc, kArgXOffset, kArgImaxOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -68,6 +62,11 @@ class TestXamax { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const 
int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -96,19 +95,24 @@ class TestXamax { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXamax(args.n, - scalar_cpu, args.imax_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.imax_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXamax(reinterpret_cast(args.cublas_handle), args.n, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 64aa37c2..6add9c64 100644 --- 
a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XASUM_H_ #define CLBLAST_TEST_ROUTINES_XASUM_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXasum { kArgXInc, kArgXOffset, kArgAsumOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -68,6 +62,11 @@ class TestXasum { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -96,19 +95,24 @@ class TestXasum { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXasum(args.n, - 
scalar_cpu, args.asum_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.asum_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXasum(reinterpret_cast(args.cublas_handle), args.n, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index b24e6fe8..17cae6ad 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_ #define CLBLAST_TEST_ROUTINES_XAXPY_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -44,6 +36,8 @@ class TestXaxpy { kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -69,6 +63,11 @@ class TestXaxpy { static Transposes GetATransposes(const Transposes &) { return {}; } // 
N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -97,19 +96,24 @@ class TestXaxpy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXaxpy(args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXaxpy(reinterpret_cast(args.cublas_handle), args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector 
DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 87bc21d4..7a5c99b8 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XCOPY_H_ #define CLBLAST_TEST_ROUTINES_XCOPY_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXcopy { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -68,6 +62,11 @@ class TestXcopy { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -96,19 +95,24 @@ class TestXcopy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXcopy(args.n, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXcopy(reinterpret_cast(args.cublas_handle), args.n, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index c4f6076a..1ea25994 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOT_H_ #define CLBLAST_TEST_ROUTINES_XDOT_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXdot { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + 
static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -72,6 +66,11 @@ class TestXdot { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,22 +101,26 @@ class TestXdot { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXdot(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA 
&buffers, Queue &) { + auto status = cublasXdot(reinterpret_cast(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index aae892a8..c800c1f5 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOTC_H_ #define CLBLAST_TEST_ROUTINES_XDOTC_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXdotc { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -72,6 +66,11 @@ class TestXdotc { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes 
how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,22 +101,26 @@ class TestXdotc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXdotc(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXdotc(reinterpret_cast(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xdotu.hpp 
b/test/routines/level1/xdotu.hpp index f6be385b..3545a3a6 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOTU_H_ #define CLBLAST_TEST_ROUTINES_XDOTU_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXdotu { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -72,6 +66,11 @@ class TestXdotu { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,22 +101,26 @@ class TestXdotu { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXdotu(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXdotu(reinterpret_cast(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index e604077c..1db70537 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XNRM2_H_ #define CLBLAST_TEST_ROUTINES_XNRM2_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXnrm2 { kArgXInc, kArgXOffset, 
kArgNrm2Offset}; } + static std::vector BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -68,6 +62,11 @@ class TestXnrm2 { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -96,19 +95,24 @@ class TestXnrm2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector scalar_cpu(args.scalar_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXnrm2(args.n, - scalar_cpu, args.nrm2_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.nrm2_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXnrm2(reinterpret_cast(args.cublas_handle), args.n, + 
buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index 3c438bd6..efa0988d 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSCAL_H_ #define CLBLAST_TEST_ROUTINES_XSCAL_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -44,6 +36,8 @@ class TestXscal { kArgXOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufVecX}; } + static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -65,6 +59,11 @@ class TestXscal { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -91,16 +90,22 @@ class TestXscal { // Describes how 
to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXscal(args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXscal(reinterpret_cast(args.cublas_handle), args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index a0491f12..d778cc23 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSWAP_H_ #define CLBLAST_TEST_ROUTINES_XSWAP_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -43,6 +35,8 @@ class TestXswap { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset}; } + static 
std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecX, kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -68,6 +62,11 @@ class TestXswap { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -96,20 +95,24 @@ class TestXswap { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXswap(args.n, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = 
cublasXswap(reinterpret_cast(args.cublas_handle), args.n, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size + args.y_size, static_cast(0)); diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 5ed92aae..23138c77 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_ #define CLBLAST_TEST_ROUTINES_XGBMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXgbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -80,6 +74,11 @@ class TestXgbmv { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, 
Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -113,24 +112,30 @@ class TestXgbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgbmv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXgbmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const 
Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 9ee6d535..0ee53b80 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_ #define CLBLAST_TEST_ROUTINES_XGEMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXgemv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -80,6 +74,11 @@ class TestXgemv { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -113,24 +112,30 @@ class TestXgemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector 
y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgemv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXgemv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 42283107..92a1a2ae 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGER_H_ #define CLBLAST_TEST_ROUTINES_XGER_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" 
-#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXger { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -76,6 +70,11 @@ class TestXger { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -108,23 +107,28 @@ class TestXger { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXger(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, 
args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXger(reinterpret_cast(args.cublas_handle), args.layout, + args.m, args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index ef69c197..5d899398 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGERC_H_ #define CLBLAST_TEST_ROUTINES_XGERC_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXgerc { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ 
-76,6 +70,11 @@ class TestXgerc { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -108,23 +107,28 @@ class TestXgerc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgerc(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXgerc(reinterpret_cast(args.cublas_handle), args.layout, + args.m, 
args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index b2afc6d8..96dab22e 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGERU_H_ #define CLBLAST_TEST_ROUTINES_XGERU_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXgeru { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -76,6 +70,11 @@ class TestXgeru { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, 
Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -108,23 +107,28 @@ class TestXgeru { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgeru(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXgeru(reinterpret_cast(args.cublas_handle), args.layout, + args.m, args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xhbmv.hpp 
b/test/routines/level2/xhbmv.hpp index 8bda4d0c..b6844744 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_ #define CLBLAST_TEST_ROUTINES_XHBMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXhbmv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXhbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, 
args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhbmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index 80565d04..e1f23592 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHEMV_H_ #define CLBLAST_TEST_ROUTINES_XHEMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace 
clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhemv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXhemv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXhemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhemv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + 
buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhemv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index d71c8009..1ac1247b 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER_H_ #define CLBLAST_TEST_ROUTINES_XHER_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXher { kArgAOffset, kArgXOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -70,6 +64,11 @@ class TestXher { static Transposes 
GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -101,21 +100,28 @@ class TestXher { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { cblasXher(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXher(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return 
StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 083dfa2f..18ccc1ac 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER2_H_ #define CLBLAST_TEST_ROUTINES_XHER2_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXher2 { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXher2 { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXher2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode 
RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXher2(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index 1dd63562..ad91fe15 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -16,15 +16,7 @@ #ifndef 
CLBLAST_TEST_ROUTINES_XHPMV_H_ #define CLBLAST_TEST_ROUTINES_XHPMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhpmv { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXhpmv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXhpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, 
Queue &) { cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhpmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index a5c77811..f9d580cd 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHPR_H_ #define CLBLAST_TEST_ROUTINES_XHPR_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhpr { kArgAPOffset, kArgXOffset, kArgAlpha}; } + static std::vector 
BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -70,6 +64,11 @@ class TestXhpr { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -101,21 +100,28 @@ class TestXhpr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { cblasXhpr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhpr(reinterpret_cast(args.cublas_handle), 
args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index d09178f0..f946ba5c 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHPR2_H_ #define CLBLAST_TEST_ROUTINES_XHPR2_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhpr2 { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXhpr2 { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, 
Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXhpr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhpr2(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); diff --git 
a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 8e0f8321..6481d19b 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_ #define CLBLAST_TEST_ROUTINES_XSBMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXsbmv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXsbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsbmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 977f733a..9815dbee 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_ #define CLBLAST_TEST_ROUTINES_XSPMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include 
"test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXspmv { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXspmv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXspmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - 
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXspmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 93da4b73..01a50c38 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPR_H_ #define CLBLAST_TEST_ROUTINES_XSPR_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXspr { kArgAPOffset, kArgXOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -70,6 +64,11 @@ class 
TestXspr { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -101,21 +100,28 @@ class TestXspr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXspr(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return 
StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index b835f2b0..55f8a141 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPR2_H_ #define CLBLAST_TEST_ROUTINES_XSPR2_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXspr2 { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXspr2 { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXspr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode 
RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXspr2(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index 0ec96f1d..aec0dfb0 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYMV_H_ 
#define CLBLAST_TEST_ROUTINES_XSYMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsymv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXsymv { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXsymv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { 
cblasXsymv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsymv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index b49132e3..78b686d8 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR_H_ #define CLBLAST_TEST_ROUTINES_XSYR_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsyr { kArgAOffset, kArgXOffset, kArgAlpha}; } + static 
std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -70,6 +64,11 @@ class TestXsyr { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -101,21 +100,28 @@ class TestXsyr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = 
cublasXsyr(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 7c65daa2..38aa4f43 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR2_H_ #define CLBLAST_TEST_ROUTINES_XSYR2_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsyr2 { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -74,6 +68,11 @@ class TestXsyr2 { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine 
static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,30 @@ class TestXsyr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsyr2(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue 
&queue) { std::vector result(args.a_size, static_cast(0)); diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index cf30c2f7..8c7aa381 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_ #define CLBLAST_TEST_ROUTINES_XTBMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -44,6 +36,8 @@ class TestXtbmv { kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -69,6 +63,11 @@ class TestXtbmv { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,23 +101,32 @@ class TestXtbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, args.kl, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtbmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, args.kl, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index d08e132f..3afab978 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_ #define CLBLAST_TEST_ROUTINES_XTPMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // 
================================================================================================= @@ -44,6 +36,8 @@ class TestXtpmv { kArgXInc, kArgAPOffset, kArgXOffset}; } + static std::vector BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -69,6 +63,11 @@ class TestXtpmv { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,23 +101,32 @@ class TestXtpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector ap_mat_cpu(args.ap_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for 
correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtpmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index cf9a0063..2b71f151 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_ #define CLBLAST_TEST_ROUTINES_XTRMV_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -44,6 +36,8 @@ class TestXtrmv { kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { @@ -69,6 +63,11 @@ class TestXtrmv { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, 
Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -102,23 +101,32 @@ class TestXtrmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtrmv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static 
std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..85b50e85 --- /dev/null +++ b/test/routines/level2/xtrsv.hpp @@ -0,0 +1,174 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrsv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_ +#define CLBLAST_TEST_ROUTINES_XTRSV_H_ + +#include "test/routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXtrsv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector BuffersOut() { return {kBufVecX}; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + 
args.x_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to prepare the input data + static void PrepareData(const Arguments &args, Queue&, const int, std::vector &x_source, + std::vector&, std::vector &a_source, std::vector&, std::vector&, + std::vector&, std::vector&) { + if (args.a_ld < args.n) { return; } + if (args.a_size <= 0 || args.x_size <= 0) { return; } + + // Generates 'proper' input for the TRSV routine + // TODO: Improve this, currently loosely based on clBLAS's implementation + for (auto i = size_t{0}; i < args.n; ++i) { + auto diagonal = a_source[i*args.a_ld + i + args.a_offset]; + diagonal = static_cast(AbsoluteValue(diagonal)) + static_cast(args.n / size_t{4}); + for (auto j = size_t{0}; j < args.n; ++j) { + a_source[j*args.a_ld + i + args.a_offset] /= Constant(2.0); + } + a_source[i*args.a_ld + i + args.a_offset] = diagonal; + x_source[i * args.x_inc + args.x_offset] /= Constant(2.0); + } + } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsv(args.layout, 
args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrsv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { + cblasXtrsv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtrsv(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { 
return StatusCode::kUnknownError; } + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRSV_H_ +#endif diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index bca3c049..7e0ead6d 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_ #define CLBLAST_TEST_ROUTINES_XGEMM_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXgemm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain 
the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -82,6 +76,11 @@ class TestXgemm { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &all) { return all; } + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -116,25 +115,32 @@ class TestXgemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments 
&args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXgemm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index 31c7695f..a89617b5 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_ #define CLBLAST_TEST_ROUTINES_XHEMM_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXhemm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -82,6 +76,11 @@ class TestXhemm { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, 
std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -116,25 +115,32 @@ class TestXhemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhemm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXhemm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == 
CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index ff2bb6cb..55e6d894 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_ #define CLBLAST_TEST_ROUTINES_XHER2K_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXher2k { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -80,6 +74,11 @@ class TestXher2k { static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kConjugate}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -116,26 +115,34 @@ class TestXher2k { // Describes how to run the CPU BLAS 
routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { auto alpha2 = T{args.alpha, args.alpha}; cblasXher2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, alpha2, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto alpha2 = T{args.alpha, args.alpha}; + auto status = cublasXher2k(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue 
&queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index 26396fa9..3e1e7e02 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHERK_H_ #define CLBLAST_TEST_ROUTINES_XHERK_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXherk { kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -73,6 +67,11 @@ class TestXherk { static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kConjugate}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -105,22 +104,30 @@ class TestXherk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, 
a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { cblasXherk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXherk(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index c84c22b4..5d840d40 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_ #define CLBLAST_TEST_ROUTINES_XSYMM_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // 
================================================================================================= @@ -45,6 +37,8 @@ class TestXsymm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -82,6 +76,11 @@ class TestXsymm { static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -116,25 +115,32 @@ class TestXsymm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsymm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, 
args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsymm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index 5c4976e2..4a4a2f10 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_ #define CLBLAST_TEST_ROUTINES_XSYR2K_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsyr2k { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static 
size_t GetSizeA(const Arguments &args) { @@ -80,6 +74,11 @@ class TestXsyr2k { static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kYes}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -114,25 +113,32 @@ class TestXsyr2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyr2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const 
Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsyr2k(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 98c4f6a4..90e46727 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_ #define CLBLAST_TEST_ROUTINES_XSYRK_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXsyrk { kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -73,6 +67,11 @@ class TestXsyrk { static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kYes}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + 
std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -105,22 +104,30 @@ class TestXsyrk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyrk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXsyrk(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const 
Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 55b51e54..acc00e01 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_ #define CLBLAST_TEST_ROUTINES_XTRMM_H_ -#include -#include - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -45,6 +37,8 @@ class TestXtrmm { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -73,6 +67,11 @@ class TestXtrmm { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -107,24 +106,34 @@ class TestXtrmm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, 
a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtrmm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld); - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld); return StatusCode::kSuccess; } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtrmm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.b_size, static_cast(0)); diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..d63c9d79 --- /dev/null +++ b/test/routines/level3/xtrsm.hpp @@ -0,0 +1,179 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrsm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_ +#define CLBLAST_TEST_ROUTINES_XTRSM_H_ + +#include "test/routines/common.hpp" +#include "test/routines/level3/xtrsm_data.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXtrsm { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 3; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgBLeadDim, + kArgAOffset, kArgBOffset, + kArgAlpha}; + } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + const auto k = (args.side == Side::kLeft) ? args.m : args.n; + return k * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + const auto b_rotated = (args.layout == Layout::kRowMajor); + const auto b_two = (b_rotated) ? 
args.m : args.n; + return b_two * args.b_ld + args.b_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.m; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to prepare the input data + static void PrepareData(const Arguments &args, Queue&, const int seed, + std::vector&, std::vector&, + std::vector& a_source_, std::vector& b_source_, std::vector&, + std::vector&, std::vector&) { + const auto k = (args.side == Side::kLeft) ? args.m : args.n; + const auto b_one = (args.layout == Layout::kRowMajor) ? 
args.n : args.m; + if (args.a_ld < k) { return; } + if (args.b_ld < b_one) { return; } + if (args.a_size <= 0 || args.b_size <= 0) { return; } + + // TODO: This is a copy of the clBLAS random matrix generation, make it work properly + GenerateProperTrsmMatrices(args, seed, &a_source_[args.a_offset], &b_source_[args.b_offset]); + } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrsm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.side), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { + cblasXtrsm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.m, args.n, args.alpha, + 
buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld); + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + auto status = cublasXtrsm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.b_size, static_cast(0)); + buffers.b_mat.Read(queue, args.b_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (args.layout == Layout::kRowMajor) ? + id1*args.b_ld + id2 + args.b_offset: + id2*args.b_ld + id1 + args.b_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + auto k = (args.side == Side::kLeft) ? args.m : args.n; + return args.m * args.n * k; + } + static size_t GetBytes(const Arguments &args) { + auto k = (args.side == Side::kLeft) ? 
args.m : args.n; + return (k*k + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRSM_H_ +#endif diff --git a/test/routines/level3/xtrsm_data.hpp b/test/routines/level3/xtrsm_data.hpp new file mode 100644 index 00000000..9392b6b9 --- /dev/null +++ b/test/routines/level3/xtrsm_data.hpp @@ -0,0 +1,188 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements data-prepration routines for proper input for the TRSM routine. Note: The +// data-preparation routines are taken from clBLAS +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ +#define CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ + +#include +#include +#include + +#include "utilities/utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// Limits to prepare proper input data +template double TrsmLimitMatA(); +template <> double TrsmLimitMatA() { return pow(2.0, 7); } +template <> double TrsmLimitMatA() { return pow(2.0, 5); } +template <> double TrsmLimitMatA() { return TrsmLimitMatA(); } +template <> double TrsmLimitMatA() { return TrsmLimitMatA(); } +template double TrsmLimitMatB(); +template <> double TrsmLimitMatB() { return pow(2.0, 16); } +template <> double TrsmLimitMatB() { return pow(2.0, 47); } +template <> double TrsmLimitMatB() { return TrsmLimitMatB(); } +template <> double TrsmLimitMatB() { return TrsmLimitMatB(); } + +// 
Matrix element setter +template +void SetElement(const clblast::Layout layout, + const size_t row, const size_t column, T *mat, const size_t ld, const T value) +{ + if (layout == clblast::Layout::kRowMajor) { mat[column + ld * row] = value; } + else { mat[row + ld * column] = value; } +} + +// Matrix element getter +template +T GetElement(const clblast::Layout layout, + const size_t row, const size_t column, const T *mat, const size_t ld) +{ + if (layout == clblast::Layout::kRowMajor) { return mat[column + ld * row]; } + else { return mat[row + ld * column]; } +} + +// Bounds a value between 'left' and 'right'. The random value is assumed to be between -1 and +1. +template +T BoundRandom(const double rand_val, const double left, const double right) +{ + const auto value = Constant(rand_val * (right - left)); + if (AbsoluteValue(value) < 0.0) { + return value - Constant(left); + } + else { + return value + Constant(left); + } +} + +// The clBLAS function to generate proper input matrices for matrices A & B. Note that this routine +// should remain deterministic. Random values are therefore taken from the existing input, which +// is scaled between -1 and +1. +template +void GenerateProperTrsmMatrices(const Arguments &args, const int seed, T *mat_a, T *mat_b) +{ + // Random number generator + std::mt19937 mt(seed); + std::uniform_real_distribution dist(-1.0, 1.0); + + const auto k = (args.side == Side::kLeft) ? 
args.m : args.n; + + // Determines: max(|a_{ii}|) and min(|a_{ii}|) + // Generates: a_{ii} which are constrainted by min/max + auto min = ConstantZero(); + if (args.diagonal == clblast::Diagonal::kUnit) { + for (auto i = size_t{0}; i < k; ++i) { + SetElement(args.layout, i, i, mat_a, args.a_ld, ConstantOne()); // must not be accessed + } + } + else { + auto max = Constant(dist(mt) * TrsmLimitMatA()); + if (AbsoluteValue(max) < 1.0) { max += Constant(3.0); } // no zero's on the diagonal + min = max / Constant(100.0); + SetElement(args.layout, 0, 0, mat_a, args.a_ld, max); + for (auto i = size_t{1}; i < k; ++i) { + auto value = BoundRandom(dist(mt), AbsoluteValue(min), AbsoluteValue(max)); + if (AbsoluteValue(value) == 0) { + value = max; + } + SetElement(args.layout, i, i, mat_a, args.a_ld, value); + } + } + + // Generates a_{ij} for all j <> i. + for (auto i = size_t{0}; i < k; ++i) { + auto sum = (args.diagonal == clblast::Diagonal::kUnit) ? + AbsoluteValue(ConstantOne()) : + AbsoluteValue(GetElement(args.layout, i, i, mat_a, args.a_ld)); + for (auto j = size_t{0}; j < k; ++j) { + if (j == i) { continue; } + auto value = ConstantZero(); + if (((args.triangle == clblast::Triangle::kUpper) && (j > i)) || + ((args.triangle == clblast::Triangle::kLower) && (j < i))) { + if (sum >= 1.0) { + const auto limit = sum / std::sqrt(static_cast(k) - static_cast(j)); + value = Constant(dist(mt) * limit); + sum -= AbsoluteValue(value); + } + } + SetElement(args.layout, i, j, mat_a, args.a_ld, value); + } + } + + // Generate matrix B + if (args.side == clblast::Side::kLeft) { + for (auto j = size_t{0}; j < args.n; ++j) { + auto sum = TrsmLimitMatB(); + for (auto i = size_t{0}; i < args.m; ++i) { + const auto a_value = GetElement(args.layout, i, i, mat_a, args.a_ld); + auto value = ConstantZero(); + if (sum >= 0.0) { + const auto limit = sum * AbsoluteValue(a_value) / std::sqrt(static_cast(args.m) - static_cast(i)); + value = Constant(dist(mt) * limit); + sum -= 
AbsoluteValue(value) / AbsoluteValue(a_value); + } + SetElement(args.layout, i, j, mat_b, args.b_ld, value); + if ((i == 0 && j == 0) || (AbsoluteValue(value) < AbsoluteValue(min))) { + min = value; + } + } + } + } + else { + for (auto i = size_t{0}; i < args.m; ++i) { + auto sum = TrsmLimitMatB(); + for (auto j = size_t{0}; j < args.n; ++j) { + const auto a_value = GetElement(args.layout, j, j, mat_a, args.a_ld); + auto value = ConstantZero(); + if (sum >= 0.0) { + const auto limit = sum * AbsoluteValue(a_value) / std::sqrt(static_cast(args.n) - static_cast(j)); + value = Constant(dist(mt) * limit); + sum -= AbsoluteValue(value) / AbsoluteValue(a_value); + } + SetElement(args.layout, i, j, mat_b, args.b_ld, value); + if ((i == 0 && j == 0) || (AbsoluteValue(value) < AbsoluteValue(min))) { + min = value; + } + } + } + } + if (args.diagonal == clblast::Diagonal::kUnit) { + for (auto i = size_t{0}; i < k; ++i) { + SetElement(args.layout, i, i, mat_a, args.a_ld, ConstantOne()); // must not be accessed + } + } + + // Calculate a proper alpha + if (AbsoluteValue(min) > AbsoluteValue(args.alpha)) { + // Not implemented + } + + // Adjust matrix B according to the value of alpha + if (AbsoluteValue(args.alpha) != 1.0 && AbsoluteValue(args.alpha) != 0.0) { + for (auto i = size_t{0}; i < args.m; ++i) { + for (auto j = size_t{0}; j < args.n; ++j) { + auto value = GetElement(args.layout, i, j, mat_b, args.b_ld); + value /= args.alpha; + SetElement(args.layout, i, j, mat_b, args.b_ld, value); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ +#endif diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp new file mode 100644 index 00000000..5385e86e --- /dev/null +++ b/test/routines/levelx/xaxpybatched.hpp @@ -0,0 +1,168 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the XaxpyBatched routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ +#define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ + +#include "test/routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXaxpyBatched { + public: + + // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgBatchCount, kArgAlpha}; + } + static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector BuffersOut() { return {kBufVecY}; } + + // Helper for the sizes per batch + static size_t PerBatchSizeX(const Arguments &args) { return args.n * args.x_inc; } + static size_t PerBatchSizeY(const Arguments &args) { return args.n * args.y_inc; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return PerBatchSizeX(args) * args.batch_count + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) 
{ + return PerBatchSizeY(args) * args.batch_count + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + + // Also sets the batch-related variables + args.x_offsets = std::vector(args.batch_count); + args.y_offsets = std::vector(args.batch_count); + args.alphas = std::vector(args.batch_count); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset; + args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset; + args.alphas[batch] = args.alpha + Constant(batch); + } + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + &queue_plain, &event); + if (status == 
StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alphas[batch], + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (static_cast(status) != StatusCode::kSuccess) { + return static_cast(status); + } + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + cblasXaxpy(args.n, args.alphas[batch], + buffers_host.x_vec, args.x_offsets[batch], args.x_inc, + buffers_host.y_vec, args.y_offsets[batch], args.y_inc); + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto status = cublasXaxpy(reinterpret_cast(args.cublas_handle), args.n, args.alphas[batch], + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc); + if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation + static std::vector DownloadResult(const Arguments &args, 
Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &args) { return args.batch_count; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (id1 * args.y_inc) + args.y_offsets[id2]; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.batch_count * (2 * args.n); + } + static size_t GetBytes(const Arguments &args) { + return args.batch_count * (3 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ +#endif diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp new file mode 100644 index 00000000..ebfd8b19 --- /dev/null +++ b/test/routines/levelx/xgemmbatched.hpp @@ -0,0 +1,211 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the XgemmBatched routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ +#define CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ + +#include "test/routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXgemmBatched { + public: + + // Although it is a non-BLAS routine, it can still be tested against level-3 routines in a loop + static size_t BLASLevel() { return 3; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgK, + kArgLayout, kArgATransp, kArgBTransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgBatchCount, kArgAlpha, kArgBeta}; + } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector BuffersOut() { return {kBufMatC}; } + + // Helper for the sizes per batch + static size_t PerBatchSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? args.m : args.k; + return a_two * args.a_ld; + } + static size_t PerBatchSizeB(const Arguments &args) { + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); + auto b_two = (b_rotated) ? args.k : args.n; + return b_two * args.b_ld; + } + static size_t PerBatchSizeC(const Arguments &args) { + auto c_rotated = (args.layout == Layout::kRowMajor); + auto c_two = (c_rotated) ? 
args.m : args.n; + return c_two * args.c_ld; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + return PerBatchSizeA(args) * args.batch_count + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + return PerBatchSizeB(args) * args.batch_count + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + return PerBatchSizeC(args) * args.batch_count + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + + // Also sets the batch-related variables + args.a_offsets = std::vector(args.batch_count); + args.b_offsets = std::vector(args.batch_count); + args.c_offsets = std::vector(args.batch_count); + args.alphas = std::vector(args.batch_count); + args.betas = std::vector(args.batch_count); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + args.a_offsets[batch] = batch * PerBatchSizeA(args) + args.a_offset; + args.b_offsets[batch] = batch * PerBatchSizeB(args) + args.b_offset; + args.c_offsets[batch] = batch * PerBatchSizeC(args) + args.c_offset; + args.alphas[batch] = args.alpha + Constant(batch); + args.betas[batch] = args.beta + Constant(batch); + } + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &all) { return all; } + + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, 
std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alphas.data(), + buffers.a_mat(), args.a_offsets.data(), args.a_ld, + buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), + buffers.c_mat(), args.c_offsets.data(), args.c_ld, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto event = cl_event{}; + auto status = clblasXgemm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.b_transpose), + args.m, args.n, args.k, args.alphas[batch], + buffers.a_mat, args.a_offsets[batch], args.a_ld, + buffers.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers.c_mat, args.c_offsets[batch], args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (static_cast(status) != StatusCode::kSuccess) { + return static_cast(status); + } + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + cblasXgemm(convertToCBLAS(args.layout), + 
convertToCBLAS(args.a_transpose), + convertToCBLAS(args.b_transpose), + args.m, args.n, args.k, args.alphas[batch], + buffers_host.a_mat, args.a_offsets[batch], args.a_ld, + buffers_host.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers_host.c_mat, args.c_offsets[batch], args.c_ld); + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto status = cublasXgemm(reinterpret_cast(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.b_transpose), + args.m, args.n, args.k, args.alphas[batch], + buffers.a_mat, args.a_offsets[batch], args.a_ld, + buffers.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers.c_mat, args.c_offsets[batch], args.c_ld); + if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.c_size, static_cast(0)); + buffers.c_mat.Read(queue, args.c_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n * args.batch_count; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2_3) { + const size_t id2 = id2_3 % args.n; + const size_t id3 = id2_3 / args.n; + return (args.layout == Layout::kRowMajor) ? 
+ id1*args.c_ld + id2 + args.c_offsets[id3]: + id2*args.c_ld + id1 + args.c_offsets[id3]; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.batch_count * (2 * args.m * args.n * args.k); + } + static size_t GetBytes(const Arguments &args) { + return args.batch_count * (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ +#endif diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..cc02a88b --- /dev/null +++ b/test/routines/levelx/xinvert.hpp @@ -0,0 +1,225 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xinvert routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_ +#define CLBLAST_TEST_ROUTINES_XINVERT_H_ + +#include "test/routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +template +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { + const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) || + (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor)); + + // Helper variables + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + const auto a_ld = args.a_ld; + const auto b_ld = block_size; + + // Checks for valid arguments + if ((block_size == 0) || (args.n == 0)) { + return StatusCode::kInvalidDimension; + } + if ((block_size % 16 != 0) || (block_size > 128)) { + return StatusCode::kUnknownError; + } + + // Loops over the amount of diagonal blocks of size args.m by args.m each + for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) { + const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset; + const auto b_offset = block_id * block_size * block_size; + + // Inverts the diagonal elements of the matrix + for (auto i = size_t{0}; i < block_size; ++i) { + auto a_value = T{1.0}; + if (args.diagonal == Diagonal::kNonUnit) { + if (i + block_id * block_size < args.n) { + if (buffers_host.a_mat[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } + a_value = T{1.0} / buffers_host.a_mat[i * a_ld + i + a_offset]; + } + } + buffers_host.b_mat[i * b_ld + i + b_offset] = a_value; + } + + // Inverts the upper triangle row by row + if (is_upper) { + for (int i = static_cast(block_size) - 2; i >= 0; --i) { + for (auto j = static_cast(block_size) - 1; j > i; --j) { + auto sum = T{0.0}; + for (auto k = i + 1; k <= j; ++k) { + 
auto a_value = T{0.0}; + if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; + } + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; + } + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; + } + } + } + + // Inverts the lower triangle row by row + else { + for (auto i = size_t{1}; i < block_size; ++i) { + for (auto j = size_t{0}; j < i; ++j) { + auto sum = T{0.0}; + for (auto k = j; k < i; ++k) { + auto a_value = T{0.0}; + if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; + } + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; + } + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; + } + } + } + } + return StatusCode::kSuccess; +} + +// Half-precision version calling the above reference implementation after conversions +template <> +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector(0); + auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; + auto args2 = Arguments(); + args2.a_size = args.a_size; args2.b_size = args.b_size; + args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n; + args2.a_offset = args.a_offset; + args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal; + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); + return status; +} + +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXinvert { + public: + + // The BLAS level: 4 for the extra 
routines + static size_t BLASLevel() { return 4; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgM, + kArgLayout, kArgTriangle, kArgDiagonal, + kArgALeadDim, kArgAOffset}; + } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + return num_blocks * block_size * block_size; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which omatcopyose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + try { + auto event = cl_event{}; + auto inverter = Xinvert(queue, &event); + inverter.InvertMatrixDiagonalBlocks(args.layout, 
args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + clWaitForEvents(1, &event); + clReleaseEvent(event); + } catch (...) { return DispatchException(); } + return StatusCode::kSuccess; + } + + // Describes how to run a naive version of the routine (for correctness/performance comparison). + // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; + } + + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { + return RunReference(args, buffers_host); + } + static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { + return StatusCode::kUnknownError; + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.b_size, static_cast(0)); + buffers.b_mat.Read(queue, args.b_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return Ceil(args.n, args.m); } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return id1 * Ceil(args.n, args.m) + id2; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + return num_blocks * (block_size * (block_size / 2) * (block_size / 2)); + } + static 
size_t GetBytes(const Arguments &args) { + return (args.a_size * args.b_size) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XINVERT_H_ +#endif diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index f0739c6a..4638b61c 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -16,20 +16,13 @@ #ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ #define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ -#include -#include +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= template -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { - - // Data transfer from OpenCL to std::vector - std::vector a_mat_cpu(args.a_size, static_cast(0)); - std::vector b_mat_cpu(args.b_size, static_cast(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { // Checking for invalid arguments const auto a_rotated = (args.layout == Layout::kRowMajor); @@ -40,8 +33,8 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; } if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; } if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; } - if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } - if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } + if (buffers_host.a_mat.size() * sizeof(T) < (a_base + args.a_offset) * 
sizeof(T)) { return StatusCode::kInsufficientMemoryA; } + if (buffers_host.b_mat.size() * sizeof(T) < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } // Matrix copy, scaling, and/or transpose for (auto id1 = size_t{0}; id1 < args.m; ++id1) { @@ -52,30 +45,27 @@ StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &qu const auto b_two = (b_rotated) ? id1 : id2; const auto a_index = a_two * args.a_ld + a_one + args.a_offset; const auto b_index = b_two * args.b_ld + b_one + args.b_offset; - b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index]; + buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index]; } } - - // Data transfer back to OpenCL - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> -StatusCode RunReference(const Arguments &args, Buffers &buffers, Queue &queue) { - auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); - auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); - auto dummy = clblast::Buffer(0); - auto buffers2 = Buffers{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; +StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector(0); + auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.a_ld = args.a_ld; args2.b_ld = args.b_ld; args2.m = args.m; args2.n = args.n; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; args2.layout = args.layout; args2.a_transpose = args.a_transpose; args2.alpha = HalfToFloat(args.alpha); - auto status = RunReference(args2, buffers2, queue); - FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + auto status = 
RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); return status; } @@ -97,6 +87,8 @@ class TestXomatcopy { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { @@ -127,6 +119,11 @@ class TestXomatcopy { static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + // Describes how to prepare the input data + static void PrepareData(const Arguments&, Queue&, const int, std::vector&, + std::vector&, std::vector&, std::vector&, std::vector&, + std::vector&, std::vector&) {} // N/A for this routine + // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); @@ -143,11 +140,18 @@ class TestXomatcopy { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. 
static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + auto buffers_host = BuffersHost(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; } - static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { + return RunReference(args, buffers_host); + } + static StatusCode RunReference3(const Arguments &, BuffersCUDA &, Queue &) { + return StatusCode::kUnknownError; } // Describes how to download the results of the computation (more importantly: which buffer) diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp index 5f1db54e..070d44b5 100644 --- a/test/wrapper_cblas.hpp +++ b/test/wrapper_cblas.hpp @@ -94,7 +94,7 @@ void cblasXrot(const size_t n, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin) { - cblas_srot(n, + cblas_srot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), cos, @@ -105,7 +105,7 @@ void cblasXrot(const size_t n, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin) { - cblas_drot(n, + cblas_drot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), cos, @@ -117,7 +117,7 @@ void cblasXrotm(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& sparam_buffer, const size_t sparam_offset) { - cblas_srotm(n, + cblas_srotm(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), &sparam_buffer[sparam_offset]); @@ -126,7 
+126,7 @@ void cblasXrotm(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& sparam_buffer, const size_t sparam_offset) { - cblas_drotm(n, + cblas_drotm(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), &sparam_buffer[sparam_offset]); @@ -136,28 +136,28 @@ void cblasXrotm(const size_t n, void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_sswap(n, + cblas_sswap(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_dswap(n, + cblas_dswap(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cswap(n, + cblas_cswap(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zswap(n, + cblas_zswap(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } @@ -177,14 +177,14 @@ void cblasXswap(const size_t n, void cblasXscal(const size_t n, const float alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - cblas_sscal(n, + cblas_sscal(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc)); } void 
cblasXscal(const size_t n, const double alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - cblas_dscal(n, + cblas_dscal(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc)); } @@ -192,7 +192,7 @@ void cblasXscal(const size_t n, const float2 alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; - cblas_cscal(n, + cblas_cscal(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -200,7 +200,7 @@ void cblasXscal(const size_t n, const double2 alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; - cblas_zscal(n, + cblas_zscal(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -218,28 +218,28 @@ void cblasXscal(const size_t n, void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_scopy(n, + cblas_scopy(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_dcopy(n, + cblas_dcopy(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_ccopy(n, + cblas_ccopy(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, 
std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zcopy(n, + cblas_zcopy(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } @@ -259,7 +259,7 @@ void cblasXaxpy(const size_t n, const float alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_saxpy(n, + cblas_saxpy(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); @@ -268,7 +268,7 @@ void cblasXaxpy(const size_t n, const double alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_daxpy(n, + cblas_daxpy(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); @@ -278,7 +278,7 @@ void cblasXaxpy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; - cblas_caxpy(n, + cblas_caxpy(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); @@ -288,7 +288,7 @@ void cblasXaxpy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; - cblas_zaxpy(n, + cblas_zaxpy(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); @@ -311,7 +311,7 @@ void cblasXdot(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const 
std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - dot_buffer[dot_offset] = cblas_sdot(n, + dot_buffer[dot_offset] = cblas_sdot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } @@ -319,7 +319,7 @@ void cblasXdot(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - dot_buffer[dot_offset] = cblas_ddot(n, + dot_buffer[dot_offset] = cblas_ddot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } @@ -342,7 +342,7 @@ void cblasXdotu(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cdotu_sub(n, + cblas_cdotu_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); @@ -351,7 +351,7 @@ void cblasXdotu(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zdotu_sub(n, + cblas_zdotu_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); @@ -362,7 +362,7 @@ void cblasXdotc(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cdotc_sub(n, + cblas_cdotc_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), 
reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); @@ -371,7 +371,7 @@ void cblasXdotc(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zdotc_sub(n, + cblas_zdotc_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); @@ -381,25 +381,25 @@ void cblasXdotc(const size_t n, void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_snrm2(n, + nrm2_buffer[nrm2_offset] = cblas_snrm2(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_dnrm2(n, + nrm2_buffer[nrm2_offset] = cblas_dnrm2(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset].real(cblas_scnrm2(n, + nrm2_buffer[nrm2_offset].real(cblas_scnrm2(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n, + nrm2_buffer[nrm2_offset].real(cblas_dznrm2(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXnrm2(const size_t n, @@ -417,25 +417,25 @@ void cblasXnrm2(const size_t n, void cblasXasum(const 
size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_sasum(n, + asum_buffer[asum_offset] = cblas_sasum(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_dasum(n, + asum_buffer[asum_offset] = cblas_dasum(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset].real(cblas_scasum(n, + asum_buffer[asum_offset].real(cblas_scasum(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset].real(cblas_dzasum(n, + asum_buffer[asum_offset].real(cblas_dzasum(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXasum(const size_t n, @@ -453,25 +453,25 @@ void cblasXasum(const size_t n, void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_isamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_isamax(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_idamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_idamax(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void 
cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_icamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_icamax(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, @@ -498,7 +498,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgemv(layout, a_transpose, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -513,7 +513,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgemv(layout, a_transpose, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -530,7 +530,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_cgemv(layout, a_transpose, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -547,7 +547,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = 
std::vector{beta.real(), beta.imag()}; cblas_zgemv(layout, a_transpose, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -583,7 +583,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -598,7 +598,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -615,7 +615,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_cgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -632,7 +632,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -670,7 +670,7 @@ void cblasXhemv(const CBLAS_ORDER layout, const 
CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_chemv(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -687,7 +687,7 @@ void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zhemv(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -706,7 +706,7 @@ void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_chbmv(layout, triangle, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -723,7 +723,7 @@ void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zhbmv(layout, triangle, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -742,7 +742,7 @@ void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_chpmv(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -759,7 +759,7 @@ void cblasXhpmv(const CBLAS_ORDER 
layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zhpmv(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -776,7 +776,7 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_ssymv(layout, triangle, - n, + static_cast(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -791,7 +791,7 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dsymv(layout, triangle, - n, + static_cast(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -827,7 +827,7 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_ssbmv(layout, triangle, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -842,7 +842,7 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dsbmv(layout, triangle, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), @@ -878,7 +878,7 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sspmv(layout, triangle, - n, + static_cast(n), alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc), @@ -893,7 +893,7 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, 
std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dspmv(layout, triangle, - n, + static_cast(n), alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc), @@ -926,7 +926,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_strmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -935,7 +935,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -944,7 +944,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -953,7 +953,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -976,7 +976,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& 
x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -985,7 +985,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -994,7 +994,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1003,7 +1003,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1026,7 +1026,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); } @@ -1035,7 +1035,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const 
std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); } @@ -1044,7 +1044,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1053,7 +1053,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1076,7 +1076,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_strsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -1085,7 +1085,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -1094,7 +1094,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, 
const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1103,7 +1103,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1114,7 +1114,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -1123,7 +1123,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); } @@ -1132,7 +1132,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1141,7 +1141,7 @@ void cblasXtbsv(const 
CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1152,7 +1152,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); } @@ -1161,7 +1161,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); } @@ -1170,7 +1170,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1179,7 +1179,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector& ap_buffer, const size_t ap_offset, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast(n), reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } @@ -1192,7 +1192,7 @@ void cblasXger(const CBLAS_ORDER 
layout, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_sger(layout, - m, n, + static_cast(m), static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1205,7 +1205,7 @@ void cblasXger(const CBLAS_ORDER layout, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_dger(layout, - m, n, + static_cast(m), static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1238,7 +1238,7 @@ void cblasXgeru(const CBLAS_ORDER layout, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_cgeru(layout, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1252,7 +1252,7 @@ void cblasXgeru(const CBLAS_ORDER layout, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zgeru(layout, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1268,7 +1268,7 @@ void cblasXgerc(const CBLAS_ORDER layout, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_cgerc(layout, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1282,7 +1282,7 @@ void cblasXgerc(const CBLAS_ORDER layout, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = 
std::vector{alpha.real(), alpha.imag()}; cblas_zgerc(layout, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1296,7 +1296,7 @@ void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_cher(layout, triangle, - n, + static_cast(n), alpha, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&a_buffer[a_offset]), a_ld); @@ -1307,7 +1307,7 @@ void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_zher(layout, triangle, - n, + static_cast(n), alpha, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&a_buffer[a_offset]), a_ld); @@ -1320,7 +1320,7 @@ void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_chpr(layout, triangle, - n, + static_cast(n), alpha, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&ap_buffer[ap_offset])); @@ -1331,7 +1331,7 @@ void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_zhpr(layout, triangle, - n, + static_cast(n), alpha, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&ap_buffer[ap_offset])); @@ -1346,7 +1346,7 @@ void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; 
cblas_cher2(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1360,7 +1360,7 @@ void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zher2(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1376,7 +1376,7 @@ void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector& ap_buffer, const size_t ap_offset) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_chpr2(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1390,7 +1390,7 @@ void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector& ap_buffer, const size_t ap_offset) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zhpr2(layout, triangle, - n, + static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), @@ -1404,7 +1404,7 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_ssyr(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &a_buffer[a_offset], a_ld); @@ -1415,7 +1415,7 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { 
cblas_dsyr(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &a_buffer[a_offset], a_ld); @@ -1442,7 +1442,7 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_sspr(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &ap_buffer[ap_offset]); @@ -1453,7 +1453,7 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_dspr(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &ap_buffer[ap_offset]); @@ -1481,7 +1481,7 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_ssyr2(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1494,7 +1494,7 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_dsyr2(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1526,7 +1526,7 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_sspr2(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1539,7 +1539,7 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const 
std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& ap_buffer, const size_t ap_offset) { cblas_dspr2(layout, triangle, - n, + static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), @@ -1576,7 +1576,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const float beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_sgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast(m), static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1591,7 +1591,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const double beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast(m), static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1608,7 +1608,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_cgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast(m), static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1625,7 +1625,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast(m), static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1661,7 +1661,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const float beta, 
std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssymm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1676,7 +1676,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const double beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsymm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1693,7 +1693,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_csymm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1710,7 +1710,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zsymm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1748,7 +1748,7 @@ void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_chemm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1765,7 +1765,7 @@ void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = 
std::vector{beta.real(), beta.imag()}; cblas_zhemm(layout, side, triangle, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1781,7 +1781,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const float beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssyrk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, beta, @@ -1794,7 +1794,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const double beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsyrk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, beta, @@ -1809,7 +1809,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_csyrk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, beta_array.data(), @@ -1824,7 +1824,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zsyrk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, beta_array.data(), @@ -1855,7 +1855,7 @@ void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const float beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_cherk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha, reinterpret_cast(&a_buffer[a_offset]), a_ld, beta, @@ -1868,7 
+1868,7 @@ void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const double beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_zherk(layout, triangle, a_transpose, - n, k, + static_cast(n), static_cast(k), alpha, reinterpret_cast(&a_buffer[a_offset]), a_ld, beta, @@ -1884,7 +1884,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const float beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssyr2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1899,7 +1899,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const double beta, std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsyr2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1916,7 +1916,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_csyr2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1933,7 +1933,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zsyr2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1970,7 +1970,7 @@ void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA std::vector& c_buffer, const size_t 
c_offset, const size_t c_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_cher2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -1986,7 +1986,7 @@ void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zher2k(layout, triangle, ab_transpose, - n, k, + static_cast(n), static_cast(k), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld, @@ -2001,7 +2001,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_strmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2012,7 +2012,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2024,7 +2024,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); @@ -2036,7 
+2036,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); @@ -2063,7 +2063,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_strsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2074,7 +2074,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2086,7 +2086,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); @@ -2098,25 +2098,11 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + 
static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); } -void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, - const size_t m, const size_t n, - const half alpha, - const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, - std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { - auto a_buffer_bis = HalfToFloatBuffer(a_buffer); - auto b_buffer_bis = HalfToFloatBuffer(b_buffer); - cblasXtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, - HalfToFloat(alpha), - a_buffer_bis, a_offset, a_ld, - b_buffer_bis, b_offset, b_ld); - FloatToHalfBuffer(b_buffer, b_buffer_bis); -} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp index f1923784..f1b3a0c4 100644 --- a/test/wrapper_clblas.hpp +++ b/test/wrapper_clblas.hpp @@ -2865,24 +2865,6 @@ clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, - const size_t m, const size_t n, - const half alpha, - const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, - Buffer& b_buffer, const size_t b_offset, const size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); - auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); - auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, - HalfToFloat(alpha), - a_buffer_bis, a_offset, 
a_ld, - b_buffer_bis, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); - FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); - return status; -} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp new file mode 100644 index 00000000..35b1b9c6 --- /dev/null +++ b/test/wrapper_cublas.hpp @@ -0,0 +1,2548 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a wrapper around the cuBLAS library, such that its routines can be called +// in a similar way as the CLBlast routines: using alpha and beta to determine the precision. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CUBLAS_H_ +#define CLBLAST_TEST_WRAPPER_CUBLAS_H_ + +#include +#include + +#include "utilities/utilities.hpp" + +namespace clblast { + +// Conversions from CLBlast types +cublasOperation_t convertToCUBLAS(const Transpose v) { return (v == Transpose::kNo) ? CUBLAS_OP_N : (v == Transpose::kYes) ? CUBLAS_OP_T : CUBLAS_OP_C; } +cublasFillMode_t convertToCUBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; } +cublasDiagType_t convertToCUBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; } +cublasSideMode_t convertToCUBLAS(const Side v) { return (v == Side::kLeft) ? 
CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; } + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the cuBLAS calls for SROTG/DROTG +template +cublasStatus_t cublasXrotg(cublasHandle_t handle, T* sa_buffer, const size_t sa_offset, + T* sb_buffer, const size_t sb_offset, + T* sc_buffer, const size_t sc_offset, + T* ss_buffer, const size_t ss_offset); +template <> +cublasStatus_t cublasXrotg(cublasHandle_t handle, float* sa_buffer, const size_t sa_offset, + float* sb_buffer, const size_t sb_offset, + float* sc_buffer, const size_t sc_offset, + float* ss_buffer, const size_t ss_offset) { + auto status = cublasSrotg(handle, &sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotg(cublasHandle_t handle, double* sa_buffer, const size_t sa_offset, + double* sb_buffer, const size_t sb_offset, + double* sc_buffer, const size_t sc_offset, + double* ss_buffer, const size_t ss_offset) { + auto status = cublasDrotg(handle, &sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROTMG/DROTMG +template +cublasStatus_t cublasXrotmg(cublasHandle_t handle, T* sd1_buffer, const size_t sd1_offset, + T* sd2_buffer, const size_t sd2_offset, + T* sx1_buffer, const size_t sx1_offset, + const T* sy1_buffer, const size_t sy1_offset, + T* sparam_buffer, const size_t sparam_offset); +template <> +cublasStatus_t cublasXrotmg(cublasHandle_t handle, float* sd1_buffer, const size_t sd1_offset, + float* sd2_buffer, const size_t sd2_offset, + float* sx1_buffer, const size_t sx1_offset, + const float* sy1_buffer, const size_t sy1_offset, + float* 
sparam_buffer, const size_t sparam_offset) { + auto status = cublasSrotmg(handle, &sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + &sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotmg(cublasHandle_t handle, double* sd1_buffer, const size_t sd1_offset, + double* sd2_buffer, const size_t sd2_offset, + double* sx1_buffer, const size_t sx1_offset, + const double* sy1_buffer, const size_t sy1_offset, + double* sparam_buffer, const size_t sparam_offset) { + auto status = cublasDrotmg(handle, &sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + &sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROT/DROT +cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin) { + auto status = cublasSrot(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &cos, + &sin); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin) { + auto status = cublasDrot(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &cos, + &sin); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROTM/DROTM +template +cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n, + T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc, + T* sparam_buffer, const size_t 
sparam_offset); +template <> +cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc, + float* sparam_buffer, const size_t sparam_offset) { + auto status = cublasSrotm(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc, + double* sparam_buffer, const size_t sparam_offset) { + auto status = cublasDrotm(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +template +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSswap(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDswap(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + 
return status; +} +template <> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCswap(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZswap(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const float alpha, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasSscal(handle, static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const double alpha, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDscal(handle, static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const float2 alpha, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + cuComplex alpha_cuda; + alpha_cuda.x = 
alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCscal(handle, static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const double2 alpha, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZscal(handle, static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const half alpha, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +template +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasScopy(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDcopy(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const float2* x_buffer, 
const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCcopy(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZcopy(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSaxpy(handle, static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDaxpy(handle, static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const float2 
alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCaxpy(handle, static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZaxpy(handle, static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SDOT/DDOT +template +cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, + float* dot_buffer, const size_t dot_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSdot(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &dot_buffer[dot_offset]); + 
cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, + double* dot_buffer, const size_t dot_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDdot(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &dot_buffer[dot_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, + half* dot_buffer, const size_t dot_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CDOTU/ZDOTU +template +cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, + float2* dot_buffer, const size_t dot_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCdotu(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, + double2* dot_buffer, const size_t dot_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZdotu(handle, static_cast(n), + 
reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CDOTC/ZDOTC +template +cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, + float2* dot_buffer, const size_t dot_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCdotc(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, + double2* dot_buffer, const size_t dot_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZdotc(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 +template +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + T* nrm2_buffer, const size_t nrm2_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + float* nrm2_buffer, const size_t nrm2_offset, + const float* x_buffer, const size_t 
x_offset, const size_t x_inc) { + auto status = cublasSnrm2(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &nrm2_buffer[nrm2_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + double* nrm2_buffer, const size_t nrm2_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDnrm2(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &nrm2_buffer[nrm2_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + float2* nrm2_buffer, const size_t nrm2_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasScnrm2(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&nrm2_buffer[nrm2_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + double2* nrm2_buffer, const size_t nrm2_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDznrm2(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&nrm2_buffer[nrm2_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + half* nrm2_buffer, const size_t nrm2_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SASUM/DASUM/ScASUM/DzASUM +template +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + T* asum_buffer, const size_t asum_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + float* asum_buffer, const size_t 
asum_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasSasum(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &asum_buffer[asum_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + double* asum_buffer, const size_t asum_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDasum(handle, static_cast(n), + &x_buffer[x_offset], static_cast(x_inc), + &asum_buffer[asum_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + float2* asum_buffer, const size_t asum_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasScasum(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&asum_buffer[asum_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + double2* asum_buffer, const size_t asum_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDzasum(handle, static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&asum_buffer[asum_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + half* asum_buffer, const size_t asum_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, + T* imax_buffer, const size_t imax_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXamax(cublasHandle_t 
handle, const size_t n,
+                           float* imax_buffer, const size_t imax_offset,
+                           const float* x_buffer, const size_t x_offset, const size_t x_inc) {
+  auto status = cublasIsamax(handle, static_cast<int>(n),
+                             &x_buffer[x_offset], static_cast<int>(x_inc),
+                             reinterpret_cast<int*>(&imax_buffer[imax_offset]));
+  cudaDeviceSynchronize();
+  return status;
+}
+template <>
+cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
+                           double* imax_buffer, const size_t imax_offset,
+                           const double* x_buffer, const size_t x_offset, const size_t x_inc) {
+  auto status = cublasIdamax(handle, static_cast<int>(n),
+                             &x_buffer[x_offset], static_cast<int>(x_inc),
+                             reinterpret_cast<int*>(&imax_buffer[imax_offset]));
+  cudaDeviceSynchronize();
+  return status;
+}
+template <>
+cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
+                           float2* imax_buffer, const size_t imax_offset,
+                           const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
+  auto status = cublasIcamax(handle, static_cast<int>(n),
+                             reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                             reinterpret_cast<int*>(&imax_buffer[imax_offset]));
+  cudaDeviceSynchronize();
+  return status;
+}
+template <>
+cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
+                           double2* imax_buffer, const size_t imax_offset,
+                           const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
+  auto status = cublasIzamax(handle, static_cast<int>(n),
+                             reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                             reinterpret_cast<int*>(&imax_buffer[imax_offset]));
+  cudaDeviceSynchronize();
+  return status;
+}
+template <>
+cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
+                           half* imax_buffer, const size_t imax_offset,
+                           const half* x_buffer, const size_t x_offset, const size_t x_inc) {
+  return CUBLAS_STATUS_NOT_SUPPORTED;
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// 
================================================================================================= + +// Forwards the cuBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgemv(handle, a_transpose, + static_cast(m), static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgemv(handle, a_transpose, + static_cast(m), static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + 
if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgemv(handle, a_transpose, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgemv(handle, a_transpose, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for 
SGBMV/DGBMV/CGBMV/ZGBMV +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgbmv(handle, a_transpose, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgbmv(handle, a_transpose, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const 
size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgbmv(handle, a_transpose, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgbmv(handle, a_transpose, + static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, 
const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHEMV/ZHEMV +cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChemv(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhemv(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), 
static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHBMV/ZHBMV +cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChbmv(handle, triangle, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhbmv(handle, triangle, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + 
cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPMV/ZHPMV +cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* ap_buffer, const size_t ap_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChpmv(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* ap_buffer, const size_t ap_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhpmv(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + &beta_cuda, + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYMV/DSYMV +cublasStatus_t cublasXsymv(cublasHandle_t 
handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsymv(handle, triangle, + static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsymv(handle, triangle, + static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSBMV/DSBMV +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t 
a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsbmv(handle, triangle, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsbmv(handle, triangle, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPMV/DSPMV +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* ap_buffer, const size_t ap_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t 
y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspmv(handle, triangle, + static_cast(n), + &alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* ap_buffer, const size_t ap_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspmv(handle, triangle, + static_cast(n), + &alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + &beta, + &y_buffer[y_offset], static_cast(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* ap_buffer, const size_t ap_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +template +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const 
float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtrmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t 
x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtrmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +template +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStbmv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, 
+ const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtbmv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtbmv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtbmv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, 
+ const size_t n, const size_t k, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +template +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* ap_buffer, const size_t ap_offset, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float* ap_buffer, const size_t ap_offset, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStpmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* ap_buffer, const size_t ap_offset, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtpmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* 
ap_buffer, const size_t ap_offset, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtpmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* ap_buffer, const size_t ap_offset, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtpmv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const half* ap_buffer, const size_t ap_offset, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +template +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t 
n, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtrsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const 
size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtrsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +template +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStbsv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtbsv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], 
static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtbsv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtbsv(handle, triangle, a_transpose, diagonal, + static_cast(n), static_cast(k), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +template +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* ap_buffer, const size_t ap_offset, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t 
diagonal, + const size_t n, + const float* ap_buffer, const size_t ap_offset, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStpsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* ap_buffer, const size_t ap_offset, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtpsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* ap_buffer, const size_t ap_offset, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtpsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* ap_buffer, const size_t ap_offset, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == 
Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtpsv(handle, triangle, a_transpose, diagonal, + static_cast(n), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SGER/DGER +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSger(handle, static_cast(m), static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDger(handle, static_cast(m), static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// 
Forwards the cuBLAS calls for CGERU/ZGERU +cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCgeru(handle, static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZgeru(handle, static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CGERC/ZGERC +cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* a_buffer, const size_t a_offset, const 
size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCgerc(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZgerc(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHER/ZHER +cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCher(handle, triangle, + static_cast<int>(n), + &alpha, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t 
triangle, + const size_t n, + const double alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZher(handle, triangle, + static_cast(n), + &alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPR/ZHPR +cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasChpr(handle, triangle, + static_cast(n), + &alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZhpr(handle, triangle, + static_cast(n), + &alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHER2/ZHER2 +cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* 
a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCher2(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZher2(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPR2/ZHPR2 +cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasChpr2(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), 
static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZhpr2(handle, triangle, + static_cast(n), + &alpha_cuda, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYR/DSYR +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr(handle, triangle, + static_cast(n), + &alpha, + 
&x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPR/DSPR +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspr(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspr(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* ap_buffer, const size_t ap_offset) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSYR2/DSYR2 +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const 
float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr2(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr2(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPR2/DSPR2 +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return 
CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspr2(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspr2(handle, triangle, + static_cast(n), + &alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* ap_buffer, const size_t ap_offset) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the cuBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t 
c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgemm(handle, a_transpose, b_transpose, + static_cast(m), static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgemm(handle, a_transpose, b_transpose, + static_cast(m), static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgemm(handle, a_transpose, b_transpose, + static_cast(m), static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + 
reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgemm(handle, a_transpose, b_transpose, + static_cast(m), static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const 
size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsymm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsymm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsymm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + 
reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsymm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHEMM/ZHEMM +cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t 
c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChemm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhemm(handle, side, triangle, + static_cast(m), static_cast(n), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { 
return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyrk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyrk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsyrk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t 
a_offset, const size_t a_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsyrk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHERK/ZHERK +cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCherk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + &beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double alpha, + const double2* a_buffer, const size_t a_offset, const 
size_t a_ld, + const double beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZherk(handle, triangle, a_transpose, + static_cast(n), static_cast(k), + &alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + &beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr2k(handle, triangle, ab_transpose, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr2k(handle, triangle, ab_transpose, + static_cast(n), static_cast(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t 
cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsyr2k(handle, triangle, ab_transpose, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsyr2k(handle, triangle, ab_transpose, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t 
handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHER2K/ZHER2K +cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCher2k(handle, triangle, ab_transpose, + static_cast(n), static_cast(k), + &alpha_cuda, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + &beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZher2k(handle, triangle, ab_transpose, 
+                             static_cast<int>(n), static_cast<int>(k),
+                             &alpha_cuda,
+                             reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
+                             reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld,
+                             &beta,
+                             reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+
+// Forwards the cuBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const float alpha,
+                           const float* a_buffer, const size_t a_offset, const size_t a_ld,
+                           float* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  auto status = cublasStrmm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha,
+                            &a_buffer[a_offset], a_ld,
+                            &a_buffer[a_offset], a_ld,
+                            &b_buffer[b_offset], b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const double alpha,
+                           const double* a_buffer, const size_t a_offset, const size_t a_ld,
+                           double* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  auto status = cublasDtrmm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha,
+                            &a_buffer[a_offset], a_ld,
+                            &a_buffer[a_offset], a_ld,
+                            &b_buffer[b_offset], b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const float2
alpha,
+                           const float2* a_buffer, const size_t a_offset, const size_t a_ld,
+                           float2* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  cuComplex alpha_cuda;
+  alpha_cuda.x = alpha.real();
+  alpha_cuda.y = alpha.imag();
+  auto status = cublasCtrmm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha_cuda,
+                            reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const double2 alpha,
+                           const double2* a_buffer, const size_t a_offset, const size_t a_ld,
+                           double2* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  cuDoubleComplex alpha_cuda;
+  alpha_cuda.x = alpha.real();
+  alpha_cuda.y = alpha.imag();
+  auto status = cublasZtrmm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha_cuda,
+                            reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const half alpha,
+                           const half* a_buffer, const size_t a_offset, const size_t a_ld,
+                           half* b_buffer, const size_t b_offset, const size_t b_ld) {
+  return CUBLAS_STATUS_NOT_SUPPORTED;
+}
+
+// Forwards the cuBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
+cublasStatus_t
cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const float alpha,
+                           const float* a_buffer, const size_t a_offset, const size_t a_ld,
+                           float* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  auto status = cublasStrsm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha,
+                            &a_buffer[a_offset], a_ld,
+                            &b_buffer[b_offset], b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const double alpha,
+                           const double* a_buffer, const size_t a_offset, const size_t a_ld,
+                           double* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  auto status = cublasDtrsm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha,
+                            &a_buffer[a_offset], a_ld,
+                            &b_buffer[b_offset], b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const float2 alpha,
+                           const float2* a_buffer, const size_t a_offset, const size_t a_ld,
+                           float2* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  cuComplex alpha_cuda;
+  alpha_cuda.x = alpha.real();
+  alpha_cuda.y = alpha.imag();
+  auto status = cublasCtrsm(handle, side, triangle, a_transpose, diagonal,
+                            
static_cast<int>(m), static_cast<int>(n),
+                            &alpha_cuda,
+                            reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal,
+                           const size_t m, const size_t n,
+                           const double2 alpha,
+                           const double2* a_buffer, const size_t a_offset, const size_t a_ld,
+                           double2* b_buffer, const size_t b_offset, const size_t b_ld) {
+  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
+  cuDoubleComplex alpha_cuda;
+  alpha_cuda.x = alpha.real();
+  alpha_cuda.y = alpha.imag();
+  auto status = cublasZtrsm(handle, side, triangle, a_transpose, diagonal,
+                            static_cast<int>(m), static_cast<int>(n),
+                            &alpha_cuda,
+                            reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld,
+                            reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
+  cudaDeviceSynchronize();
+  return status;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_WRAPPER_CUBLAS_H_
+#endif
diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp
new file mode 100644
index 00000000..c97ae3ef
--- /dev/null
+++ b/test/wrapper_cuda.hpp
@@ -0,0 +1,149 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the CUDA related code; used only in case of testing against cuBLAS
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_WRAPPER_CUDA_H_
+#define CLBLAST_TEST_WRAPPER_CUDA_H_
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <algorithm>
+
+#include "utilities/utilities.hpp"
+
+#ifdef CLBLAST_REF_CUBLAS
+  #include <cuda_runtime.h>
+  #include <cublas_v2.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+#ifdef CLBLAST_REF_CUBLAS
+  template <typename T>
+  void cublasSetup(Arguments<T> &args) {
+    cudaSetDevice(static_cast<int>(args.device_id));
+    auto status = cublasCreate(reinterpret_cast<cublasHandle_t*>(&args.cublas_handle));
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      throw std::runtime_error("CUDA cublasCreate error");
+    }
+  }
+#endif
+
+#ifdef CLBLAST_REF_CUBLAS
+  template <typename T>
+  void cublasTeardown(Arguments<T> &args) {
+    auto status = cublasDestroy(reinterpret_cast<cublasHandle_t>(args.cublas_handle));
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      throw std::runtime_error("CUDA cublasDestroy error");
+    }
+  }
+#endif
+
+// =================================================================================================
+
+// Copies data from the CUDA device to the host and frees-up the CUDA memory afterwards
+#ifdef CLBLAST_REF_CUBLAS
+  template <typename T>
+  void CUDAToHost(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) {
+    auto status1 = cudaMemcpy(
+      reinterpret_cast<void*>(buffer_host.data()),
+      reinterpret_cast<void*>(*buffer_cuda),
+      size*sizeof(T),
+      cudaMemcpyDeviceToHost
+    );
+    if (status1 != cudaSuccess) {
+      throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status1)));
+    }
+    auto status2 = cudaFree(*buffer_cuda);
+    if (status2 != cudaSuccess) {
+      throw std::runtime_error("CUDA cudaFree error with status: "+ToString(static_cast<int>(status2)));
+    }
+    *buffer_cuda = nullptr;
+}
+#else
+  template <typename T> void CUDAToHost(T**, const
std::vector<T>&, const size_t) { }
+#endif
+
+// Allocates space on the CUDA device and copies in data from the host
+#ifdef CLBLAST_REF_CUBLAS
+  template <typename T>
+  void HostToCUDA(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) {
+    if (*buffer_cuda == nullptr) {
+      auto status1 = cudaMalloc(reinterpret_cast<void**>(buffer_cuda), size*sizeof(T));
+      if (status1 != cudaSuccess) {
+        throw std::runtime_error("CUDA cudaMalloc error with status: "+ToString(static_cast<int>(status1)));
+      }
+    }
+    auto status2 = cudaMemcpy(
+      reinterpret_cast<void*>(*buffer_cuda),
+      reinterpret_cast<void*>(buffer_host.data()),
+      size*sizeof(T),
+      cudaMemcpyHostToDevice
+    );
+    if (status2 != cudaSuccess) {
+      throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status2)));
+    }
+  }
+#else
+  template <typename T> void HostToCUDA(T**, const std::vector<T>&, const size_t) { }
+#endif
+
+// =================================================================================================
+
+template <typename T>
+struct BuffersCUDA {
+  T* x_vec = nullptr;
+  T* y_vec = nullptr;
+  T* a_mat = nullptr;
+  T* b_mat = nullptr;
+  T* c_mat = nullptr;
+  T* ap_mat = nullptr;
+  T* scalar = nullptr;
+};
+
+template <typename T, typename U>
+void CUDAToHost(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host,
+                const std::vector<std::string> &names) {
+  for (auto &name: names) {
+    if (name == kBufVecX) { buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); CUDAToHost(&buffers.x_vec, buffers_host.x_vec, args.x_size); }
+    else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); CUDAToHost(&buffers.y_vec, buffers_host.y_vec, args.y_size); }
+    else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); CUDAToHost(&buffers.a_mat, buffers_host.a_mat, args.a_size); }
+    else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); CUDAToHost(&buffers.b_mat, buffers_host.b_mat, args.b_size); }
+    else if (name == kBufMatC) { buffers_host.c_mat =
std::vector<T>(args.c_size, static_cast<T>(0)); CUDAToHost(&buffers.c_mat, buffers_host.c_mat, args.c_size); }
+    else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); CUDAToHost(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); }
+    else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); CUDAToHost(&buffers.scalar, buffers_host.scalar, args.scalar_size); }
+    else { throw std::runtime_error("Invalid buffer name"); }
+  }
+}
+
+template <typename T, typename U>
+void HostToCUDA(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host,
+                const std::vector<std::string> &names) {
+  for (auto &name: names) {
+    if (name == kBufVecX) { HostToCUDA(&buffers.x_vec, buffers_host.x_vec, args.x_size); }
+    else if (name == kBufVecY) { HostToCUDA(&buffers.y_vec, buffers_host.y_vec, args.y_size); }
+    else if (name == kBufMatA) { HostToCUDA(&buffers.a_mat, buffers_host.a_mat, args.a_size); }
+    else if (name == kBufMatB) { HostToCUDA(&buffers.b_mat, buffers_host.b_mat, args.b_size); }
+    else if (name == kBufMatC) { HostToCUDA(&buffers.c_mat, buffers_host.c_mat, args.c_size); }
+    else if (name == kBufMatAP) { HostToCUDA(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); }
+    else if (name == kBufScalar) { HostToCUDA(&buffers.scalar, buffers_host.scalar, args.scalar_size); }
+    else { throw std::runtime_error("Invalid buffer name"); }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_WRAPPER_CUDA_H_
+#endif