Merge pull request #150 from CNugteren/development

Update to version 0.11.0
pull/156/head 0.11.0
Cedric Nugteren 2017-05-02 22:39:50 +02:00 committed by GitHub
commit 606f2871dd
291 changed files with 12269 additions and 3527 deletions

View File

@@ -2,14 +2,6 @@ language: cpp
sudo: required
dist: trusty
os:
  - linux
  - osx
compiler:
  - gcc
  - clang
addons:
  apt:
    sources:
@@ -19,6 +11,14 @@ addons:
      - cmake
      - ocl-icd-opencl-dev
matrix:
  include:
    - os: linux
      compiler: gcc
    - os: linux
      compiler: clang
    - os: osx
env:
  global:
    - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast

View File

@@ -1,4 +1,25 @@
Version 0.11.0
- Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx')
- Fixed a bug that caused the binary to be re-created even when it was already in the cache
- Fixed a bug when using offsets in the direct version of the GEMM kernels
- Fixed a missing cl_khr_fp64 when running double-precision on Intel CPUs
- Fixed tests on Apple's CPU OpenCL implementation; still not fast, but at least correct
- Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM
- Tests now also exit with an error code when OpenCL errors or compilation errors occur
- Tests now also check for the L2 error in case of half-precision
- Clients can now test against cuBLAS on NVIDIA systems for performance comparisons (-DCUBLAS=ON)
- Replaced the R graph scripts with Python/Matplotlib scripts
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added the OverrideParameters function to the API to be able to supply custom tuning parameters
- Added triangular solver (level-2 & level-3) routines:
* STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
* STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
- Added batched (not part of the BLAS standard) routines:
* SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY)
* SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED (batched version of GEMM)
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code

View File

@@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 10)
set(clblast_VERSION_MINOR 11)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@@ -28,6 +28,7 @@ option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@@ -129,11 +130,14 @@ if(TUNERS)
endif()
endif()
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
if(CLIENTS OR TESTS)
find_package(clBLAS)
find_package(CBLAS)
if(CUBLAS)
find_package(cuBLAS)
endif()
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
if(TESTS)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
@@ -156,10 +160,10 @@ if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(LEVELX_ROUTINES xomatcopy)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xomatcopy xaxpybatched xgemmbatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)
@@ -175,6 +179,7 @@ set(SOURCES
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
@@ -241,7 +246,7 @@ endif()
if(SAMPLES)
# Downloads the cl.hpp file from Khronos
file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
@@ -319,13 +324,22 @@ if(CLIENTS OR TESTS)
add_definitions(" -DCLBLAST_REF_CBLAS")
endif()
endif()
if(CUBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUBLAS_LIBRARIES})
if(MSVC)
add_definitions(" /DCLBLAST_REF_CUBLAS")
else()
add_definitions(" -DCLBLAST_REF_CUBLAS")
endif()
endif()
endif()
# ==================================================================================================
# Section for the performance tests (i.e. the client). These optionally compare against a reference
# library, either clBLAS or a CPU BLAS.
# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS.
if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
@@ -371,7 +385,7 @@ endif()
# ==================================================================================================
# Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a
# CPU BLAS library to act as a reference.
# CPU BLAS library, and/or cuBLAS to act as a reference.
if(TESTS)
enable_testing()
@@ -414,6 +428,18 @@ if(TESTS)
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
endforeach()
# Miscellaneous tests
set(MISC_TESTS override_parameters)
foreach(MISC_TEST ${MISC_TESTS})
add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
test/correctness/misc/${MISC_TEST}.cpp)
target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
target_include_directories(clblast_test_${MISC_TEST} PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${MISC_TEST} clblast_test_${MISC_TEST})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )

View File

@@ -21,6 +21,7 @@ Use CLBlast instead of clBLAS:
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices for which you need to tune yourself.
* When you are still running on OpenCL 1.1 hardware.
* When you prefer a C++ API over a C API (C API also available in CLBlast).
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices.
* When you can benefit from the increased performance of half-precision fp16 data-types.
@@ -90,21 +91,23 @@ Or alternatively the plain C version:
#include <clblast_c.h>
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). The API is kept as close as possible to the Netlib BLAS and the cuBLAS/clBLAS APIs.
To get started quickly, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
cmake -DSAMPLES=ON ..
Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level-1 and level-2 BLAS functions, performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
Using the tuners (optional)
-------------
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
The CLBlast library is already tuned for the most commonly used OpenCL devices and it's gradually being extended to other devices as well. For unseen devices CLBlast will make use of common-best tuning values for similar devices (e.g. AMD GPUs), so performance might still be decent. The current release of CLBlast is tuned for devices with the following `CL_DEVICE_NAME` values:
* NVIDIA GPUs:
- GRID K520
@@ -115,18 +118,23 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX 1080
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- TITAN X (Pascal)
- Tesla K20m
- Tesla K40m
* AMD GPUs:
- AMD Radeon R9 M370X Compute Engine
- ATI Radeon HD 6750M
- Ellesmere
- Hawaii
- Oland
- Pitcairn
- Tahiti
- Tonga
- Turks
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
@@ -137,7 +145,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Iris Pro
* Intel CPUs:
- Core i5-6200U
- Core i7-2670QM
- Core i7-3770K
- Core i7-4790K
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
@ -151,7 +161,7 @@ Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https:/
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
@@ -163,6 +173,8 @@ In summary, tuning the entire library for your device can be done as follows (st
python ../scripts/database/database.py . ..
make
Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function, which will set new parameters for a specific kernel. At the next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on, until `OverrideParameters` is called again. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details.
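As an illustration, here is a minimal sketch of such a call through the C++ API, re-using the `Copy` kernel parameters shown in the API documentation. The wrapper function and the chosen values are illustrative only, not actual tuning results for any device:

```
#include <clblast.h>
#include <unordered_map>

// Hypothetical example: overrides the 'Copy' kernel parameters for one device.
// Replace the values below with results found by the tuners for your hardware.
void set_copy_parameters(const cl_device_id device) {
  const auto parameters = std::unordered_map<std::string, size_t>{
      {"COPY_DIMX", 8}, {"COPY_DIMY", 32}, {"COPY_VW", 4}, {"COPY_WPT", 8}};
  const auto status = clblast::OverrideParameters(device, "Copy",
                                                  clblast::Precision::kSingle,
                                                  parameters);
  // status == clblast::StatusCode::kSuccess indicates the parameters were accepted
}
```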
Compiling the correctness tests (optional)
-------------
@@ -187,15 +199,15 @@ All tests can be run directly together in one go through the `make alltests` tar
Compiling the performance tests/clients (optional)
-------------
To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS) or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` set), or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
cmake -DCLIENTS=ON ..
The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a number of configuration options and directly run CLBlast in a head-to-head performance test, optionally against clBLAS, a CPU BLAS library, and/or cuBLAS. You can use the command-line options `-clblas 1`, `-cblas 1`, or `-cublas 1` to select a library to test against.
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `scripts/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0 from the `build` subdirectory:
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable (shipped with clBLAS). Finally, run the Python/Matplotlib graph-script found in `scripts/benchmark/benchmark.py`. For example, to generate the SGEMM PDF on device 1 of platform 0 from the `build` subdirectory:
Rscript ../scripts/graphs/xgemm.r 0 1
python ../scripts/benchmark/benchmark.py --platform 0 --device 1 --benchmark gemm
Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device.
@@ -250,6 +262,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSPR | ✔ | ✔ | - | - | ✔ |
| xSYR2 | ✔ | ✔ | - | - | ✔ |
| xSPR2 | ✔ | ✔ | - | - | ✔ |
| xTRSV | ✔ | ✔ | ✔ | ✔ | - | (experimental, un-optimized)
| Level-3 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
@@ -261,6 +274,14 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ | - |
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM | ✔ | ✔ | ✔ | ✔ | - | (experimental, un-optimized)
Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
| Batched | S | D | C | Z | H |
| -------------|---|---|---|---|---|
| xAXPYBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGEMMBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |
In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
@@ -271,7 +292,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
Half precision (fp16)

View File

@@ -0,0 +1,82 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# CUBLAS_FOUND Boolean holding whether or not the cuBLAS library was found
# CUBLAS_INCLUDE_DIRS The CUDA and cuBLAS include directory
# CUDA_LIBRARIES The CUDA library
# CUBLAS_LIBRARIES The cuBLAS library
#
# In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to
# the root of cuBLAS, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include. This can either be
# done using an environmental variable (e.g. export CUDA_ROOT=/path/to/cuBLAS) or using a CMake
# variable (e.g. cmake -DCUDA_ROOT=/path/to/cuBLAS ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CUBLAS_HINTS
${CUDA_ROOT}
$ENV{CUDA_ROOT}
$ENV{CUDA_TOOLKIT_ROOT_DIR}
)
set(CUBLAS_PATHS
/usr
/usr/local
/usr/local/cuda
)
# Finds the include directories
find_path(CUBLAS_INCLUDE_DIRS
NAMES cublas_v2.h cuda.h
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES include inc include/x86_64 include/x64
PATHS ${CUBLAS_PATHS}
DOC "cuBLAS include header cublas_v2.h"
)
mark_as_advanced(CUBLAS_INCLUDE_DIRS)
# Finds the libraries
find_library(CUDA_LIBRARIES
NAMES cudart
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CUBLAS_PATHS}
DOC "CUDA library"
)
mark_as_advanced(CUDA_LIBRARIES)
find_library(CUBLAS_LIBRARIES
NAMES cublas
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CUBLAS_PATHS}
DOC "cuBLAS library"
)
mark_as_advanced(CUBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CUBLAS_INCLUDE_DIRS)
message(STATUS "Could NOT find 'cuBLAS.h', install CUDA/cuBLAS or set CUDA_ROOT")
endif()
if(NOT CUDA_LIBRARIES)
message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT")
endif()
if(NOT CUBLAS_LIBRARIES)
message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT")
endif()
# Determines whether or not cuBLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES)
# ==================================================================================================

View File

@@ -1445,6 +1445,63 @@ Arguments to TPMV:
xTRSV: Solves a triangular system of equations
-------------
C++ API:
```
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSV:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
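As an illustration, consider this minimal sketch of a single-precision TRSV call through the C++ API. It assumes an existing OpenCL command queue and buffers already filled with the matrix _A_ and the right-hand side _b_; the wrapper function and its name are illustrative only:

```
#include <clblast.h>

// A minimal sketch: solves A*x = b for an n-by-n lower-triangular A stored
// column-major in 'a_buffer'; 'x_buffer' holds b on entry and x on return.
// 'queue' is assumed to be an existing OpenCL command queue.
clblast::StatusCode solve_lower(const size_t n, const cl_mem a_buffer,
                                cl_mem x_buffer, cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsv<float>(
      clblast::Layout::kColMajor, clblast::Triangle::kLower,
      clblast::Transpose::kNo, clblast::Diagonal::kNonUnit,
      n,
      a_buffer, 0, n,      // A with leading dimension n
      x_buffer, 0, 1,      // b on input, x on output
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // the solution is now in x_buffer
  }
  return status;
}
```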
xGER: General rank-1 matrix update
-------------
@@ -2708,6 +2765,71 @@ Requirements for TRMM:
xTRSM: Solves a triangular system of equations
-------------
Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix _X_, in which _A_ is a unit or non-unit triangular matrix (_m_ by _m_ when `side` is left, _n_ by _n_ when `side` is right) and _B_ is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.
C++ API:
```
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const double alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_float2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_double2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSM:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
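Analogous to the TRSV sketch above, here is a hedged example of a single-precision TRSM call solving _A * X = alpha * B_ with _A_ on the left. Again, the buffers and queue are assumed to exist already and the wrapper function is illustrative only:

```
#include <clblast.h>

// A minimal sketch: solves A*X = alpha*B for X, with A an m-by-m lower-triangular
// matrix and B an m-by-n matrix (both column-major). 'b_buffer' holds B on entry
// and the solution X on return.
clblast::StatusCode solve_left_lower(const size_t m, const size_t n,
                                     const cl_mem a_buffer, cl_mem b_buffer,
                                     cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsm<float>(
      clblast::Layout::kColMajor, clblast::Side::kLeft,
      clblast::Triangle::kLower, clblast::Transpose::kNo,
      clblast::Diagonal::kNonUnit,
      m, n,
      1.0f,                // alpha
      a_buffer, 0, m,      // A with leading dimension m
      b_buffer, 0, m,      // B, overwritten by X
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
  }
  return status;
}
```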
xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
-------------
@@ -2781,3 +2903,231 @@ Requirements for OMATCOPY:
xAXPYBATCHED: Batched version of AXPY
-------------
As AXPY, but multiple operations are batched together for better performance.
C++ API:
```
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
Arguments to AXPYBATCHED:
* `const size_t n`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
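To illustrate how the offset arrays select the individual operations, here is a minimal sketch running `batch_count` single-precision AXPY operations on consecutive, non-overlapping slices of shared x and y buffers. The buffers and queue are assumed to exist with sufficient capacity; the wrapper function is illustrative:

```
#include <clblast.h>
#include <vector>

// Performs batch_count operations y = alpha*x + y, each on its own slice of
// the shared x and y buffers, selected through the per-batch offset arrays.
clblast::StatusCode axpy_batched_example(const size_t n, const size_t batch_count,
                                         const cl_mem x_buffer, cl_mem y_buffer,
                                         cl_command_queue queue) {
  auto alphas = std::vector<float>(batch_count, 2.0f);  // one alpha per batch
  auto x_offsets = std::vector<size_t>(batch_count);
  auto y_offsets = std::vector<size_t>(batch_count);
  for (size_t b = 0; b < batch_count; ++b) {
    x_offsets[b] = b * n;  // consecutive, non-overlapping slices
    y_offsets[b] = b * n;
  }
  return clblast::AxpyBatched(n, alphas.data(),
                              x_buffer, x_offsets.data(), 1,
                              y_buffer, y_offsets.data(), 1,
                              batch_count, &queue, nullptr);
}
```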
xGEMMBATCHED: Batched version of GEMM
-------------
As GEMM, but multiple operations are batched together for better performance.
C++ API:
```
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
Arguments to GEMMBATCHED:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t *a_offsets`: The offsets in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t *b_offsets`: The offsets in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T *betas`: Input scalar constants.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t *c_offsets`: The offsets in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
Requirements for GEMMBATCHED:
* When `transpose_a == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
* When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
* The value of `c_ld` must be at least `m`.
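Putting these requirements together, here is a minimal sketch of a single-precision GEMMBATCHED call on matrices stored back-to-back in shared buffers. With column-major storage and no transposes, `a_ld = m`, `b_ld = k` and `c_ld = m` satisfy the requirements above. The buffers and queue are assumed to exist with sufficient capacity:

```
#include <clblast.h>
#include <vector>

// Multiplies batch_count pairs of m-by-k and n-by-k matrices, each batch
// addressing its own matrix within the shared buffers via the offset arrays.
clblast::StatusCode gemm_batched_example(
    const size_t m, const size_t n, const size_t k, const size_t batch_count,
    const cl_mem a_buffer, const cl_mem b_buffer, cl_mem c_buffer,
    cl_command_queue queue) {
  auto alphas = std::vector<float>(batch_count, 1.0f);
  auto betas = std::vector<float>(batch_count, 0.0f);
  auto a_offsets = std::vector<size_t>(batch_count);
  auto b_offsets = std::vector<size_t>(batch_count);
  auto c_offsets = std::vector<size_t>(batch_count);
  for (size_t b = 0; b < batch_count; ++b) {
    a_offsets[b] = b * m * k;  // each batch starts after the previous matrix
    b_offsets[b] = b * k * n;
    c_offsets[b] = b * m * n;
  }
  return clblast::GemmBatched(clblast::Layout::kColMajor,
                              clblast::Transpose::kNo, clblast::Transpose::kNo,
                              m, n, k,
                              alphas.data(),
                              a_buffer, a_offsets.data(), m,
                              b_buffer, b_offsets.data(), k,
                              betas.data(),
                              c_buffer, c_offsets.data(), m,
                              batch_count, &queue, nullptr);
}
```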
ClearCache: Resets the cache of compiled binaries (auxiliary function)
-------------
CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory, which can also be useful when debugging.
C++ API:
```
StatusCode ClearCache()
```
C API:
```
CLBlastStatusCode CLBlastClearCache()
```
FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function)
-------------
CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only.
C++ API:
```
StatusCode FillCache(const cl_device_id device)
```
C API:
```
CLBlastStatusCode CLBlastFillCache(const cl_device_id device)
```
Arguments to FillCache:
* `const cl_device_id device`: The OpenCL device to fill the cache for.
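For example, here is a minimal sketch that warms up the cache for the first device of the first OpenCL platform (error checking omitted for brevity; the function name is illustrative):

```
#include <clblast.h>

// Pre-compiles all CLBlast kernels for one device, so that later routine
// calls skip the compilation step.
void warm_up_first_device() {
  cl_platform_id platform = nullptr;
  cl_device_id device = nullptr;
  clGetPlatformIDs(1, &platform, nullptr);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  clblast::FillCache(device);
}
```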
OverrideParameters: Override tuning parameters (auxiliary function)
-------------
This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called, it will be re-compiled and will use the new parameters. All subsequent calls (until `OverrideParameters` is called again) will load the kernel from the cache and thus continue to use the new parameters. Note that the first call after `OverrideParameters` may show a performance drop due to the re-compilation of the kernel.
C++ API:
```
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters)
```
C API:
```
CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values)
```
Arguments to OverrideParameters (C++ version):
* `const cl_device_id device`: The OpenCL device to set the new parameters for.
* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
* `const Precision precision`: The CLBlast precision enum to set the new parameters for.
* `const std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code.

View File

@@ -17,6 +17,8 @@
#define CLBLAST_CLBLAST_H_
#include <cstdlib> // For size_t
#include <string> // For OverrideParameters function
#include <unordered_map> // For OverrideParameters function
// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
@@ -95,6 +97,9 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kInvalidBatchCount = -2049, // The batch count needs to be positive
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
@@ -114,7 +119,7 @@ enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
kComplexSingle = 3232, kComplexDouble = 6464 };
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
// =================================================================================================
// BLAS level-1 (vector-vector) routines
@@ -583,7 +588,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
@@ -605,6 +610,27 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event = nullptr);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
@@ -617,6 +643,14 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast
// CLBLAST_CLBLAST_H_

View File

@@ -96,6 +96,9 @@ typedef enum CLBlastStatusCode_ {
CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive
CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
@@ -117,6 +120,11 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
// Precision enum (values in bits)
typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
@@ -1258,7 +1266,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
@@ -1283,12 +1291,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@@ -1326,6 +1328,85 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
@@ -1338,6 +1419,14 @@ CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values);
// =================================================================================================
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -32,9 +32,8 @@
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
// which is a typedef for unsigned short.
typedef cl_half half;
// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL
// type, which is a typedef for unsigned short.
// 32-bit union for conversions
typedef union ConversionBits_ {
@@ -47,7 +46,7 @@ typedef union ConversionBits_ {
// Converts an IEEE-compliant single-precision value to half-precision floating-point. This function
// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding
// mode.
inline half FloatToHalf(const float value) {
inline cl_half FloatToHalf(const float value) {
static const unsigned short base_table[512] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
@@ -108,7 +107,7 @@ inline half FloatToHalf(const float value) {
}
// Converts a half-precision value to IEEE-compliant single-precision floating-point
inline float HalfToFloat(const half value) {
inline float HalfToFloat(const cl_half value) {
static const unsigned int mantissa_table[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,

View File

@@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side,
const void* a, const int a_ld,
void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,

View File

@@ -20,6 +20,8 @@
#include <string.h>
#include <time.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -18,6 +18,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -20,6 +20,9 @@
#include <chrono>
#include <vector>
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp
#include "cl.hpp"
@@ -103,7 +106,7 @@ int main() {
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
// Example completed. See "clblast.h" for status codes (0 -> success).
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
return 0;
}

View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import argparse
import json
import os
import sys
import settings
import plot
import utils
EXPERIMENTS = {
    "axpy": settings.AXPY,
    "axpybatched": settings.AXPYBATCHED,
    "gemv": settings.GEMV,
    "gemm": settings.GEMM,
    "gemm_small": settings.GEMM_SMALL,
    "gemmbatched": settings.GEMMBATCHED,
    "symm": settings.SYMM,
    "syrk": settings.SYRK,
    "summary": settings.SUMMARY,
}
def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
binary = "./clblast_client_x" + name
# Loops over sub-benchmarks per benchmark
results = []
for arguments in arguments_list:
# Sets the arguments
constant_arguments = ["-warm_up", "-q", "-no_abbrv", "-cblas 0", "-cublas 0"]
common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs]
opencl_arguments = ["-platform %d" % platform, "-device %d" % device]
all_arguments = opencl_arguments + common_arguments + constant_arguments
for name, value in arguments.items():
all_arguments.append("-" + name + " " + str(value))
# Calls the binary and parses the results
benchmark_output = utils.run_binary(binary, all_arguments)
result = utils.parse_results(benchmark_output)
# For half-precision: also runs single-precision for comparison
if precision == 16:
all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments]
benchmark_output = utils.run_binary(binary, all_arguments)
result_extra = utils.parse_results(benchmark_output)
for index in range(min(len(result), len(result_extra))):
result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"]
result[index]["GBs_2"] = result_extra[index]["GBs_2"]
result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"]
result[index]["GFLOPS_2"] = result_extra[index]["GFLOPS_2"]
results.extend(result)
return results
def parse_arguments(argv):
parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device")
parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys()))
parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for in paper or presentation")
parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
return vars(cl_args)
def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk,
plot_title, tight_plot, output_folder, verbose):
# Sanity check
if not os.path.isdir(output_folder):
print("[benchmark] Error: folder '%s' doesn't exist" % output_folder)
return
# The benchmark name and plot title
benchmark_name = utils.precision_to_letter(precision) + benchmark.upper()
if benchmark.upper() != "SUMMARY":
plot_title = benchmark_name if not plot_title else benchmark_name + ": " + plot_title
# Retrieves the benchmark settings
if benchmark not in EXPERIMENTS.keys():
print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys()))
return
experiment = EXPERIMENTS[benchmark]
benchmarks = experiment["benchmarks"]
# Either run the benchmarks for this experiment or load old results from disk
json_file_name = os.path.join(output_folder, benchmark_name.lower() + "_benchmarks.json")
if load_from_disk and os.path.isfile(json_file_name):
print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'")
with open(json_file_name) as f:
results = json.load(f)
else:
# Runs all the individual benchmarks
print("[benchmark] Running on platform %d, device %d" % (platform, device))
print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark))
results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"],
"num_cols": experiment["num_cols"], "benchmarks": []}
for bench in benchmarks:
num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs
print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"]))
result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark,
platform, device)
results["benchmarks"].append(result)
# Stores the results to disk
print("[benchmark] Saving benchmark results to '" + json_file_name + "'")
with open(json_file_name, "wb") as f:
json.dump(results, f, sort_keys=True, indent=4)
# Retrieves the data from the benchmark settings
file_name_suffix = "_tight" if tight_plot else ""
pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf")
titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
x_keys = [b["x_keys"] for b in benchmarks]
y_keys = [b["y_keys"] for b in benchmarks]
x_labels = [b["x_label"] for b in benchmarks]
y_labels = [b["y_label"] for b in benchmarks]
label_names = results["label_names"]
# For half-precision: also adds single-precision results for comparison
if precision == 16:
label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"]
y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys]
# Plots the graphs
plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"],
x_keys, y_keys, titles, x_labels, y_labels,
label_names, plot_title, tight_plot, verbose)
print("[benchmark] All done")
if __name__ == '__main__':
parsed_arguments = parse_arguments(sys.argv[1:])
benchmark_single(**parsed_arguments)
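For reference, benchmark.py is normally driven through its command line, but benchmark_single can also be called directly from Python; a minimal sketch (platform/device IDs and the output folder are placeholders):

from benchmark import benchmark_single

# Equivalent to: python benchmark.py -b axpy -p 0 -d 0
benchmark_single(benchmark="axpy", platform=0, device=0, num_runs=None,
                 precision=32, load_from_disk=False, plot_title="",
                 tight_plot=False, output_folder=".", verbose=False)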

View File

@ -0,0 +1,44 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import argparse
import os
import sys
from benchmark import benchmark_single
BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"]
def parse_arguments(argv):
parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device")
parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
parser.add_argument("-t", "--plot_title", default=None, help="The title for the plots, defaults to benchmark name")
parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
return vars(cl_args)
def benchmark_all(platform, device, precision, load_from_disk,
plot_title, output_folder, verbose):
for bench in BENCHMARKS:
from_disk = load_from_disk
for tight_plot in [True, False]: # two plots for a single benchmark
benchmark_single(bench, platform, device, None, precision, from_disk,
plot_title, tight_plot, output_folder, verbose)
from_disk = True # for the next plot of the same data
if __name__ == '__main__':
parsed_arguments = parse_arguments(sys.argv[1:])
benchmark_all(**parsed_arguments)
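Note the design of the loop above: each benchmark is measured only once, after which from_disk is forced to True so the second (regular-layout) plot re-uses the stored JSON. Called directly, the equivalent of `python benchmark_all.py -p 0 -d 0` would be (IDs are placeholders):

from benchmark_all import benchmark_all

benchmark_all(platform=0, device=0, precision=32, load_from_disk=False,
              plot_title=None, output_folder=".", verbose=False)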

View File

@ -0,0 +1,118 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import utils
import matplotlib
matplotlib.use('Agg')
from matplotlib import rcParams
import matplotlib.pyplot as plt
# Colors
BLUEISH = [c / 255.0 for c in [71, 101, 177]] # #4765b1
REDISH = [c / 255.0 for c in [214, 117, 104]] # #d67568
PURPLISH = [c / 255.0 for c in [85, 0, 119]] # #550077
COLORS = [BLUEISH, REDISH, PURPLISH]
MARKERS = ["o-", "x-", ".-"]
def plot_graphs(results, file_name, num_rows, num_cols,
x_keys, y_keys, titles, x_labels, y_labels,
label_names, title, tight_plot, verbose):
assert len(results) == num_rows * num_cols
assert len(results) != 1
assert len(x_keys) == len(results)
assert len(y_keys) == len(results)
assert len(titles) == len(results)
assert len(x_labels) == len(results)
assert len(y_labels) == len(results)
# Tight plot (for in a paper or presentation) or regular (for display on a screen)
if tight_plot:
plot_size = 5
w_space = 0.20
h_space = 0.39
title_from_top = 0.11
legend_from_top = 0.17
legend_from_top_per_item = 0.04
x_label_from_bottom = 0.09
legend_spacing = 0.0
font_size = 15
font_size_legend = 13
font_size_title = font_size
bounding_box = "tight"
else:
plot_size = 8
w_space = 0.15
h_space = 0.22
title_from_top = 0.09
legend_from_top = 0.10
legend_from_top_per_item = 0.07
x_label_from_bottom = 0.06
legend_spacing = 0.8
font_size = 15
font_size_legend = font_size
font_size_title = 18
bounding_box = None # means not 'tight'
# Initializes the plot
size_x = plot_size * num_cols
size_y = plot_size * num_rows
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k')
fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title)
plt.subplots_adjust(wspace=w_space, hspace=h_space)
rcParams.update({'font.size': font_size})
# Loops over each subplot
for row in range(num_rows):
for col in range(num_cols):
index = row * num_cols + col
result = results[index]
ax = axes.flat[index]
plt.sca(ax)
print("[plot] Plotting subplot %d" % index)
# Sets the x-axis labels
x_list = [[r[x_key] for r in result] for x_key in x_keys[index]]
x_ticks = [",".join([utils.float_to_kilo_mega(v) for v in values]) for values in zip(*x_list)]
x_location = range(len(x_ticks))
# Optional sparsifying of the labels on the x-axis
if tight_plot and len(x_location) > 10:
x_ticks = [v if not (i % 2) else "" for i, v in enumerate(x_ticks)]
# Sets the y-data
y_list = [[r[y_key] for r in result] for y_key in y_keys[index]]
y_max = max([max(y) for y in y_list])
# Sets the axes
y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200
y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding
plt.ylim(ymin=0, ymax=y_axis_limit)
plt.xticks(x_location, x_ticks, rotation='vertical')
# Sets the labels
ax.set_title(titles[index], y=1.0 - title_from_top, fontsize=font_size)
if col == 0 or y_labels[index] != y_labels[index - 1]:
ax.set_ylabel(y_labels[index])
ax.set_xlabel(x_labels[index])
ax.xaxis.set_label_coords(0.5, x_label_from_bottom)
# Plots the graph
assert len(COLORS) >= len(y_keys[index])
assert len(MARKERS) >= len(y_keys[index])
assert len(label_names) == len(y_keys[index])
for i in range(len(y_keys[index])):
ax.plot(x_location, y_list[i], MARKERS[i], label=label_names[i], color=COLORS[i])
# Sets the legend
leg = ax.legend(loc=(0.02, 1.0 - legend_from_top - legend_from_top_per_item * len(y_keys[index])),
handletextpad=0.1, labelspacing=legend_spacing, fontsize=font_size_legend)
leg.draw_frame(False)
# Saves the plot to disk
print("[benchmark] Saving plot to '" + file_name + "'")
fig.savefig(file_name, bbox_inches=bounding_box)
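The y-axis limit above adds 20% headroom and rounds up to a magnitude-dependent multiple; a worked example with an assumed peak value:

y_max = 70                                                     # illustrative data peak
y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200  # -> 10
y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding
print(y_axis_limit)                                            # 90.0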

View File

@ -0,0 +1,381 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import utils
AXPY = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "axpy", "num_runs": 40,
"title": "multiples of 256K",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 40,
"title": "multiples of 256K+1",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 40,
"title": "around 1M",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 20,
"title": "around 16M",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 20,
"title": "strides n=8M",
"x_label": "increments for x,y", "x_keys": ["incx", "incy"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1}
for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
},
{
"name": "axpy", "num_runs": 40,
"title": "powers of 2",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(32), utils.m(64))],
}
]
}
AXPYBATCHED = {
"label_names": ["CLBlast", "clBLAS (non batched)"],
"num_rows": 1, "num_cols": 3,
"benchmarks": [
{
"name": "axpybatched", "num_runs": 30,
"title": "8 batches",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(8), utils.m(4))],
},
{
"name": "axpybatched", "num_runs": 20,
"title": "64 batches",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(8), utils.m(4))],
},
{
"name": "axpybatched", "num_runs": 40,
"title": "n=512K",
"x_label": "number of batches", "x_keys": ["batch_num"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1}
for b in utils.powers_of_2(1, 512)],
}
]
}
GEMV = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 256",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 257",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 20,
"title": "around 4K",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 256 rotated",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 257 rotated",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 20,
"title": "strides n=m=4K",
"x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1}
for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
}
]
}
GEMM = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "gemm", "num_runs": 20,
"title": "multiples of 128",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 128, "num_steps": 20}],
},
{
"name": "gemm", "num_runs": 20,
"title": "multiples of 129",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102,
"transA": 111, "transB": 111, "step": 129, "num_steps": 20}],
},
{
"name": "gemm", "num_runs": 20,
"title": "around 512",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
},
{
"name": "gemm", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
},
{
"name": "gemm", "num_runs": 10,
"title": "layouts/transpose",
"x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout,
"transA": transA, "transB": transB, "step": 0, "num_steps": 1}
for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]],
},
{
"name": "gemm", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": n, "n": n, "k": n, "layout": 102,
"transA": 111, "transB": 111, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
GEMM_SMALL = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 1,
"benchmarks": [
{
"name": "gemm", "num_runs": 10,
"title": "small matrices in steps of 16",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 16, "num_steps": 57}],
},
{
"name": "gemm", "num_runs": 10,
"title": "small matrices in steps of 1",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 385}],
},
]
}
GEMMBATCHED = {
"label_names": ["CLBlast", "clBLAS (non batched)"],
"num_rows": 1, "num_cols": 3,
"benchmarks": [
{
"name": "gemmbatched", "num_runs": 40,
"title": "8 batches",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102,
"transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
},
{
"name": "gemmbatched", "num_runs": 20,
"title": "64 batches",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102,
"transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
},
{
"name": "gemmbatched", "num_runs": 30,
"title": "m=n=k=128",
"x_label": "number of batches", "x_keys": ["batch_num"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(16))],
}
]
}
SYMM = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "symm", "num_runs": 10,
"title": "multiples of 128",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "layout": 102,
"side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
},
{
"name": "symm", "num_runs": 10,
"title": "multiples of 129",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 129, "n": 129, "layout": 102,
"side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
},
{
"name": "symm", "num_runs": 10,
"title": "around 512",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 512, "n": 512, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "symm", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 2048, "n": 2048, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "symm", "num_runs": 10,
"title": "layouts/sides/triangles",
"x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 1024, "n": 1024, "layout": layout,
"side": side, "triangle": triangle, "step": 0, "num_steps": 1}
for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]],
},
{
"name": "symm", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": n, "n": n, "layout": 102,
"side": 141, "triangle": 121, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
SYRK = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "syrk", "num_runs": 10,
"title": "multiples of 128",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 128, "k": 128, "layout": 102,
"side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
},
{
"name": "syrk", "num_runs": 10,
"title": "multiples of 129",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 129, "k": 129, "layout": 102,
"side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
},
{
"name": "syrk", "num_runs": 10,
"title": "around 512",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 512, "k": 512, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "syrk", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 2048, "k": 2048, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "syrk", "num_runs": 10,
"title": "layouts/sides/triangles",
"x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 1024, "k": 1024, "layout": layout,
"triangle": triangle, "transA": transA, "step": 0, "num_steps": 1}
for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]],
},
{
"name": "syrk", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": n, "k": n, "layout": 102,
"side": 141, "triangle": 121, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
SUMMARY = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 3, "num_cols": 2,
"benchmarks": [
AXPY["benchmarks"][0],
AXPY["benchmarks"][1],
GEMV["benchmarks"][0],
GEMV["benchmarks"][1],
GEMM["benchmarks"][0],
GEMM["benchmarks"][1],
]
}
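Each dictionary in the "arguments" lists above maps one-to-one onto command-line flags of the matching client binary, via the loop in run_benchmark; a small illustration using the first AXPY sub-benchmark (output order may vary under Python 2):

arguments = {"n": 256 * 1024, "incx": 1, "incy": 1, "step": 256 * 1024, "num_steps": 16}
flags = ["-" + name + " " + str(value) for name, value in arguments.items()]
print(flags)  # e.g. ['-n 262144', '-incx 1', '-incy 1', '-step 262144', '-num_steps 16']
# appended to the -platform/-device/-precision/-runs flags and passed to ./clblast_client_xaxpy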

View File

@ -0,0 +1,66 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import csv
import subprocess
def k(value):
return value * 1024
def m(value):
return value * 1024 * 1024
def float_to_kilo_mega(value):
if value % 1024 or value <= 1024:
return "%.0f" % value
elif value % (1024 * 1024) or value <= (1024 * 1024):
return "%.0fK" % (value / 1024.0)
else:
return "%.0fM" % (value / (1024.0 * 1024.0))
def powers_of_2(start, stop):
while start <= stop:
yield start
start *= 2
def precision_to_letter(precision):
if precision == 16:
return "H"
elif precision == 32:
return "S"
elif precision == 64:
return "D"
elif precision == 3232:
return "C"
elif precision == 6464:
return "Z"
else:
return "X"
def run_binary(command, arguments):
full_command = command + " " + " ".join(arguments)
print("[benchmark] Calling binary: %s" % str(full_command))
try:
return subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
except OSError as e:
print("[benchmark] Error while running the binary, got exception: %s" + str(e))
return False
def parse_results(csv_data):
csv_data = csv_data.split("\n")
results = csv.DictReader(csv_data, delimiter=";", skipinitialspace=True)
results = [r for r in results]
for result in results:
for key in result:
result[key] = float(result[key]) if "." in result[key] else int(result[key])
return results
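To make the expected client output concrete, here is parse_results applied to a two-line semicolon-separated CSV (the header names are assumptions modeled on the keys used by the plotting scripts):

example_csv = "m; n; GFLOPS_1; GFLOPS_2\n128; 128; 937.4; 562.1"
print(parse_results(example_csv))
# [{'m': 128, 'n': 128, 'GFLOPS_1': 937.4, 'GFLOPS_2': 562.1}]  (key order may vary)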

View File

@ -29,12 +29,62 @@ VENDOR_TRANSLATION_TABLE = {
}
def remove_mismatched_arguments(database):
"""Checks for tuning results with mis-matched entries and removes them according to user preferences"""
kernel_attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]
# For Python 2 and 3 compatibility
try:
user_input = raw_input
except NameError:
user_input = input
# Check for mis-matched entries
for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes):
group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES)
if len(group_by_arguments) != 1:
print("[database] WARNING: entries for a single kernel with multiple argument values " + str(kernel_group_name))
print("[database] Either quit now, or remove all but one of the argument combinations below:")
for index, (attribute_group_name, mismatching_entries) in enumerate(group_by_arguments):
print("[database] %d: %s" % (index, attribute_group_name))
for attribute_group_name, mismatching_entries in group_by_arguments:
response = user_input("[database] Remove entries corresponding to %s, [y/n]? " % str(attribute_group_name))
if response == "y":
for entry in mismatching_entries:
database["sections"].remove(entry)
print("[database] Removed %d entry/entries" % len(mismatching_entries))
# Sanity-check: all mis-matched entries should be removed
for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes):
group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES)
if len(group_by_arguments) != 1:
print("[database] ERROR: entries for a single kernel with multiple argument values " + str(kernel_group_name))
assert len(group_by_arguments) == 1
def remove_database_entries(database, remove_if_matches_fields):
assert len(remove_if_matches_fields.keys()) > 0
def remove_this_entry(section):
for key in remove_if_matches_fields.keys():
if section[key] != remove_if_matches_fields[key]:
return False
return True
old_length = len(database["sections"])
database["sections"] = [x for x in database["sections"] if not remove_this_entry(x)]
new_length = len(database["sections"])
print("[database] Removed %d entries from the database" % (old_length - new_length))
def main(argv):
# Parses the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database")
parser.add_argument("clblast_root", help="Root of the CLBlast sources")
parser.add_argument("-r", "--remove_device", type=str, default=None, help="Removes all entries for a specific device")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
@ -76,10 +126,19 @@ def main(argv):
new_size = db.length(database)
print("with " + str(new_size - old_size) + " new items") # Newline printed here
# Checks for tuning results with mis-matched entries
remove_mismatched_arguments(database)
# Stores the modified database back to disk
if len(glob.glob(json_files)) >= 1:
io.save_database(database, database_filename)
# Removes database entries before continuing
if cl_args.remove_device is not None:
print("[database] Removing all results for device '%s'" % cl_args.remove_device)
remove_database_entries(database, {"device": cl_args.remove_device})
io.save_database(database, database_filename)
# Retrieves the best performing results
print("[database] Calculating the best results per device/kernel...")
database_best_results = bests.get_best_results(database)
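A toy example of the new -r/--remove_device path, exercising remove_database_entries directly (device names are illustrative):

database = {"sections": [{"device": "GeForce GTX 750 Ti", "kernel": "Xgemm"},
                         {"device": "Iris Pro", "kernel": "Xgemm"}]}
remove_database_entries(database, {"device": "Iris Pro"})  # prints: Removed 1 entries from the database
print(len(database["sections"]))                           # 1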

View File

@ -123,7 +123,7 @@ def print_cpp_database(database, output_dir):
devices = sorted(set([s["device"] for s in type_database]))
for device_name in devices:
device_database = [s for s in type_database if s["device"] == device_name]
device_name_quoted = "\"%s\"," % device_name
device_name_quoted = "\"%s\"," % device_name.strip()
device_name_cpp = " { %-50s { " % device_name_quoted
f.write(device_name_cpp)
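The added .strip() guards against OpenCL device names reported with trailing whitespace, which would otherwise end up inside the generated C++ string literal; for instance (the trailing space is illustrative):

device_name = "GeForce GTX 750 Ti "
print("\"%s\"," % device_name.strip())  # "GeForce GTX 750 Ti",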

View File

@ -5,6 +5,9 @@
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import itertools
from operator import itemgetter
import clblast
@ -62,3 +65,14 @@ def combine_result(old_results, new_result):
# No match found: append a new result
old_results.append(new_result)
return old_results
def group_by(database, attributes):
"""Returns an list with the name of the group and the corresponding entries in the database"""
assert len(database) > 0
attributes = [a for a in attributes if a in database[0]]
database.sort(key=itemgetter(*attributes))
result = []
for key, data in itertools.groupby(database, key=itemgetter(*attributes)):
result.append((key, list(data)))
return result
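The grouping helper sorts by the requested attributes and then buckets consecutive equal keys; a minimal example on a toy database (field values are illustrative):

sections = [{"kernel_family": "xgemm", "device": "B"},
            {"kernel_family": "xgemm", "device": "A"},
            {"kernel_family": "xaxpy", "device": "A"}]
for name, entries in group_by(sections, ["kernel_family"]):
    print(name, len(entries))
# xaxpy 1
# xgemm 2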

View File

@ -56,5 +56,11 @@ def load_tuning_results(filename):
assert json_data["precision"] == str(result["parameters"]["PRECISION"])
result["parameters"].pop("PRECISION", None)
# Fixes the scalar argument values
for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]):
for field in ["arg_alpha", "arg_beta"]:
if field in json_data.keys() and json_data[field] == value:
json_data[field] = replacement
# All done
return json_data
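Spelled out on an assumed tuner record, the scalar fix above rewrites the two legacy spellings in place:

json_data = {"arg_alpha": "2.00", "arg_beta": "2.00+0.50i"}
for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]):
    for field in ["arg_alpha", "arg_beta"]:
        if field in json_data.keys() and json_data[field] == value:
            json_data[field] = replacement
print(json_data)  # {'arg_alpha': '2.000000', 'arg_beta': '2+0.5i'}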

View File

@ -38,11 +38,14 @@ FILES = [
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
"/test/wrapper_cublas.hpp",
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
HEADER_LINES = [122, 77, 126, 24, 29, 41, 29, 65, 32]
FOOTER_LINES = [25, 139, 27, 38, 6, 6, 6, 9, 2]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 63
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -99,65 +102,69 @@ bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
Routine(False, True, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, False, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, False, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, False, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, False, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, False, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, False, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, False, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
Routine(True, True, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, False, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, False, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, False, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, False, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, False, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, False, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(True, True, False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, False, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, False, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, False, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, False, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, False, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, False, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, False, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, False, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, False, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, False, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(True, True, False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
# Special routines:
Routine(True, True, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
# Batched routines:
Routine(True, True, True, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
Routine(True, True, True, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
]]
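# Illustrative sketch (not verbatim generator output): given the batch-aware rules in
# routine.py further below (b_star()/b_s() and batch_count_def()), a batched entry such as
# the S flavour of the AXPY batch routine is expected to produce a C API declaration
# along these lines:
#
#   CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
#                                         const cl_float *alphas,
#                                         const cl_mem x_buffer, const size_t *x_offsets,
#                                         const size_t x_inc,
#                                         cl_mem y_buffer, const size_t *y_offsets,
#                                         const size_t y_inc,
#                                         const size_t batch_count,
#                                         cl_command_queue* queue, cl_event* event)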
@ -188,7 +195,7 @@ def main(argv):
# Re-writes the body of the file
with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
levels = [1, 2, 3] if (i == 4 or i == 5 or i == 6) else [1, 2, 3, 4]
for level in levels:
body += cpp.LEVEL_SEPARATORS[level - 1] + "\n"
for routine in ROUTINES[level - 1]:
@ -205,9 +212,13 @@ def main(argv):
if i == 5:
body += cpp.wrapper_cblas(routine)
if i == 6:
body += cpp.clblast_netlib_c_h(routine)
body += cpp.wrapper_cublas(routine)
if i == 7:
body += cpp.clblast_netlib_c_cc(routine)
if not routine.batched:
body += cpp.clblast_netlib_c_h(routine)
if i == 8:
if not routine.batched:
body += cpp.clblast_netlib_c_cc(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
@ -217,7 +228,7 @@ def main(argv):
for routine in ROUTINES[level - 1]:
if routine.has_tests:
level_string = cpp.LEVEL_NAMES[level - 1]
routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp"
routine_suffix = "level" + level_string + "/x" + routine.lowercase_name() + ".cpp"
# Correctness tests
filename = library_root + "/test/correctness/routines/" + routine_suffix
@ -233,11 +244,20 @@ def main(argv):
f.write(cpp.performance_test(routine, level_string))
f.write(cpp.FOOTER)
# Outputs the API documentation
# API documentation
filename = cl_args.clblast_root + "/doc/clblast.md"
# Stores the header and the footer of the original documentation file
with open(filename) as f:
original = f.readlines()
file_header = original[:HEADER_LINES_DOC]
file_footer = original[-FOOTER_LINES_DOC:]
# Outputs the API documentation
with open(filename, "w") as f:
# Outputs the header
f.write("".join(file_header))
doc_header = doc.header()
f.write(doc_header)
@ -248,5 +268,8 @@ def main(argv):
doc_routine = doc.generate(routine)
f.write(doc_routine)
# Outputs the footer
f.write("".join(file_footer))
if __name__ == '__main__':
main(sys.argv[1:])

View File

@ -56,6 +56,19 @@ def option_to_cblas(x):
}[x]
def option_to_cublas(x):
"""As above, but for clBLAS data-types"""
return {
'layout': "Layout",
'a_transpose': "cublasOperation_t",
'b_transpose': "cublasOperation_t",
'ab_transpose': "cublasOperation_t",
'side': "cublasSideMode_t",
'triangle': "cublasFillMode_t",
'diagonal': "cublasDiagType_t",
}[x]
def option_to_documentation(x):
"""Translates an option name to a documentation string"""
return {

View File

@ -51,8 +51,10 @@ def clblast_cc(routine):
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL
if routine.batched:
result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL
result += " routine.Do" + routine.capitalized_name() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
@ -63,7 +65,7 @@ def clblast_cc(routine):
result += "}" + NL
for flavour in routine.flavours:
indent2 = " " * (34 + routine.length() + len(flavour.template))
result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">("
result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
return result
@ -84,9 +86,11 @@ def clblast_c_cc(routine):
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (16 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 27, "") + " {" + NL
if routine.batched:
result += " " + (NL + " ").join(routine.batched_transform_to_complex(flavour)) + NL
result += " try {" + NL
result += " return static_cast<CLBlastStatusCode>(" + NL
result += " clblast::" + routine.name.capitalize() + template + "("
result += " clblast::" + routine.capitalized_name() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event)" + NL
result += " );" + NL
@ -286,14 +290,69 @@ def wrapper_cblas(routine):
return result
def wrapper_cublas(routine):
"""The wrapper to the reference cuBLAS routines (for performance/correctness testing)"""
result = ""
if routine.has_tests:
result += NL + "// Forwards the cuBLAS calls for %s" % routine.short_names_tested() + NL
if routine.no_scalars():
result += routine.routine_header_wrapper_cublas(routine.template, True, 23) + ";" + NL
for flavour in routine.flavours:
result += routine.routine_header_wrapper_cublas(flavour, False, 23) + " {" + NL
# There is a version available in cuBLAS
if flavour.precision_name in ["S", "D", "C", "Z"]:
indent = " " * (24 + routine.length())
arguments = routine.arguments_wrapper_cublas(flavour)
# Handles row-major
if routine.has_layout():
result += " if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }" + NL
# Complex scalars
for scalar in routine.scalars:
if flavour.is_complex(scalar):
cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex"
result += " " + cuda_complex + " " + scalar + "_cuda;" + NL
result += " " + scalar + "_cuda.x = " + scalar + ".real();" + NL
result += " " + scalar + "_cuda.y = " + scalar + ".imag();" + NL
# Calls the cuBLAS routine
result += " auto status = cublas" + flavour.name_cublas() + routine.name + "(handle, "
result += ("," + NL + indent).join([a for a in arguments]) + ");" + NL
result += " cudaDeviceSynchronize();" + NL
result += " return status;"
# There is no half-precision version available in cuBLAS: report as unsupported (a
# conversion-to-float fall-back is left commented out below)
else: # Half-precision
result += " return CUBLAS_STATUS_NOT_SUPPORTED;"
# indent = " " * (24 + routine.length())
# # Convert to float (note: also integer buffers are stored as half/float)
# for buf in routine.inputs + routine.outputs:
# result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
# # Call the float routine
# result += " return cublasX" + routine.name + "(handle,"
# result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + ");" + NL
# result += " cudaDeviceSynchronize();" + NL
# result += " return status;"
# # Convert back to half
# for buf in routine.outputs:
# result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
# result += " return status;"
# Complete
result += NL + "}" + NL
return result
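# Illustrative sketch (assuming the S flavour of GEMM): the code above is expected to
# generate a wrapper roughly like the following, with the row-major guard, the cuBLAS
# call and the device synchronisation in place:
#
#   cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout,
#                              const cublasOperation_t a_transpose,
#                              const cublasOperation_t b_transpose,
#                              const size_t m, const size_t n, const size_t k,
#                              const float alpha,
#                              const float* a_buffer, const size_t a_offset, const size_t a_ld,
#                              const float* b_buffer, const size_t b_offset, const size_t b_ld,
#                              const float beta,
#                              float* c_buffer, const size_t c_offset, const size_t c_ld) {
#     if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
#     auto status = cublasSgemm(handle, a_transpose, b_transpose,
#                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
#                               &alpha, &a_buffer[a_offset], a_ld,
#                               &b_buffer[b_offset], b_ld,
#                               &beta, &c_buffer[c_offset], c_ld);
#     cudaDeviceSynchronize();
#     return status;
#   }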
def performance_test(routine, level_string):
"""Generates the body of a performance test for a specific routine"""
result = ""
result += "#include \"test/performance/client.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);" + NL
@ -304,7 +363,7 @@ def performance_test(routine, level_string):
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
result += NL + " clblast::RunClient<clblast::TestX" + routine.name + flavour.test_template()
result += NL + " clblast::RunClient<clblast::TestX" + routine.plain_name() + flavour.test_template()
result += ">(argc, argv); break;" + NL
found = True
if not found:
@ -319,17 +378,14 @@ def correctness_test(routine, level_string):
"""Generates the body of a correctness test for a specific routine"""
result = ""
result += "#include \"test/correctness/testblas.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " auto errors = size_t{0};" + NL
not_first = "false"
for flavour in routine.flavours:
result += " errors += clblast::RunTests<clblast::TestX" + routine.name + flavour.test_template()
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.name.upper() + "\");" + NL
result += " errors += clblast::RunTests<clblast::TestX" + routine.plain_name() + flavour.test_template()
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.upper_name() + "\");" + NL
not_first = "true"
result += " if (errors > 0) { return 1; } else { return 0; }" + NL
result += "}" + NL

View File

@ -30,17 +30,17 @@ class DataType:
self.beta_cl = scalars[3]
self.buffer_type = buffer_type
def use_alpha(self):
def use_alpha(self, postfix=""):
"""Outputs the name of the data-type (alpha/beta), possibly transforming into the right type"""
if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}"
return "alpha"
return self.alpha_cpp + "{alpha" + postfix + ".s[0], alpha" + postfix + ".s[1]}"
return "alpha" + postfix
def use_beta(self):
def use_beta(self, postfix=""):
"""As above, but for beta instead of alpha"""
if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.beta_cpp + "{beta.s[0], beta.s[1]}"
return "beta"
return self.beta_cpp + "{beta" + postfix + ".s[0], beta" + postfix + ".s[1]}"
return "beta" + postfix
def use_alpha_opencl(self):
"""As above, but the transformation is in the opposite direction"""
@ -72,9 +72,11 @@ class DataType:
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
buffer_type = "clblast::" + self.buffer_type if self.is_non_standard() else self.buffer_type
beta_cpp = "clblast::" + self.beta_cpp if self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2] else self.beta_cpp
if self.buffer_type != self.beta_cpp:
return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp
return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp
return "<" + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp
return "<" + buffer_type + ">, " + buffer_type + ", " + beta_cpp
def is_complex(self, scalar):
"""Current scalar is complex"""
@ -85,6 +87,11 @@ class DataType:
"""Current type is of a non-standard type"""
return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2]
def name_cublas(self):
"""Returns the flavour name as used in cuBLAS routine names, e.g. 'iS' becomes 'Is' (as in cublasIsamax)"""
if "i" in self.name:
return "I" + self.name[1].lower()
return self.name
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)

View File

@ -20,7 +20,7 @@ def generate(routine):
result = ""
# Routine header
result += "x" + routine.name.upper() + ": " + routine.description + NL
result += "x" + routine.upper_name() + ": " + routine.description + NL
result += "-------------" + NL + NL
result += routine.details + NL + NL
@ -36,7 +36,7 @@ def generate(routine):
result += "```" + NL + NL
# Routine arguments
result += "Arguments to " + routine.name.upper() + ":" + NL + NL
result += "Arguments to " + routine.upper_name() + ":" + NL + NL
for argument in routine.arguments_doc():
result += "* " + argument + NL
result += "* `cl_command_queue* queue`: "
@ -47,7 +47,7 @@ def generate(routine):
# Routine requirements
if len(routine.requirements_doc()) > 0:
result += "Requirements for " + routine.name.upper() + ":" + NL + NL
result += "Requirements for " + routine.upper_name() + ":" + NL + NL
for requirement in routine.requirements_doc():
result += "* " + requirement + NL
result += NL

View File

@ -12,11 +12,12 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
def __init__(self, implemented, has_tests, batched, level, name, template, flavours, sizes, options,
inputs, outputs, buffer_sizes, scalars, scratch,
description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.batched = batched
self.level = level
self.name = name
self.template = template
@ -32,6 +33,69 @@ class Routine:
self.details = details
self.requirements = requirements
def lowercase_name(self):
postfix = "batched" if self.batched else ""
return self.name + postfix
def plain_name(self):
postfix = "Batched" if self.batched else ""
return self.name + postfix
def capitalized_name(self):
postfix = "Batched" if self.batched else ""
return self.name.capitalize() + postfix
def upper_name(self):
postfix = "BATCHED" if self.batched else ""
return self.name.upper() + postfix
def b_star(self):
return "*" if self.batched else ""
def b_s(self):
return "s" if self.batched else ""
def batch_count_def(self):
return ["const size_t batch_count"] if self.batched else []
def batch_count_list(self):
return ["batch_count"] if self.batched else []
def batch_count_type(self):
return ["const size_t"] if self.batched else []
def batch_count_doc(self):
return ["`const size_t batch_count`: Number of batches. This value must be positive."] if self.batched else []
def batched_transform_to_cpp(self):
result = []
for scalar in self.scalars:
result.append("auto " + scalar + "s_cpp = std::vector<T>();")
for buffer_name in self.inputs + self.outputs:
result.append("auto " + buffer_name + "_offsets_cpp = std::vector<size_t>();")
result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {")
for scalar in self.scalars:
result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);")
for buffer_name in self.inputs + self.outputs:
result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);")
result.append("}")
return result
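# For instance, for the batched version of AXPY (scalar: alpha; buffers: x, y) this
# emits the following C++ lines:
#   auto alphas_cpp = std::vector<T>();
#   auto x_offsets_cpp = std::vector<size_t>();
#   auto y_offsets_cpp = std::vector<size_t>();
#   for (auto batch = size_t{0}; batch < batch_count; ++batch) {
#     alphas_cpp.push_back(alphas[batch]);
#     x_offsets_cpp.push_back(x_offsets[batch]);
#     y_offsets_cpp.push_back(y_offsets[batch]);
#   }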
def batched_transform_to_complex(self, flavour):
result = []
for scalar in self.scalars:
result.append("auto " + scalar + "s_cpp = std::vector<" + flavour.buffer_type + ">();")
result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {")
for scalar in self.scalars:
content = scalar
if scalar == "alpha":
content = flavour.use_alpha(postfix="s[batch]")
elif scalar == "beta":
content = flavour.use_beta(postfix="s[batch]")
result.append(" " + scalar + "s_cpp.push_back(" + content + ");")
result.append("}")
return result
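# Same idea as above, but here the per-batch scalars are also converted from their
# C API representation, e.g. for a single-precision complex flavour:
#   alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});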
@staticmethod
def scalar_buffers_first():
"""List of scalar buffers"""
@ -127,21 +191,25 @@ class Routine:
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
return len(self.capitalized_name())
def no_scalars(self):
"""Determines whether or not this routine has scalar arguments (alpha/beta)"""
return self.scalars == []
def has_layout(self):
"""Determines whether the layout is an argument"""
return "layout" in self.options
def short_names(self):
"""Returns the upper-case names of these routines (all flavours)"""
return "/".join([f.name + self.name.upper() for f in self.flavours])
return "/".join([f.name + self.upper_name() for f in self.flavours])
def short_names_tested(self):
"""As above, but excludes some"""
names = [f.name + self.name.upper() for f in self.flavours]
if "H" + self.name.upper() in names:
names.remove("H" + self.name.upper())
names = [f.name + self.upper_name() for f in self.flavours]
if "H" + self.upper_name() in names:
names.remove("H" + self.upper_name())
return "/".join(names)
def buffers_first(self):
@ -159,7 +227,7 @@ class Routine:
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer"]
b = [name + "_offset"]
b = [name + "_offset" + self.b_s()]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -187,13 +255,13 @@ class Routine:
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "cl_mem " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_wrapper_cl(self, name, flavour):
"""As above but with data-types"""
"""As above but for OpenCL"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
@ -202,6 +270,16 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_def_wrapper_cuda(self, name, flavour):
"""As above but for CUDA"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_vector(self, name, flavour):
"""As above but as vectors"""
prefix = "const " if name in self.inputs else ""
@ -228,7 +306,7 @@ class Routine:
if name in self.inputs or name in self.outputs:
buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
b = [name + "_offset"]
b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -265,12 +343,38 @@ class Routine:
return [", ".join(a + c)]
return []
def buffer_wrapper_cublas(self, name, flavour):
"""As above but for cuBLAS the wrapper"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
if name in self.index_buffers():
a = ["reinterpret_cast<int*>(&" + name + "_buffer[" + name + "_offset])"]
elif name in self.outputs and flavour.name in ["Sc", "Dz"]:
dtype = "float" if flavour.name == "Sc" else "double"
a = ["reinterpret_cast<" + dtype + "*>(&" + name + "_buffer[" + name + "_offset])"]
elif flavour.precision_name in ["C", "Z"]:
cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex"
a = ["reinterpret_cast<" + prefix + cuda_complex + "*>" +
"(&" + name + "_buffer[" + name + "_offset])"]
else:
a = ["&" + name + "_buffer[" + name + "_offset]"]
c = []
if name in ["x", "y"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)]
result = [", ".join(a + c)]
if self.name == "trmm" and name == "a":
result *= 2
return result
return []
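# For example, a single-precision complex input matrix 'a' is passed to cuBLAS as
# "reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld", while the vector
# increments of 'x'/'y' are cast to int as cuBLAS expects.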
def buffer_type(self, name):
"""As above, but only data-types"""
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix + "cl_mem"]
b = ["const size_t"]
b = ["const size_t" + self.b_star()]
c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -283,18 +387,19 @@ class Routine:
math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector"
inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."]
b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."]
c = []
if name not in self.buffers_without_ld_inc():
c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."]
else:
c = []
return a + b + c
return []
def scalar(self, name):
"""Retrieves the name of a scalar (alpha/beta)"""
if name in self.scalars:
if self.batched:
return [name + "s_cpp"]
return [name]
return []
@ -314,8 +419,12 @@ class Routine:
"""Retrieves the use of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
if self.batched:
return ["alphas_cpp.data()"]
return [flavour.use_alpha()]
elif name == "beta":
if self.batched:
return ["betas_cpp.data()"]
return [flavour.use_beta()]
return [name]
return []
@ -338,20 +447,28 @@ class Routine:
return [name]
return []
def scalar_use_wrapper_cublas(self, name, flavour):
"""As above, but for the cuBLAS wrapper"""
if name in self.scalars:
if flavour.is_complex(name):
return ["&" + name + "_cuda"]
return ["&" + name]
return []
def scalar_def(self, name, flavour):
"""Retrieves the definition of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cl + " " + name]
return ["const " + flavour.beta_cl + " " + name]
return ["const " + flavour.alpha_cl + " " + self.b_star() + name + self.b_s()]
return ["const " + flavour.beta_cl + " " + self.b_star() + name + self.b_s()]
return []
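# With the batched markers in place this turns e.g. "const cl_float alpha" into
# "const cl_float *alphas" for the batched C API: one scalar per batch.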
def scalar_def_plain(self, name, flavour):
"""As above, but without 'cl_' prefix"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp + " " + name]
return ["const " + flavour.beta_cpp + " " + name]
return ["const " + flavour.alpha_cpp + " " + self.b_star() + name + self.b_s()]
return ["const " + flavour.beta_cpp + " " + self.b_star() + name + self.b_s()]
return []
def scalar_def_void(self, name, flavour):
@ -368,16 +485,16 @@ class Routine:
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp]
return ["const " + flavour.beta_cpp]
return ["const " + flavour.alpha_cpp + self.b_star()]
return ["const " + flavour.beta_cpp + self.b_star()]
return []
def scalar_doc(self, name):
"""Retrieves the documentation of a scalar"""
if name in self.scalars:
if name == "alpha":
return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."]
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return ["`const " + self.template.alpha_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."]
return ["`const " + self.template.beta_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."]
return []
def scalar_create_cpp(self, flavour):
@ -396,6 +513,12 @@ class Routine:
return [", ".join([s for s in self.sizes])]
return []
def sizes_list_as_int(self):
"""Retrieves a list of comma-separated sizes (m, n, k) cast to integers"""
if self.sizes:
return [", ".join(["static_cast<int>(" + s + ")" for s in self.sizes])]
return []
def sizes_def(self):
"""Retrieves the definition of the sizes (m,n,k)"""
if self.sizes:
@ -427,6 +550,15 @@ class Routine:
return [", ".join(self.options)]
return []
def options_list_no_layout(self):
"""Retrieves a list of options"""
options = self.options[:]
if "layout" in options:
options.remove("layout")
if options:
return [", ".join(options)]
return []
def options_cast(self, indent):
"""As above, but now casted to CLBlast data-types"""
if self.options:
@ -462,6 +594,13 @@ class Routine:
return [", ".join(definitions)]
return []
def options_def_wrapper_cublas(self):
"""As above, but now using cuBLAS data-types"""
if self.options:
definitions = ["const " + convert.option_to_cublas(o) + " " + o for o in self.options]
return [", ".join(definitions)]
return []
def options_type(self):
"""Retrieves the types of the options (layout, transpose, side, etc.)"""
if self.options:
@ -507,7 +646,8 @@ class Routine:
self.scalar("beta") +
list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
list(chain(*[self.scalar(s) for s in self.other_scalars()])) +
self.batch_count_list())
def arguments_cast(self, flavour, indent):
"""As above, but with CLBlast casts"""
@ -518,7 +658,8 @@ class Routine:
self.scalar_use("beta", flavour) +
list(chain(*[self.buffer(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])) +
self.batch_count_list())
def arguments_netlib(self, flavour, indent):
"""As above, but for the Netlib CBLAS API"""
@ -544,7 +685,7 @@ class Routine:
def arguments_wrapper_cblas(self, flavour):
"""As above, but for the CBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
return (self.options_list() + self.sizes_list_as_int() +
self.scalar_use_wrapper_cblas("alpha", flavour) +
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) +
self.scalar_use_wrapper_cblas("beta", flavour) +
@ -552,6 +693,17 @@ class Routine:
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()])))
def arguments_wrapper_cublas(self, flavour):
"""As above, but for the cuBLAS wrapper"""
return (self.options_list_no_layout() + self.sizes_list_as_int() +
self.scalar_use_wrapper_cublas("alpha", flavour) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_first()])) +
self.scalar_use_wrapper_cublas("beta", flavour) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_first()])) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper_cublas(s, flavour) for s in self.other_scalars()])))
def arguments_def(self, flavour):
"""Retrieves a combination of all the argument definitions"""
return (self.options_def() + self.sizes_def() +
@ -561,7 +713,8 @@ class Routine:
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) +
self.batch_count_def())
def arguments_def_netlib(self, flavour):
"""As above, but for the Netlib CBLAS API"""
@ -574,6 +727,7 @@ class Routine:
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
if self.name in self.routines_scalar_no_return():
result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
result += self.batch_count_def()
return result
def arguments_def_c(self, flavour):
@ -585,7 +739,8 @@ class Routine:
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) +
self.batch_count_def())
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
@ -609,6 +764,17 @@ class Routine:
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_cublas(self, flavour):
"""As above, but cuBLAS wrapper plain data-types"""
return (self.options_def_wrapper_cublas() + self.sizes_def() +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_first()])) +
self.scalar_def_plain("alpha", flavour) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_plain("beta", flavour) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_type(self, flavour):
"""Retrieves a combination of all the argument types"""
return (self.options_type() + self.sizes_type() +
@ -618,7 +784,8 @@ class Routine:
self.scalar_type("beta", flavour) +
list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])) +
self.batch_count_type())
def arguments_doc(self):
"""Retrieves a combination of all the argument types"""
@ -630,7 +797,8 @@ class Routine:
self.scalar_doc("beta") +
list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])))
list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])) +
self.batch_count_doc())
def requirements_doc(self):
"""Retrieves a list of routine requirements for documentation"""
@ -640,7 +808,7 @@ class Routine:
"""Retrieves the C++ templated definition for a routine"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += "StatusCode " + self.capitalized_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
return result
@ -649,7 +817,7 @@ class Routine:
"""As above, but now without variable names"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += "StatusCode " + self.capitalized_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
result += ",\n" + indent + "cl_command_queue*, cl_event*)"
return result
@ -657,7 +825,7 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.plain_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
@ -677,6 +845,8 @@ class Routine:
if self.name in self.routines_scalar_no_return():
routine_name += "_sub"
indent += " "
if self.batched:
routine_name += "batched"
result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
return result
@ -703,3 +873,17 @@ class Routine:
result = "void cblasX" + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")"
return result
def routine_header_wrapper_cublas(self, flavour, def_only, spaces):
"""As above, but now for the cuBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""
indent = " " * (spaces + self.length() + len(template))
result = ""
if self.no_scalars():
result += "template <"
if def_only:
result += flavour.name
result += ">\n"
result += "cublasStatus_t cublasX" + self.name + template + "(cublasHandle_t handle, "
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cublas(flavour)]) + ")"
return result

View File

@ -1,262 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the common performance scripts, such as creating a graph.
#
# ==================================================================================================
# Colours
black = "#000000"
grey = "#888888"
purplish = "#550077" # [ 85, 0,119] lumi=26
blueish = "#4765b1" # [ 71,101,177] lumi=100
redish = "#d67568" # [214,117,104] lumi=136
greenish = "#9bd4ca" # [155,212,202] lumi=199
# Sets the graph markers (circles, triangles, etc.)
pchs = c(15, 18, 17, 12)
# Other constants
kilo = 1024
mega = 1024*1024
# R options
options("width"=170)
# ==================================================================================================
# Settings
num_runs <- 5
num_runs_short <- 50
xtics_subset_threshold <- 100
xtics_subset_stepsize <- 8
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv -cblas 0"
# Command-line arguments
command_line <- commandArgs(trailingOnly=TRUE)
if (length(command_line) != 2) {
print("Usage for device Z on platform Y: Rscript xxxxx.r Y Z")
quit()
}
platform_id <- command_line[1]
device_id <- command_line[2]
# Selects the device
devices_values <- c(platform_id, device_id)
devices_string <- paste(devices, devices_values, collapse=" ")
# Filter the string: only lines containing a ";" can be valid lines
filter_string <- function(raw_result_string) {
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <- c(result_string, line)
}
}
return(result_string)
}
# ==================================================================================================
# The main function
main <- function(routine_name, precision, test_names, test_values,
test_xlabels, test_xaxis, metric_gflops) {
# Names
display_name <- toupper(routine_name)
if (precision == 16) { display_name <- gsub("^X","H",display_name); }
if (precision == 32) { display_name <- gsub("^X","S",display_name); }
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./clblast_client_", routine_name, sep="")
# Display
library_names <- c("CLBlast", "clBLAS")
if (precision == 16) { library_names <- c("CLBlast FP16", "CLBlast FP32", "clBLAS FP32"); }
colourset <- c(blueish, redish)
if (precision == 16) { colourset <- c(blueish, purplish, redish); }
# Configures the outputfile
file_name <- paste(display_name, ".pdf", sep="")
if (length(test_names) == 6) {
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
else { # length(test_names) == 2
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 1))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
# Loops over the test-cases
for (test_id in 1:length(test_names)) {
params_values <- test_values[[test_id]]
# Loops over the commands within a single list (within a case)
for (command_id in 1:length(params_values)) {
# Runs the client and captures the result
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")
# For half-precision: also runs the FP32 version for comparison
if (precision == 16) {
params_string <- gsub("-precision 16", "-precision 32", params_string)
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db_32 <- read.csv(text=result_string, sep=";")
stopifnot(nrow(command_db) == nrow(command_db_32))
# Combines the results
command_db["ms_FP32_1"] = command_db_32$ms_1
command_db["GFLOPS_FP32_1"] = command_db_32$GFLOPS_1
command_db["GBs_FP32_1"] = command_db_32$GBs_1
command_db["ms_FP32_2"] = command_db_32$ms_2
command_db["GFLOPS_FP32_2"] = command_db_32$GFLOPS_2
command_db["GBs_FP32_2"] = command_db_32$GBs_2
}
# Append the results to the final dataframe
if (command_id == 1) {
db <- command_db
} else {
db <- rbind(db, command_db)
}
}
print(db)
# Sets the values on the x-axis and their labels (test dependent)
if (is.character(test_xaxis[[test_id]][[1]])) {
xdata <- db[,test_xaxis[[test_id]][[1]]]
xtics <- xdata
log_scale <- test_xaxis[[test_id]][[2]]
}
else {
xdata <- test_xaxis[[test_id]][[1]]
xtics <- test_xaxis[[test_id]][[2]]
log_scale <- ""
}
# Plots the graph with GFLOPS on the Y-axis
if (metric_gflops) {
if (precision == 16) {
ydata = list(db$GFLOPS_1, db$GFLOPS_FP32_1, db$GFLOPS_FP32_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_FP32_1), max(db$GFLOPS_FP32_2))
} else {
ydata = list(db$GFLOPS_1, db$GFLOPS_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=50, experiment_names=library_names, colourset=colourset)
# Plots the graph with GB/s on the Y-axis
} else {
if (precision == 16) {
ydata = list(db$GBs_1, db$GBs_FP32_1, db$GBs_FP32_2)
ymax = max(max(db$GBs_1), max(db$GBs_FP32_1), max(db$GBs_FP32_2))
} else {
ydata = list(db$GBs_1, db$GBs_2)
ymax = max(max(db$GBs_1), max(db$GBs_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=10, experiment_names=library_names, colourset=colourset)
}
}
}
# ==================================================================================================
# Plots data
plot_graph <- function(xdata, ydata, log_setting,
xmin, xmax, ymin, ymax,
xtics, xlabel, ylabel,
graph_title,
multiple, experiment_names, colourset) {
# Update the ymax to the next multiple of something
ymax <- multiple*ceiling(ymax/multiple)
# Add kilo or mega to the x-labels
for (i in 1:length(xtics)) {
if (!is.na(as.numeric(xtics[i]))) {
if (as.numeric(xtics[i])%%mega == 0) {
xtics[i] <- paste(as.character(as.numeric(xtics[i])/mega), "M", sep="")
} else if (as.numeric(xtics[i])%%kilo == 0) {
xtics[i] <- paste(as.character(as.numeric(xtics[i])/kilo), "K", sep="")
}
}
}
# Creates an initial graph with axis but without data
par(new=F)
plot(x=xmin:xmax, y=rep(1, length(xmin:xmax)), log=log_setting,
main="", xlab="", ylab="",
ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
axis(side=2, las=2)
if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth
subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
} else {
axis(side=1, at=xdata, labels=xtics, las=2)
}
title(xlab=xlabel, line=-1)
title(ylab=ylabel, line=2)
title(graph_title, line=-2)
par(new=T)
# Loops over all experiments
num_experiments <- length(ydata)
for (id in 1:num_experiments) {
# Plots the data for this experiment
plot(x=xdata, y=ydata[[id]], log=log_setting,
col=colourset[id], pch=pchs[id], lty=1, lwd=1, cex=1,
xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax),
axes=F, "b", xpd=T)
par(new=T)
}
# Add a legend
legend("bottomright", experiment_names,
lwd=1, ncol=1, col=colourset, pch=pchs, lty=1, cex=1,
bty="n", xpd=T)
# Done
par(new=F)
}
# ==================================================================================================

View File

@ -1,96 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xaxpy routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xaxpy"
parameters <- c("-n","-incx","-incy",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 256K",
"multiples of 256K (+1)",
"around n=1M",
"around n=16M",
"strides (n=8M)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(256*kilo, 1, 1, 16, 256*kilo, num_runs, precision)),
list(c(256*kilo+1, 1, 1, 16, 256*kilo, num_runs, precision)),
list(c(1*mega, 1, 1, 16, 1, num_runs, precision)),
list(c(16*mega, 1, 1, 16, 1, num_runs, precision)),
list(
c(8*mega, 1, 1, 1, 0, num_runs, precision),
c(8*mega, 2, 1, 1, 0, num_runs, precision),
c(8*mega, 4, 1, 1, 0, num_runs, precision),
c(8*mega, 8, 1, 1, 0, num_runs, precision),
c(8*mega, 1, 2, 1, 0, num_runs, precision),
c(8*mega, 1, 4, 1, 0, num_runs, precision),
c(8*mega, 1, 8, 1, 0, num_runs, precision),
c(8*mega, 2, 2, 1, 0, num_runs, precision),
c(8*mega, 4, 4, 1, 0, num_runs, precision),
c(8*mega, 8, 8, 1, 0, num_runs, precision)
),
list(
c(32*kilo, 1, 1, 1, 0, num_runs, precision),
c(64*kilo, 1, 1, 1, 0, num_runs, precision),
c(128*kilo, 1, 1, 1, 0, num_runs, precision),
c(256*kilo, 1, 1, 1, 0, num_runs, precision),
c(512*kilo, 1, 1, 1, 0, num_runs, precision),
c(1*mega, 1, 1, 1, 0, num_runs, precision),
c(2*mega, 1, 1, 1, 0, num_runs, precision),
c(4*mega, 1, 1, 1, 0, num_runs, precision),
c(8*mega, 1, 1, 1, 0, num_runs, precision),
c(16*mega, 1, 1, 1, 0, num_runs, precision),
c(32*mega, 1, 1, 1, 0, num_runs, precision),
c(64*mega, 1, 1, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"increments/strides for x and y",
"vector sizes (n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xgemm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=k=512",
"around m=n=k=2048",
"layouts and transposing (m=n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"layout (row/col), transA (n/y), transB (n/y)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,n,n", "row,n,y", "row,y,n", "row,y,y",
"col,n,n", "col,n,y", "col,y,n", "col,y,y")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,56 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"small matrices in steps of 16",
"small matrices in steps of 1"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)),
list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision))
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", "")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,83 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xgemv routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemv"
parameters <- c("-n","-m","-incx","-incy","-layout",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 256",
"multiples of 256 (+1)",
"around n=m=2K",
"multiples of 256 [rotated]",
"multiples of 256 (+1) [rotated]",
"strides (n=2K)"
)
# Defines the test-cases
test_values <- list(
list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)),
list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 101, 16, 256, num_runs, precision)),
list(
c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 8, 102, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"increments/strides for x and y"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8"))
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsymm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsymm"
parameters <- c("-m","-n","-layout","-side","-triangle",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), side (l/r), triangle (up/lo)",
"matrix sizes (m=n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
"col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyr2k routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyr2k"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=1536",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyrk routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyrk"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=2048",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,127 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xtrmm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xtrmm"
parameters <- c("-m","-n","-layout","-side","-triangle","-transA","-diagonal",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), side (l/r), triangle (up/lo), transA (n/y), diag (u/nu)",
"matrix sizes (m=n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:32, c("row,l,up,n,u", "row,l,up,n,nu", "row,l,up,y,u", "row,l,up,y,nu",
"row,r,up,n,u", "row,r,up,n,nu", "row,r,up,y,u", "row,r,up,y,nu",
"row,l,lo,n,u", "row,l,lo,n,nu", "row,l,lo,y,u", "row,l,lo,y,nu",
"row,r,lo,n,u", "row,r,lo,n,nu", "row,r,lo,y,u", "row,r,lo,y,nu",
"col,l,up,n,u", "col,l,up,n,nu", "col,l,up,y,u", "col,l,up,y,nu",
"col,r,up,n,u", "col,r,up,n,nu", "col,r,up,y,u", "col,r,up,y,nu",
"col,l,lo,n,u", "col,l,lo,n,nu", "col,l,lo,y,u", "col,l,lo,y,nu",
"col,r,lo,n,u", "col,r,lo,n,nu", "col,r,lo,y,u", "col,r,lo,y,nu")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -15,108 +15,116 @@
#include <vector>
#include <mutex>
#include "database/database.hpp"
#include "cache.hpp"
namespace clblast {
// =================================================================================================
// Stores the compiled binary or IR in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing binary in cache\n");
#endif
binary_cache_mutex_.lock();
binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
binary_cache_mutex_.unlock();
}
template <typename Key, typename Value>
template <typename U>
Value Cache<Key, Value>::Get(const U &key, bool *in_cache) const {
std::lock_guard<std::mutex> lock(cache_mutex_);
// Stores the compiled program in the cache
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing program in cache\n");
#endif
program_cache_mutex_.lock();
program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
program_cache_mutex_.unlock();
}
// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
// otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving binary from cache\n");
#endif
binary_cache_mutex_.lock();
for (auto &cached_binary: binary_cache_) {
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
binary_cache_mutex_.unlock();
return cached_binary.binary;
#if __cplusplus >= 201402L
// the generalized (heterogeneous-lookup) std::map::find() of C++14
auto it = cache_.find(key);
#else
// O(n) lookup in a vector
auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair<Key, Value> &pair) {
return pair.first == key;
});
#endif
if (it == cache_.end()) {
if (in_cache) {
*in_cache = false;
}
return Value();
}
binary_cache_mutex_.unlock();
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
if (in_cache) {
*in_cache = true;
}
return it->second;
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving program from cache\n");
#endif
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return cached_program.program;
}
template <typename Key, typename Value>
void Cache<Key, Value>::Store(Key &&key, Value &&value) {
std::lock_guard<std::mutex> lock(cache_mutex_);
#if __cplusplus >= 201402L
// emplace() into a map
auto r = cache_.emplace(std::move(key), std::move(value));
if (!r.second) {
throw LogicError("Cache::Store: object already in cache");
}
program_cache_mutex_.unlock();
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
#else
// emplace_back() into a vector
cache_.emplace_back(std::move(key), std::move(value));
#endif
}
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name) {
binary_cache_mutex_.lock();
for (auto &cached_binary: binary_cache_) {
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
binary_cache_mutex_.unlock();
return true;
template <typename Key, typename Value>
void Cache<Key, Value>::Remove(const Key &key) {
std::lock_guard<std::mutex> lock(cache_mutex_);
#if __cplusplus >= 201402L
cache_.erase(key);
#else
auto it = cache_.begin();
while (it != cache_.end()) {
if ((*it).first == key) {
it = cache_.erase(it);
}
else ++it;
}
binary_cache_mutex_.unlock();
return false;
#endif
}
// Queries the cache to see whether or not the compiled kernel is already there
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return true;
template <typename Key, typename Value>
template <int I1, int I2>
void Cache<Key, Value>::RemoveBySubset(const Key &key) {
std::lock_guard<std::mutex> lock(cache_mutex_);
auto it = cache_.begin();
while (it != cache_.end()) {
const auto current_key = (*it).first;
if ((std::get<I1>(key) == std::get<I1>(current_key)) &&
(std::get<I2>(key) == std::get<I2>(current_key))) {
it = cache_.erase(it);
}
else ++it;
}
program_cache_mutex_.unlock();
return false;
}
template <typename Key, typename Value>
void Cache<Key, Value>::Invalidate() {
std::lock_guard<std::mutex> lock(cache_mutex_);
cache_.clear();
}
template <typename Key, typename Value>
Cache<Key, Value> &Cache<Key, Value>::Instance() {
return instance_;
}
template <typename Key, typename Value>
Cache<Key, Value> Cache<Key, Value>::instance_;
// =================================================================================================
// Clears the cache of stored binaries and programs
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
}
template class Cache<BinaryKey, std::string>;
template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
template class Cache<ProgramKey, Program>;
template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
// =================================================================================================
template class Cache<DatabaseKey, Database>;
template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
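To make the refactored API concrete, here is a minimal usage sketch (not part of the source) of the miss-then-store pattern that callers of the generic cache follow; CompileProgram() is a hypothetical stand-in for the actual kernel-compilation step, and the key types follow the definitions in cache.hpp.
// Sketch only: typical miss-then-store usage of the generic cache above.
// CompileProgram() is hypothetical; ProgramKey/ProgramKeyRef come from cache.hpp.
Program GetOrBuildProgram(const Context &context, const Precision precision,
                          const std::string &routine_name) {
  auto in_cache = false;
  auto program = ProgramCache::Instance().Get(
      ProgramKeyRef{context(), precision, routine_name}, &in_cache);
  if (!in_cache) {
    program = CompileProgram(context, precision, routine_name);  // hypothetical
    // Store() moves a freshly constructed key; under the C++14 map path a
    // second Store() of the same key throws LogicError (see above).
    ProgramCache::Instance().Store(ProgramKey{context(), precision, routine_name},
                                   Program(program));
  }
  return program;
}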

View File

@ -15,81 +15,92 @@
#define CLBLAST_CACHE_H_
#include <string>
#include <vector>
#include <mutex>
#include <map>
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
// The cache of compiled OpenCL binaries, along with some meta-data
struct BinaryCache {
std::string binary;
std::string device_name;
Precision precision;
std::string routine_name_;
// The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not
// normally used by the caller, while the Value is either lightweight or ref-counted.
// Hence, searching by non-Key is supported (if there is a corresponding operator<()), and
// on Store() the Key instance is moved from the caller (because it will likely be constructed
// as temporary at the time of Store()).
template <typename Key, typename Value>
class Cache {
public:
// The cached object is returned by value to avoid racing with Invalidate().
// Due to the lack of std::optional<>, in case of a cache miss we return a default-constructed
// Value and set the flag to false.
template <typename U>
Value Get(const U &key, bool *in_cache) const;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
// We do not return references to the just-stored object, to avoid racing with Invalidate().
// The caller is expected to store a temporary.
void Store(Key &&key, Value &&value);
void Invalidate();
// The actual cache, implemented as a vector of the above data-type, and its mutex
static std::vector<BinaryCache> binary_cache_;
static std::mutex binary_cache_mutex_;
// Removes all entries with a given key
void Remove(const Key &key);
template <int I1, int I2> void RemoveBySubset(const Key &key); // currently supports 2 indices
static Cache<Key, Value> &Instance();
private:
#if __cplusplus >= 201402L
// The std::less<void> comparator allows searching the cache by any object comparable with Key,
// without constructing a temporary Key
// (see http://en.cppreference.com/w/cpp/utility/functional/less_void,
// http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm,
// http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind)
std::map<Key, Value, std::less<void>> cache_;
#else
std::vector<std::pair<Key, Value>> cache_;
#endif
mutable std::mutex cache_mutex_;
static Cache<Key, Value> instance_;
}; // class Cache
// =================================================================================================
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
cl_context context;
Precision precision;
std::string routine_name_;
// The key struct for the cache of compiled OpenCL binaries
// Order of fields: precision, routine_name, device_name (smaller fields first)
typedef std::tuple<Precision, std::string, std::string> BinaryKey;
typedef std::tuple<const Precision &, const std::string &, const std::string &> BinaryKeyRef;
// Finds out whether the properties match
bool MatchInCache(const cl_context ref_context, const Precision &ref_precision,
const std::string &ref_routine) {
return (context == ref_context &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
typedef Cache<BinaryKey, std::string> BinaryCache;
// The actual cache, implemented as a vector of the above data-type, and its mutex
static std::vector<ProgramCache> program_cache_;
static std::mutex program_cache_mutex_;
extern template class Cache<BinaryKey, std::string>;
extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
// Stores the compiled binary or program in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name);
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name);
// The key struct for the cache of compiled OpenCL programs (context-dependent)
// Order of fields: context, precision, routine_name (smaller fields first)
typedef std::tuple<cl_context, Precision, std::string> ProgramKey;
typedef std::tuple<const cl_context &, const Precision &, const std::string &> ProgramKeyRef;
// Queries the cache and retrieves a matching binary or program. Assumes that the match is
// available, throws otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name);
typedef Cache<ProgramKey, Program> ProgramCache;
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name);
extern template class Cache<ProgramKey, Program>;
extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================
// Clears the cache of stored binaries
void CacheClearAll();
class Database;
// The key struct for the cache of database maps.
// Order of fields: precision, device_name, kernel_name (smaller fields first)
typedef std::tuple<Precision, std::string, std::string> DatabaseKey;
typedef std::tuple<const Precision &, const std::string &, const std::string &> DatabaseKeyRef;
typedef Cache<DatabaseKey, Database> DatabaseCache;
extern template class Cache<DatabaseKey, Database>;
extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
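As an aside on why the *KeyRef reference-tuples exist: a lookup through a value BinaryKey would construct and copy two std::string fields just to compare them, whereas the reference tuple lets Get() compare in place. A hedged sketch (the helper function itself is hypothetical):
// Sketch only: querying the binary cache without copying the key strings.
// Field order follows BinaryKey: precision, routine_name, device_name.
bool HasBinary(const Precision precision, const std::string &routine_name,
               const std::string &device_name) {
  auto in_cache = false;
  BinaryCache::Instance().Get(BinaryKeyRef{precision, routine_name, device_name},
                              &in_cache);  // the returned binary copy is discarded
  return in_cache;
}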

View File

@ -15,8 +15,8 @@
#include <string>
#include "clblast.h"
#include "cache.hpp"
#include "clblast.h"
// BLAS level-1 includes
#include "routines/level1/xswap.hpp"
@ -45,6 +45,7 @@
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xtrsv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
@ -66,9 +67,12 @@
#include "routines/level3/xsyr2k.hpp"
#include "routines/level3/xher2k.hpp"
#include "routines/level3/xtrmm.hpp"
#include "routines/level3/xtrsm.hpp"
// Level-x includes (non-BLAS)
#include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xaxpybatched.hpp"
#include "routines/levelx/xgemmbatched.hpp"
namespace clblast {
@ -1145,12 +1149,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsv<T>(queue_cpp, event);
routine.DoTrsv(layout, triangle, a_transpose, diagonal,
n,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(x_buffer), x_offset, x_inc);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
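A hedged usage sketch of the newly implemented routine: solving the n-by-n lower-triangular system A*x = b in single precision. It assumes an already initialized OpenCL queue plus device buffers a_buffer (n*n floats, column-major) and x_buffer (n floats, holding b on entry and the solution x on exit); the wrapper function itself is illustrative only.
#include <clblast.h>

clblast::StatusCode SolveLower(const size_t n,
                               cl_mem a_buffer, cl_mem x_buffer,
                               cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsv<float>(
      clblast::Layout::kColMajor, clblast::Triangle::kLower,
      clblast::Transpose::kNo, clblast::Diagonal::kNonUnit,
      n,
      a_buffer, 0, n,   // A, offset, leading dimension
      x_buffer, 0, 1,   // x, offset, increment
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // the routine runs asynchronously
    clReleaseEvent(event);
  }
  return status;
}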
@ -2065,15 +2077,24 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsm<T>(queue_cpp, event);
routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
m, n,
alpha,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(b_buffer), b_offset, b_ld);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@ -2099,12 +2120,6 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@ -2160,16 +2175,222 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = XaxpyBatched<T>(queue_cpp, event);
auto alphas_cpp = std::vector<T>();
auto x_offsets_cpp = std::vector<size_t>();
auto y_offsets_cpp = std::vector<size_t>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
x_offsets_cpp.push_back(x_offsets[batch]);
y_offsets_cpp.push_back(y_offsets[batch]);
}
routine.DoAxpyBatched(n,
alphas_cpp,
Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
const float*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
const double*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
const float2*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
const double2*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
const half*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
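A hedged usage sketch for the batched interface: three AXPY operations in one launch, with all sub-vectors packed into a single x and a single y device buffer and selected via per-batch offsets. Buffer contents and sizes (at least 3*n floats each) are assumed to be set up by the caller.
#include <clblast.h>

clblast::StatusCode RunAxpyBatch(const size_t n,
                                 cl_mem x_buffer, cl_mem y_buffer,
                                 cl_command_queue queue) {
  const size_t batch_count = 3;
  const float alphas[] = {1.0f, 2.0f, 3.0f};   // one alpha per batch
  const size_t x_offsets[] = {0, n, 2 * n};    // sub-vector starting points
  const size_t y_offsets[] = {0, n, 2 * n};
  cl_event event = nullptr;
  return clblast::AxpyBatched<float>(n, alphas,
                                     x_buffer, x_offsets, 1,
                                     y_buffer, y_offsets, 1,
                                     batch_count, &queue, &event);
}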
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = XgemmBatched<T>(queue_cpp, event);
auto alphas_cpp = std::vector<T>();
auto betas_cpp = std::vector<T>();
auto a_offsets_cpp = std::vector<size_t>();
auto b_offsets_cpp = std::vector<size_t>();
auto c_offsets_cpp = std::vector<size_t>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
a_offsets_cpp.push_back(a_offsets[batch]);
b_offsets_cpp.push_back(b_offsets[batch]);
c_offsets_cpp.push_back(c_offsets[batch]);
}
routine.DoGemmBatched(layout, a_transpose, b_transpose,
m, n, k,
alphas_cpp,
Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
betas_cpp,
Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const float*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const double*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float2*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const float2*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double2*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const double2*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const half*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const half*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
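Similarly, a hedged sketch for batched GEMM: batch_count equally-sized column-major matrices packed back-to-back in each buffer, with per-batch offsets stepping by the matrix footprints. The buffers are assumed allocated and filled by the caller; alphas and betas may differ per batch but are kept uniform here.
#include <clblast.h>
#include <vector>

clblast::StatusCode RunGemmBatch(const size_t m, const size_t n, const size_t k,
                                 const size_t batch_count,
                                 cl_mem a, cl_mem b, cl_mem c,
                                 cl_command_queue queue) {
  std::vector<float> alphas(batch_count, 1.0f), betas(batch_count, 0.0f);
  std::vector<size_t> a_offs, b_offs, c_offs;
  for (size_t i = 0; i < batch_count; ++i) {
    a_offs.push_back(i * m * k);  // each A is m-by-k
    b_offs.push_back(i * k * n);  // each B is k-by-n
    c_offs.push_back(i * m * n);  // each C is m-by-n
  }
  cl_event event = nullptr;
  return clblast::GemmBatched<float>(
      clblast::Layout::kColMajor, clblast::Transpose::kNo, clblast::Transpose::kNo,
      m, n, k, alphas.data(),
      a, a_offs.data(), m,   // lda = m for column-major non-transposed A
      b, b_offs.data(), k,   // ldb = k
      c, c_offs.data(), m,   // ldc = m
      batch_count, &queue, &event);
}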
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode ClearCache() {
try {
CacheClearAll();
ProgramCache::Instance().Invalidate();
BinaryCache::Instance().Invalidate();
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
template <typename Real, typename Complex>
void FillCacheForPrecision(Queue &queue) {
try {
// Runs all the level 1 set-up functions
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr);
Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr);
Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr);
Xdot<Real>(queue, nullptr);
Xdotu<Complex>(queue, nullptr);
Xdotc<Complex>(queue, nullptr);
Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr);
Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr);
Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr);
Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr);
Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr);
Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr);
// Runs all the level 2 set-up functions
Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr);
Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr);
Xhemv<Complex>(queue, nullptr);
Xhbmv<Complex>(queue, nullptr);
Xhpmv<Complex>(queue, nullptr);
Xsymv<Real>(queue, nullptr);
Xsbmv<Real>(queue, nullptr);
Xspmv<Real>(queue, nullptr);
Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr);
Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr);
Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr);
Xger<Real>(queue, nullptr);
Xgeru<Complex>(queue, nullptr);
Xgerc<Complex>(queue, nullptr);
Xher<Complex,Real>(queue, nullptr);
Xhpr<Complex,Real>(queue, nullptr);
Xher2<Complex>(queue, nullptr);
Xhpr2<Complex>(queue, nullptr);
Xsyr<Real>(queue, nullptr);
Xspr<Real>(queue, nullptr);
Xsyr2<Real>(queue, nullptr);
Xspr2<Real>(queue, nullptr);
// Runs all the level 3 set-up functions
Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr);
Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr);
Xhemm<Complex>(queue, nullptr);
Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr);
Xherk<Complex,Real>(queue, nullptr);
Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr);
Xher2k<Complex,Real>(queue, nullptr);
Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr);
// Runs all the non-BLAS set-up functions
Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr);
} catch(const RuntimeErrorCode &e) {
if (e.status() != StatusCode::kNoDoublePrecision &&
e.status() != StatusCode::kNoHalfPrecision) {
throw;
}
}
}
// Fills the cache with all binaries for a specific device
// TODO: Add half-precision FP16 set-up calls
StatusCode FillCache(const cl_device_id device) {
@ -2180,58 +2401,52 @@ StatusCode FillCache(const cl_device_id device) {
auto context = Context(device_cpp);
auto queue = Queue(context, device_cpp);
// Runs all the level 1 set-up functions
Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr);
Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr);
Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr);
Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr);
Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr);
Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr);
Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr);
Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr);
Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr);
Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr);
Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr);
Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr);
FillCacheForPrecision<float, float2>(queue);
FillCacheForPrecision<double, double2>(queue);
// Runs all the level 2 set-up functions
Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr);
Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr);
Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr);
Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr);
Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr);
Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr);
Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr);
Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr);
Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr);
Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr);
Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr);
Xger<float>(queue, nullptr); Xger<double>(queue, nullptr);
Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr);
Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr);
Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr);
Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr);
Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr);
Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr);
Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr);
Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr);
Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr);
Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
// Runs all the level 3 set-up functions
Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr);
Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr);
Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr);
Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr);
Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr);
Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr);
Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr);
Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr);
// =================================================================================================
// Runs all the non-BLAS set-up functions
Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr);
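A hedged sketch of how an application might use the pre-compilation entry point: warming the cache once per device at start-up so that the first real routine call does not pay the OpenCL build cost.
#include <clblast.h>
#include <cstdio>

void WarmUp(const cl_device_id device) {
  const auto status = clblast::FillCache(device);  // compiles all kernels once
  if (status != clblast::StatusCode::kSuccess) {
    std::fprintf(stderr, "FillCache failed: %d\n", static_cast<int>(status));
  }
}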
// Overrides the tuning parameters for this device-precision-kernel combination
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters) {
try {
// Retrieves the device name
const auto device_cpp = Device(device);
const auto device_name = device_cpp.Name();
// Retrieves the current database values to verify whether the new ones are complete
auto in_cache = false;
const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision, device_name, kernel_name }, &in_cache);
if (!in_cache) { return StatusCode::kInvalidOverrideKernel; }
for (const auto &current_param : current_database.GetParameterNames()) {
if (parameters.find(current_param) == parameters.end()) {
return StatusCode::kMissingOverrideParameter;
}
}
// Clears the existing program & binary cache for routines with the target kernel
const auto routine_names = Routine::routines_by_kernel.at(kernel_name);
for (const auto &routine_name : routine_names) {
ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, precision, routine_name});
BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name});
}
// Creates a small custom database based on the provided parameters
const auto database_device = Database::DatabaseDevice{"default", parameters};
const auto database_vendor = Database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_device}};
const auto database_entry = Database::DatabaseEntry{kernel_name, precision, {database_vendor}};
const auto database_entries = std::vector<Database::DatabaseEntry>{database_entry};
const auto database = Database(device_cpp, kernel_name, precision, database_entries);
// Removes the old database entry and stores the new one in the cache
DatabaseCache::Instance().Remove(DatabaseKey{ precision, device_name, kernel_name });
DatabaseCache::Instance().Store(DatabaseKey{ precision, device_name, kernel_name }, Database(database));
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
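A hedged usage sketch for the new OverrideParameters entry point. The parameter names below (WGS/WPT/VW for the Xaxpy kernel) follow CLBlast's tuner conventions but should be verified against the database for the kernel being overridden; the values are illustrative, not tuned.
#include <clblast.h>
#include <string>
#include <unordered_map>

clblast::StatusCode UseCustomAxpyParams(const cl_device_id device) {
  const auto parameters = std::unordered_map<std::string, size_t>{
      {"WGS", 64}, {"WPT", 4}, {"VW", 2}};
  // Returns kInvalidOverrideKernel or kMissingOverrideParameter when the
  // kernel is unknown or a required parameter is absent (see above).
  return clblast::OverrideParameters(device, "Xaxpy",
                                     clblast::Precision::kSingle, parameters);
}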

View File

@ -12,12 +12,14 @@
// =================================================================================================
#include <string>
#include <unordered_map>
#include "utilities/utilities.hpp"
#include "clblast_c.h"
#include "clblast.h"
#include "utilities/utilities.hpp"
// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;
@ -3349,27 +3351,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Trsm(static_cast<clblast::Layout>(layout),
static_cast<clblast::Side>(side),
static_cast<clblast::Triangle>(triangle),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Diagonal>(diagonal),
m, n,
alpha,
a_buffer, a_offset, a_ld,
b_buffer, b_offset, b_ld,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
// Extra non-BLAS routines (level-X)
@ -3467,6 +3448,270 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// AXPY
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GEMM
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
auto betas_cpp = std::vector<float>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
auto betas_cpp = std::vector<double>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
auto betas_cpp = std::vector<float2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
betas_cpp.push_back(float2{betas[batch].s[0], betas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
auto betas_cpp = std::vector<double2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
betas_cpp.push_back(double2{betas[batch].s[0], betas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
auto betas_cpp = std::vector<half>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
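// Usage reference (not part of this commit): the same hedged sketch for batched GEMM. Unlike
// regular GEMM, alpha and beta are per-batch arrays here. Assumptions: 'queue' is valid and
// 'a_buf'/'b_buf'/'c_buf' hold batch_count matrices of m*k, k*n and m*n floats, back-to-back.
const size_t m = 64, n = 64, k = 64, batch_count = 8;
std::vector<float> alphas(batch_count, 1.0f);  // per-batch scalars
std::vector<float> betas(batch_count, 0.0f);
std::vector<size_t> a_offsets(batch_count), b_offsets(batch_count), c_offsets(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
  a_offsets[i] = i * m * k;
  b_offsets[i] = i * k * n;
  c_offsets[i] = i * m * n;
}
cl_event event = nullptr;
const auto status = CLBlastSgemmBatched(CLBlastLayoutRowMajor,
                                        CLBlastTransposeNo, CLBlastTransposeNo,
                                        m, n, k,
                                        alphas.data(),
                                        a_buf, a_offsets.data(), k,  // leading dimensions for
                                        b_buf, b_offsets.data(), n,  // row-major storage
                                        betas.data(),
                                        c_buf, c_offsets.data(), n,
                                        batch_count, &queue, &event);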
// =================================================================================================
// Clears the cache of stored binaries
@ -3484,3 +3729,23 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) {
}
// =================================================================================================
// Overrides the tuning parameters for this device-precision-kernel combination
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values) {
try {
const auto kernel_name_cpp = std::string(kernel_name);
const auto precision_cpp = static_cast<clblast::Precision>(precision);
auto parameters = std::unordered_map<std::string, size_t>();
for (auto i = size_t{0}; i < num_parameters; ++i) {
const auto parameter_name = std::string(parameters_names[i]);
const auto parameter_value = parameters_values[i];
parameters[parameter_name] = parameter_value;
}
const auto status = clblast::OverrideParameters(device, kernel_name_cpp, precision_cpp, parameters);
return static_cast<CLBlastStatusCode>(status);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
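// Usage reference (not part of this commit): a hedged sketch of supplying custom tuning
// parameters through this wrapper. The parameter names match the Xaxpy entries in the database
// files below; the values and the 'device' variable are illustrative assumptions only.
const char* names[] = {"VW", "WGS", "WPT"};
const size_t values[] = {4, 128, 2};
const auto status = CLBlastOverrideParameters(device, "Xaxpy", CLBlastPrecisionSingle,
                                              3, names, values);
// From here on, single-precision Xaxpy kernels on this device use these parameters.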
// =================================================================================================

View File

@ -164,6 +164,10 @@ class Platform {
platform_ = platforms[platform_id];
}
// Methods to retrieve platform information
std::string Name() const { return GetInfoString(CL_PLATFORM_NAME); }
std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); }
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = cl_uint{0};
@ -175,6 +179,17 @@ class Platform {
const cl_platform_id& operator()() const { return platform_; }
private:
cl_platform_id platform_;
// Private helper functions
std::string GetInfoString(const cl_platform_info info) const {
auto bytes = size_t{0};
CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr));
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
return result;
}
};
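// Usage reference (hedged sketch, not part of this commit): the new platform-information
// methods combined with the GetAllPlatforms() helper declared just below; assumes <cstdio>.
for (auto &platform : GetAllPlatforms()) {
  printf("%s (%s): %zu device(s)\n", platform.Name().c_str(),
         platform.Vendor().c_str(), platform.NumDevices());
}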
// Retrieves a vector with all platforms
@ -333,7 +348,10 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
context_(new cl_context, [](cl_context* c) {
if (*c) { CheckErrorDtor(clReleaseContext(*c)); }
delete c;
}) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
@ -355,33 +373,37 @@ using ContextPointer = cl_context*;
// Enumeration of build statuses of the run-time compilation process
enum class BuildStatus { kSuccess, kError, kInvalid };
// C++11 version of 'cl_program'. Additionally holds the program's source code.
// C++11 version of 'cl_program'.
class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
Program() = default;
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
explicit Program(const Context &context, const std::string &source):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
const char *source_ptr = &source[0];
size_t length = source.length();
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
const char *binary_ptr = &binary[0];
size_t length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
@ -421,9 +443,6 @@ class Program {
const cl_program& operator()() const { return *program_; }
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};
// =================================================================================================
@ -440,8 +459,10 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
queue_(new cl_command_queue, [](cl_command_queue* s) {
if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); }
delete s;
}) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
@ -594,9 +615,6 @@ class Buffer {
// Copies from host to device: writing the device buffer asynchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
@ -665,7 +683,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
}) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CLError::Check(status, "clCreateKernel");
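// Usage reference (hedged sketch, not part of this commit): the typical flow through these
// wrappers, from device selection to kernel creation. The trivial kernel source is illustrative;
// Build() is assumed to take a vector of compiler options, as used elsewhere in CLBlast.
auto platform = Platform(size_t{0});
auto device = Device(platform, size_t{0});
auto context = Context(device);
auto queue = Queue(context, device);
auto program = Program(context, "__kernel void dummy() { }");
auto options = std::vector<std::string>();
program.Build(device, options);  // throws on compilation failure
auto kernel = Kernel(program, "dummy");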

View File

@ -0,0 +1,70 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides overrides for Apple's OpenCL CPU implementation. It is a special case compared
// to all other implementations, as it only supports a 1-dimensional work-group size. In addition,
// that work-group size is limited to 1024 (in theory) or much lower (kernel resource dependent).
// Thus, instead of supporting this corner-case in the whole regular flow (starting from the tuner),
// we provide this file with some manual overrides.
//
// Note: These overrides are meant to make Apple's CPU implementation work and not crash; they are
// not in any way optimized parameters. For decent speed, don't use Apple's OpenCL CPU implementation.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry XaxpyApple = {
"Xaxpy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW",8}, {"WGS",1}, {"WPT",4} } } } } }
};
const Database::DatabaseEntry XdotApple = {
"Xdot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WGS2",1} } } } } }
};
const Database::DatabaseEntry XgemvApple = {
"Xgemv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WPT1",4}, {"UNROLL1", 1} } } } } }
};
const Database::DatabaseEntry XgemvFastApple = {
"XgemvFast", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW2",1}, {"WGS2",1}, {"WPT2",1} } } } } }
};
const Database::DatabaseEntry XgemvFastRotApple = {
"XgemvFastRot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW3",1}, {"WGS3",1}, {"WPT3",1} } } } } }
};
const Database::DatabaseEntry XgerApple = {
"Xger", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } } } } }
};
const Database::DatabaseEntry XtrsvApple = {
"Xtrsv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRSV_BLOCK_SIZE",32} } } } } }
};
const Database::DatabaseEntry XgemmApple = {
"Xgemm", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWG",1}, {"KWI",1}, {"MDIMA",1}, {"MDIMC",1}, {"MWG",1}, {"NDIMB",1}, {"NDIMC",1}, {"NWG",1}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } } } } }
};
const Database::DatabaseEntry XgemmDirectApple = {
"XgemmDirect", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWID",1}, {"MDIMAD",1}, {"MDIMCD",1}, {"NDIMBD",1}, {"NDIMCD",1}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",1} } } } } }
};
const Database::DatabaseEntry CopyApple = {
"Copy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"COPY_DIMX",1}, {"COPY_DIMY",1}, {"COPY_VW",1}, {"COPY_WPT",1} } } } } }
};
const Database::DatabaseEntry PadApple = {
"Pad", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PAD_DIMX",1}, {"PAD_DIMY",1}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } } } } }
};
const Database::DatabaseEntry TransposeApple = {
"Transpose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRA_DIM",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } } } } }
};
const Database::DatabaseEntry PadtransposeApple = {
"Padtranspose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",1}, {"PADTRA_WPT",1} } } } } }
};
const Database::DatabaseEntry InvertApple = {
"Invert", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"INTERNAL_BLOCK_SIZE",16} } } } } }
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -11,6 +11,8 @@
//
// =================================================================================================
#include <list>
#include "utilities/utilities.hpp"
#include "database/database.hpp"
@ -20,35 +22,47 @@
#include "database/kernels/xgemv_fast.hpp"
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xtrsv.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernels/invert.hpp"
#include "database/apple_cpu_fallback.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
// Initializes the database
const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
&database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
&database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
// Initializes the databases
const std::vector<Database::DatabaseEntry> Database::database = std::vector<Database::DatabaseEntry>{
database::XaxpyHalf, database::XaxpySingle, database::XaxpyDouble, database::XaxpyComplexSingle, database::XaxpyComplexDouble,
database::XdotHalf, database::XdotSingle, database::XdotDouble, database::XdotComplexSingle, database::XdotComplexDouble,
database::XgemvHalf, database::XgemvSingle, database::XgemvDouble, database::XgemvComplexSingle, database::XgemvComplexDouble,
database::XgemvFastHalf, database::XgemvFastSingle, database::XgemvFastDouble, database::XgemvFastComplexSingle, database::XgemvFastComplexDouble,
database::XgemvFastRotHalf, database::XgemvFastRotSingle, database::XgemvFastRotDouble, database::XgemvFastRotComplexSingle, database::XgemvFastRotComplexDouble,
database::XgerHalf, database::XgerSingle, database::XgerDouble, database::XgerComplexSingle, database::XgerComplexDouble,
database::XtrsvHalf, database::XtrsvSingle, database::XtrsvDouble, database::XtrsvComplexSingle, database::XtrsvComplexDouble,
database::XgemmHalf, database::XgemmSingle, database::XgemmDouble, database::XgemmComplexSingle, database::XgemmComplexDouble,
database::XgemmDirectHalf, database::XgemmDirectSingle, database::XgemmDirectDouble, database::XgemmDirectComplexSingle, database::XgemmDirectComplexDouble,
database::CopyHalf, database::CopySingle, database::CopyDouble, database::CopyComplexSingle, database::CopyComplexDouble,
database::PadHalf, database::PadSingle, database::PadDouble, database::PadComplexSingle, database::PadComplexDouble,
database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,
database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble,
database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble,
database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble
};
const std::vector<Database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<Database::DatabaseEntry>{
database::XaxpyApple, database::XdotApple,
database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
database::XgemmApple, database::XgemmDirectApple,
database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
database::InvertApple
};
// The OpenCL device vendors
// The default values
const std::string Database::kDeviceVendorAll = "default";
// Alternative names for some OpenCL vendors
@ -63,12 +77,11 @@ const std::unordered_map<std::string, std::string> Database::kVendorNames{
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
parameters_{} {
Database::Database(const Device &device, const std::string &kernel_name,
const Precision precision, const std::vector<DatabaseEntry> &overlay):
parameters_(std::make_shared<Parameters>()) {
// Finds information of the current device
auto device = queue.GetDevice();
auto device_type = device.Type();
auto device_vendor = device.Vendor();
auto device_name = device.Name();
@ -80,20 +93,31 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
}
}
// Iterates over all kernels to include, and retrieves the parameters for each of them
for (auto &kernel: kernels) {
auto search_result = ParametersPtr{};
// Sets the databases to search through
auto databases = std::list<std::vector<DatabaseEntry>>{overlay, database};
for (auto &db: { database, overlay}) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
// Special case: modifies the database if the device is a CPU with Apple OpenCL
#if defined(__APPLE__) || defined(__MACOSX)
if (device.Type() == "CPU") {
auto extensions = device.Capabilities();
const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") != std::string::npos);
if (is_apple) {
databases.push_front(apple_cpu_fallback);
}
}
#endif
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
// Searches potentially multiple databases
auto search_result = ParametersPtr{};
for (auto &db: databases) {
search_result = Search(kernel_name, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_->insert(search_result->begin(), search_result->end());
break;
}
}
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
// =================================================================================================
@ -101,12 +125,21 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
// Returns a list of OpenCL pre-processor defines in string form
std::string Database::GetDefines() const {
std::string defines{};
for (auto &parameter: parameters_) {
for (auto &parameter: *parameters_) {
defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n";
}
return defines;
}
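// For illustration (hedged; map iteration order is unspecified): with parameters
// { {"COPY_DIMX",16}, {"COPY_WPT",4} } the returned string contains the lines
//   #define COPY_DIMX 16
//   #define COPY_WPT 4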
// Retrieves the names of all the parameters
std::vector<std::string> Database::GetParameterNames() const {
auto parameter_names = std::vector<std::string>();
for (auto &parameter: *parameters_) {
parameter_names.push_back(parameter.first);
}
return parameter_names;
}
// =================================================================================================
// Searches a particular database for the right kernel and precision
@ -115,15 +148,16 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
const std::vector<const DatabaseEntry*> &this_database) const {
const std::vector<DatabaseEntry> &this_database) const {
// Selects the right kernel
for (auto &db: this_database) {
if (db->kernel == this_kernel && db->precision == this_precision) {
if ((db.kernel == this_kernel) &&
(db.precision == this_precision || db.precision == Precision::kAny)) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db->vendors) {
for (auto &vendor: db.vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {

View File

@ -70,27 +70,61 @@ class Database {
static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
static const std::vector<const DatabaseEntry*> database;
static const std::vector<DatabaseEntry> database;
// Database for a special case: Apple CPUs support only a limited number of threads
static const std::vector<DatabaseEntry> apple_cpu_fallback;
Database() = default;
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
explicit Database(const Device &device, const std::string &kernel_name,
const Precision precision, const std::vector<DatabaseEntry> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
size_t operator[](const std::string &key) const { return parameters_->find(key)->second; }
bool exists(const std::string &key) const { return (parameters_->count(key) == 1); }
// Obtain a list of OpenCL pre-processor defines based on the parameters
std::string GetDefines() const;
// Retrieves the names of all the parameters
std::vector<std::string> GetParameterNames() const;
private:
// Search method for a specified database, returning a pointer (possibly a nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision,
const std::vector<const DatabaseEntry*> &db) const;
const std::vector<DatabaseEntry> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;
std::shared_ptr<Parameters> parameters_;
};
// =================================================================================================
// Multiple databases together in a map
class Databases {
public:
explicit Databases(const std::vector<std::string> &kernel_names): kernel_names_(kernel_names) { }
// Database accessor
Database& operator()(const std::string &kernel_name) { return databases_[kernel_name]; }
// Retrieves a parameter from the database
size_t operator[](const std::string &key) const {
for (const auto &kernel_name : kernel_names_) {
const auto &kernel_db = databases_.find(kernel_name)->second;
if (kernel_db.exists(key)) { return kernel_db[key]; }
}
throw RuntimeErrorCode(StatusCode::kDatabaseError);
}
private:
const std::vector<std::string> kernel_names_;
std::unordered_map<std::string, Database> databases_;
};
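// Usage reference (hedged sketch, not part of this commit): a routine built from several
// kernels can group their databases and resolve any parameter without knowing which kernel
// defines it. The 'device', 'precision' and 'overlay' variables are assumptions.
auto dbs = Databases({"Xgemm", "Pad"});
dbs("Xgemm") = Database(device, "Xgemm", precision, overlay);
dbs("Pad") = Database(device, "Pad", precision, overlay);
const auto mwg = dbs["MWG"];            // resolved from the Xgemm database
const auto pad_dimx = dbs["PAD_DIMX"];  // falls through to the Pad database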
// =================================================================================================

View File

@ -22,13 +22,12 @@ const Database::DatabaseEntry KernelSelectionHalf = {
"KernelSelection", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -45,13 +44,12 @@ const Database::DatabaseEntry KernelSelectionSingle = {
"KernelSelection", Precision::kSingle, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -68,13 +66,12 @@ const Database::DatabaseEntry KernelSelectionComplexSingle = {
"KernelSelection", Precision::kComplexSingle, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -91,13 +88,12 @@ const Database::DatabaseEntry KernelSelectionDouble = {
"KernelSelection", Precision::kDouble, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -114,13 +110,12 @@ const Database::DatabaseEntry KernelSelectionComplexDouble = {
"KernelSelection", Precision::kComplexDouble, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default

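// Interpretation note (hedged, names simplified): XGEMM_MIN_INDIRECT_SIZE acts as a threshold
// on the total problem size m*n*k. Below it the direct GEMM kernel is selected; above it the
// indirect (pad/transpose-based) kernel is used. A sketch of that selection logic:
bool UseDirectKernel(const size_t m, const size_t n, const size_t k,
                     const size_t min_indirect_size) {
  // For small problems the cost of padding/transposing into the indirect kernel's
  // layout outweighs its higher raw throughput.
  return (m * n * k) < min_indirect_size;
}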
View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry CopyHalf = {
"Copy", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry CopyHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry CopySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "ATI Radeon HD 6750M", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
@ -55,10 +64,12 @@ const Database::DatabaseEntry CopySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
}
},
{ // Intel GPUs
@ -83,6 +94,7 @@ const Database::DatabaseEntry CopySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry CopySingle = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // Default
@ -112,18 +125,23 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "ATI Radeon HD 6750M", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
@ -150,6 +168,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@ -157,14 +176,15 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@ -177,12 +197,13 @@ const Database::DatabaseEntry CopyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
}
},
{ // ARM GPUs
@ -193,10 +214,12 @@ const Database::DatabaseEntry CopyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -209,6 +232,7 @@ const Database::DatabaseEntry CopyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
@ -218,14 +242,15 @@ const Database::DatabaseEntry CopyDouble = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
}
@ -238,6 +263,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -254,10 +280,12 @@ const Database::DatabaseEntry CopyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -270,6 +298,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -279,6 +308,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },

View File

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// Tuning parameters for the diagonal matrix inversion kernels
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry InvertHalf = {
"Invert", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertSingle = {
"Invert", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexSingle = {
"Invert", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertDouble = {
"Invert", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexDouble = {
"Invert", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry PadHalf = {
"Pad", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@ -39,12 +45,15 @@ const Database::DatabaseEntry PadSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "ATI Radeon HD 6750M", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Turks", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -55,8 +64,10 @@ const Database::DatabaseEntry PadSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry PadSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry PadSingle = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Default
@ -112,12 +125,15 @@ const Database::DatabaseEntry PadComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "ATI Radeon HD 6750M", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Turks", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -128,10 +144,12 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
@ -156,6 +174,7 @@ const Database::DatabaseEntry PadComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -172,7 +192,7 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
}
@ -185,12 +205,13 @@ const Database::DatabaseEntry PadDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -201,8 +222,10 @@ const Database::DatabaseEntry PadDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
@ -217,6 +240,7 @@ const Database::DatabaseEntry PadDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -226,6 +250,7 @@ const Database::DatabaseEntry PadDouble = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -246,6 +271,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -262,10 +288,12 @@ const Database::DatabaseEntry PadComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@ -278,6 +306,7 @@ const Database::DatabaseEntry PadComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
@ -39,11 +45,14 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Ellesmere", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Turks", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -55,8 +64,10 @@ const Database::DatabaseEntry PadtransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -92,6 +104,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -112,11 +125,14 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Turks", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -128,10 +144,12 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
},
{ // Intel GPUs
@ -156,6 +174,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -185,6 +205,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
@ -201,8 +222,10 @@ const Database::DatabaseEntry PadtransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
@ -217,6 +240,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -226,6 +250,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -233,7 +258,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
}
@ -246,6 +271,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
@ -262,10 +288,12 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -278,6 +306,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry TransposeHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry TransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "ATI Radeon HD 6750M", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -55,8 +64,10 @@ const Database::DatabaseEntry TransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry TransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "GeForce GTX 1080", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -92,6 +104,7 @@ const Database::DatabaseEntry TransposeSingle = {
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -112,12 +125,15 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Turks", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -128,8 +144,10 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@ -150,6 +168,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1080", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -159,6 +178,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -166,7 +186,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@ -179,6 +199,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -195,10 +216,12 @@ const Database::DatabaseEntry TransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel accelerators
@ -211,6 +234,7 @@ const Database::DatabaseEntry TransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -220,6 +244,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -240,6 +265,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -256,16 +282,19 @@ const Database::DatabaseEntry TransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -275,6 +304,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry XaxpyHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",256}, {"WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry XaxpySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "Ellesmere", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -55,10 +64,12 @@ const Database::DatabaseEntry XaxpySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
}
},
{ // Intel GPUs
@ -83,6 +94,7 @@ const Database::DatabaseEntry XaxpySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry XaxpySingle = {
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Default
@ -112,11 +125,14 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Ellesmere", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -128,8 +144,10 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
}
@ -156,6 +174,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
@ -185,6 +205,7 @@ const Database::DatabaseEntry XaxpyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Ellesmere", { {"VW",2}, {"WGS",64}, {"WPT",4} } },
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -201,10 +222,12 @@ const Database::DatabaseEntry XaxpyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -217,6 +240,7 @@ const Database::DatabaseEntry XaxpyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -226,14 +250,15 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
}
@ -246,6 +271,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Ellesmere", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -262,8 +288,10 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
}
@ -278,6 +306,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -294,7 +324,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
}
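For the Xaxpy entries above, VW is the vector width, WGS the work-group size, and WPT the number of (vectorized) operations per thread, so each work-item handles VW*WPT scalars. A minimal sketch of the resulting launch size under that assumption (XaxpyGlobalSize is a hypothetical name):

#include <cstddef>

// Work-items to launch for an AXPY over 'n' elements, assuming each thread
// processes VW*WPT scalars; rounded up to a whole number of work-groups.
size_t XaxpyGlobalSize(const size_t n, const size_t vw,
                       const size_t wgs, const size_t wpt) {
  const size_t per_thread = vw * wpt;                  // scalars per work-item
  const size_t threads = (n + per_thread - 1) / per_thread;
  return ((threads + wgs - 1) / wgs) * wgs;            // pad to full groups
}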


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XdotHalf = {
"Xdot", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
@ -39,17 +45,22 @@ const Database::DatabaseEntry XdotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
{ "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",32} } },
{ "Ellesmere", { {"WGS1",128}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "Turks", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@ -67,6 +78,7 @@ const Database::DatabaseEntry XdotSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX 1080", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
@ -75,13 +87,14 @@ const Database::DatabaseEntry XdotSingle = {
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",1024}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -94,17 +107,22 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",256} } },
{ "Ellesmere", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "Turks", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",64} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@ -122,6 +140,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@ -130,13 +149,14 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
}
@ -149,6 +169,7 @@ const Database::DatabaseEntry XdotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
{ "Ellesmere", { {"WGS1",128}, {"WGS2",64} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
@ -158,14 +179,17 @@ const Database::DatabaseEntry XdotDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@ -174,8 +198,9 @@ const Database::DatabaseEntry XdotDouble = {
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",128} } },
}
},
{ // Default
@ -193,6 +218,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Ellesmere", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
@ -202,14 +228,17 @@ const Database::DatabaseEntry XdotComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
@ -218,13 +247,14 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",64} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
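The WGS1/WGS2 pairs in the Xdot entries above are consistent with a two-stage reduction: a first kernel whose work-groups of WGS1 threads each emit one partial sum, and a second kernel in which a single group of WGS2 threads folds the partials into the final dot product. A hedged host-side outline under the further assumption that the first stage launches WGS2 work-groups (hypothetical names, not the CLBlast routine code):

#include <cstddef>

// Hypothetical two-stage launch sizes consistent with the WGS1/WGS2 pairs
// above (assumption: stage 1 runs WGS2 work-groups of WGS1 threads each).
struct DotLaunch {
  size_t stage1_global, stage1_local;  // partial sums: one per work-group
  size_t stage2_global, stage2_local;  // final reduction in a single group
};

DotLaunch XdotLaunch(const size_t wgs1, const size_t wgs2) {
  DotLaunch l;
  l.stage1_local  = wgs1;
  l.stage1_global = wgs1 * wgs2;  // WGS2 groups -> WGS2 partial results
  l.stage2_local  = wgs2;
  l.stage2_global = wgs2;         // one work-group reduces all partials
  return l;
}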


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
@ -38,12 +44,15 @@ const Database::DatabaseEntry XgemmSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",8} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Turks", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -54,10 +63,12 @@ const Database::DatabaseEntry XgemmSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel GPUs
@ -82,6 +93,7 @@ const Database::DatabaseEntry XgemmSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
@ -91,9 +103,10 @@ const Database::DatabaseEntry XgemmSingle = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Default
@ -111,12 +124,15 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "Turks", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -127,10 +143,12 @@ const Database::DatabaseEntry XgemmComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // Intel GPUs
@ -155,6 +173,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1080", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
@ -164,6 +183,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -184,6 +204,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
@ -200,10 +221,12 @@ const Database::DatabaseEntry XgemmDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Intel accelerators
@ -216,6 +239,7 @@ const Database::DatabaseEntry XgemmDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@ -225,14 +249,15 @@ const Database::DatabaseEntry XgemmDouble = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -245,12 +270,13 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -261,10 +287,12 @@ const Database::DatabaseEntry XgemmComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
@ -277,6 +305,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -285,14 +314,15 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
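Note: each database entry above follows the same pattern: parameters are grouped per device type and vendor, then per device name, with a per-vendor "default" row for untuned devices and a final vendor-independent default as a last resort. A minimal C++ sketch of that two-level fallback (illustrative only; the names and types here are hypothetical, not CLBlast's internal API):

#include <cstddef>
#include <map>
#include <string>
#include <vector>

using Parameters = std::map<std::string, std::size_t>;
struct DeviceEntry { std::string name; Parameters parameters; };
struct VendorEntry { std::string type; std::string vendor; std::vector<DeviceEntry> devices; };

// Returns the tuned parameters for a device, falling back to the vendor's "default" row.
Parameters Lookup(const std::vector<VendorEntry> &entries, const std::string &type,
                  const std::string &vendor, const std::string &device) {
  for (const auto &v : entries) {
    if ((v.type == type || v.type == "all") && v.vendor == vendor) {
      for (const auto &d : v.devices) { if (d.name == device) { return d.parameters; } }
      for (const auto &d : v.devices) { if (d.name == "default") { return d.parameters; } }
    }
  }
  return Parameters{};  // in practice the vendor-independent "default" entry catches this case
}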

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
@ -25,7 +31,7 @@ const Database::DatabaseEntry XgemmDirectHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
@ -38,8 +44,18 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "ATI Radeon HD 6750M", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Ellesmere", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",8}, {"WGD",64} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",64} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
}
},
{ // Intel GPUs
@ -51,9 +67,11 @@ const Database::DatabaseEntry XgemmDirectSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "TITAN X (Pascal)", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default
@ -71,10 +89,19 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "ATI Radeon HD 6750M", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
@ -84,14 +111,16 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
}
@ -104,20 +133,30 @@ const Database::DatabaseEntry XgemmDirectDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Ellesmere", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
}
},
}
@ -130,20 +169,30 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Ellesmere", { {"KWID",16}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
}
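For context on how such compile-time parameters are consumed: they are typically turned into preprocessor defines prepended to the OpenCL kernel source before compilation, so each value above (KWID, WGD, and so on) becomes a #define. A hedged C++ sketch of that step (the function name is illustrative, not CLBlast's actual code):

#include <cstddef>
#include <map>
#include <sstream>
#include <string>

// Serializes a parameter map into a block of preprocessor defines for the kernel compiler.
std::string AsDefines(const std::map<std::string, std::size_t> &parameters) {
  std::ostringstream result;
  for (const auto &parameter : parameters) {
    result << "#define " << parameter.first << " " << parameter.second << "\n";
  }
  return result.str();  // e.g. "#define KWID 2\n#define WGD 32\n..."
}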

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
@ -39,18 +45,23 @@ const Database::DatabaseEntry XgemvSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",32}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",256}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "Turks", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",128}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
@ -105,19 +118,24 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "Turks", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",2} } },
}
},
{ // Intel GPUs
@ -142,6 +160,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
@ -149,6 +168,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -167,6 +187,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
@ -177,8 +198,10 @@ const Database::DatabaseEntry XgemvDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@ -192,6 +215,7 @@ const Database::DatabaseEntry XgemvDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
@ -201,6 +225,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
@ -221,6 +246,7 @@ const Database::DatabaseEntry XgemvComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
@ -231,8 +257,10 @@ const Database::DatabaseEntry XgemvComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
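As an aside on the Xgemv parameter names: WGS1 sets the work-group size and WPT1 the amount of work per thread, so the launch geometry shrinks by a factor WPT1 and rounds up to a multiple of WGS1. A small, hypothetical sketch of that computation (not taken from CLBlast's sources):

#include <cstddef>

// Global work size for a kernel where each of WGS1 threads per group covers WPT1 rows.
std::size_t GlobalSize(std::size_t m, std::size_t wgs1, std::size_t wpt1) {
  const std::size_t threads = (m + wpt1 - 1) / wpt1;  // one thread per WPT1 rows
  return ((threads + wgs1 - 1) / wgs1) * wgs1;        // rounded up to the work-group size
}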

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
@ -39,19 +45,24 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "ATI Radeon HD 6750M", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "Turks", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",2}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel GPUs
@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
}
},
@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1080", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -105,18 +118,23 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "ATI Radeon HD 6750M", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Turks", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",4}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
@ -163,6 +181,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -173,8 +192,10 @@ const Database::DatabaseEntry XgemvFastDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
@ -188,6 +209,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1080", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@ -197,6 +219,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -217,6 +240,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -227,9 +251,11 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",4}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",2} } },
{ "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel accelerators

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
@ -38,14 +44,19 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@ -54,21 +65,23 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
}
@ -81,14 +94,19 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",32}, {"WPT3",8} } },
{ "Ellesmere", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@ -103,7 +121,7 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}
@ -116,21 +134,26 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
@ -149,19 +172,22 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgerHalf = {
"Xger", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry XgerHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry XgerSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",4} } },
{ "Ellesmere", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -55,7 +64,9 @@ const Database::DatabaseEntry XgerSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } },
}
@ -75,6 +86,7 @@ const Database::DatabaseEntry XgerSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
@ -82,6 +94,7 @@ const Database::DatabaseEntry XgerSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",512}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
@ -100,12 +113,15 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -116,9 +132,11 @@ const Database::DatabaseEntry XgerComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // Intel GPUs
@ -136,6 +154,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@ -143,6 +162,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
@ -161,12 +181,13 @@ const Database::DatabaseEntry XgerDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -177,15 +198,18 @@ const Database::DatabaseEntry XgerDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
@ -193,7 +217,8 @@ const Database::DatabaseEntry XgerDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@ -211,6 +236,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
@ -227,7 +253,9 @@ const Database::DatabaseEntry XgerComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",512}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
@ -236,6 +264,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",8}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
@ -243,6 +272,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},

View File

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry XtrsvHalf = {
"Xtrsv", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvSingle = {
"Xtrsv", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexSingle = {
"Xtrsv", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvDouble = {
"Xtrsv", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexDouble = {
"Xtrsv", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -31,9 +31,7 @@ R"(
// Enable support for double-precision
#if PRECISION == 64 || PRECISION == 6464
#if __OPENCL_VERSION__ <= CL_VERSION_1_1
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// Half-precision
@ -71,7 +69,7 @@ R"(
// Complex single-precision
#elif PRECISION == 3232
typedef struct cfloat {float x; float y;} real;
typedef float2 real;
typedef struct cfloat2 {real x; real y;} real2;
typedef struct cfloat4 {real x; real y; real z; real w;} real4;
typedef struct cfloat8 {real s0; real s1; real s2; real s3;
@ -86,7 +84,7 @@ R"(
// Complex double-precision
#elif PRECISION == 6464
typedef struct cdouble {double x; double y;} real;
typedef double2 real;
typedef struct cdouble2 {real x; real y;} real2;
typedef struct cdouble4 {real x; real y; real z; real w;} real4;
typedef struct cdouble8 {real s0; real s1; real s2; real s3;
@ -162,6 +160,13 @@ R"(
#define AbsoluteValue(value) value = fabs(value)
#endif
// Negation (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
#else
#define Negate(value) value = -(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@ -169,6 +174,13 @@ R"(
#define Add(c, a, b) c = a + b
#endif
// Subtracts two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
#else
#define Subtract(c, a, b) c = a - b
#endif
// Multiplies two complex variables (used in the defines below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@ -193,6 +205,20 @@ R"(
#endif
#endif
// The scalar multiply-subtract function
#if PRECISION == 3232 || PRECISION == 6464
#define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
#else
#define MultiplySubtract(c, a, b) c -= a * b
#endif
// The scalar division function: full division
#if PRECISION == 3232 || PRECISION == 6464
#define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
#else
#define DivideFull(c, a, b) c = a / b
#endif
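// Worked example (illustration only): dividing a = 1 + 2i by b = 3 + 4i with DivideFull gives
// num_x = 1*3 + 2*4 = 11, num_y = 2*3 - 1*4 = 2 and denom = 3*3 + 4*4 = 25, hence
// c = 0.44 + 0.08i, which equals (a * conj(b)) / |b|^2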
// The scalar AXPBY function
#if PRECISION == 3232 || PRECISION == 6464
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)

View File

@ -9,7 +9,7 @@
//
// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
// support vector data-types.
// support vector data-types. The general version has a batched implementation as well.
//
// This kernel uses the level-1 BLAS common tuning parameters.
//
@ -36,14 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
}
}
// =================================================================================================
// Faster version of the kernel without offsets and strided accesses, but with an if-statement.
// Also assumes that 'n' is divisible by 'VW' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyFaster(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
if (get_global_id(0) < n / (VW)) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
}
}
}
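// Note (host-side sketch): each work-item handles WPT vectors of width VW, so a global size of
// n / (VW * WPT) covers the full vector; e.g. n = 1024 with VW = 4 and WPT = 2 gives 128
// work-items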
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyFast(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
void XaxpyFastest(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
@ -57,6 +74,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha,
// =================================================================================================
// Full version of the kernel with offsets and strided accesses: batched version
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
__global real* ygm, const __constant int* y_offsets, const int y_inc) {
const int batch = get_group_id(1);
const real alpha = GetRealArg(arg_alphas[batch]);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offsets[batch]];
MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue);
}
}
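// Note (launch sketch): the host is expected to enqueue this kernel with the batch count as the
// second NDRange dimension; work-group b then reads its alpha from arg_alphas[b] and its offsets
// from x_offsets[b] and y_offsets[b]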
// =================================================================================================
// End of the C++11 raw string literal
)"

View File

@ -0,0 +1,144 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to perform forward or backward substitution, as used in the TRSV routine.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int tid = get_global_id(0);
if (tid < n) {
dest[tid*inc + offset] = value;
}
}
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRSV_BLOCK_SIZE
#define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substitution
#endif
// =================================================================================================
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_forward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < i; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
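// Note: the single-threaded loop above implements the forward-substitution recurrence
//   x[i] = (rhs[i] - sum_{j<i} A[i][j] * x[j]) / A[i][i], with rhs = b - x
// Worked example (values for illustration): for A = [[2, 0], [1, 4]] and rhs = [6, 9], it yields
// x[0] = 6/2 = 3 and x[1] = (9 - 1*3)/4 = 1.5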
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_backward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = n - 1; i >= 0; --i) {
for (int j = i + 1; j < n; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
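// Note: this mirrors trsv_forward, implementing the backward-substitution recurrence
//   x[i] = (rhs[i] - sum_{j>i} A[i][j] * x[j]) / A[i][i], evaluated from i = n-1 down to 0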
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -24,16 +24,14 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
inline void _CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int do_conjugate) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -60,22 +58,36 @@ void CopyPadMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
_CopyPadMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// =================================================================================================
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
inline void _CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -105,6 +117,62 @@ void CopyMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
_CopyMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, upper, lower, diagonal_imag_zero);
}
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest,
const int do_conjugate) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
_CopyPadMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
_CopyMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, 0, 0, 0);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -0,0 +1,431 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to invert square diagonal blocks of a matrix. These kernels are based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
//
// Let A be a block_size*block_size lower triangular matrix, and B its inverse.
// Then the block decomposition
//
// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
//
// yields
//
// A11*B11 = I ==> B11 = A11^{-1},
// A22*B22 = I ==> B22 = A22^{-1},
// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
//
// The InvertDiagonalBlock kernel inverts A11 and A22.
// The TripleMatMul routines multiply:
// part 1: B21 = A21 * B11,
// part 2: B21 = -B22 * B21.
//
// At this level, the inner block is current_size = 16, with one 4 x 4 work-group per inner block.
// Each submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied
// by 2 at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
// here current_size*2 = 32, which contains
//   [ B11  0  ]
//   [ B21 B22 ]
// Outer blocks are block_size x block_size.
//
// A21 may have fewer than current_size rows, but is guaranteed to have current_size columns since
// A22 is on its right. This makes a single bounds check sufficient.
//
// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
//
// The computation is split into part 1 and part 2 to synchronize all work-groups and to make sure
// that writes to B21 (or B12 in the upper-triangular case) are observed by all of them.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
#define LOCALX 17 // 16 + 1 to avoid bank conflicts
#define LOCALY 16
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
{
const int thread_index = get_local_id(0);
const int block_index = get_group_id(0);
// Sets the offset for this particular block in the source and destination matrices
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
// Sets the destination offset: first go to the (block_index / num_inner_blocks)'th outer block of
// outer_block_size * outer_block_size elements, then to the (block_index % num_inner_blocks)'th
// inner INTERNAL_BLOCK_SIZE * INTERNAL_BLOCK_SIZE block on its diagonal
const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size +
                              (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE);
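// Example: with outer_block_size = 32, num_inner_blocks = 2; block_index = 3 then maps to outer
// block 1 and inner block 1, giving dest_block_offset = 1*32*32 + 1*(32*16 + 16) = 1552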
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
(thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
if (condition) {
lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
}
else {
SetToZero(lm[thread_index][j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);
if (unit_diagonal == 0) {
const real diagonal_value = lm[thread_index][thread_index];
if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
real constant_one;
SetToOne(constant_one);
DivideFull(inverted_diagonal, constant_one, diagonal_value);
}
}
lm[thread_index][thread_index] = inverted_diagonal;
barrier(CLK_LOCAL_MEM_FENCE);
// Upper-triangular
if (is_upper) {
// Computes the elements 0:j-1 of the j-th column
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
if (thread_index < j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Lower triangular
else {
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
if (thread_index > j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Writes the result to global memory
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
}
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
__global const real* agm, __global const real* bgm, __global real* cgm,
const int lda, const int ldb, const int ldc,
int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
const int page = get_group_id(1) % num_pages;
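// Example: with num_pages = 4, get_group_id(1) = 9 decodes to by = 2 and page = 1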
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
int col = page*current_size*2 + current_size;
// Sets the offsets for this specific thread
agm += ibx + id;
bgm += lidx + (iby + lidy)*ldb;
cgm += ibx + id + iby*ldc;
// Initializes the result registers
real cpm[16];
#pragma unroll
for (int j = 0; j < 16; ++j) {
SetToZero(cpm[j]);
}
// Computes an NT x 16 block of C; each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
#pragma unroll
for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
#pragma unroll
for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Upper triangular
if (upper) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2 || col++ < n) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
// Lower triangular
else {
if (row < n) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the NT x 16 results: each thread writes back one 1 x 16 row
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2) { Negate(cpm[i]); }
cgm[0] = cpm[i];
cgm += ldc;
}
}
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
__global const real* src, const int a_offset, const int lda,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
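// Example: with block_size = 64 and current_size = 16, pages_per_block = 2; page = 3 then selects
// outer block 1 and inner page 1, advancing dest by 1*64*64 + 1*(32*64 + 32) = 6176 elements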
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = A12 * B22
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
bgm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size*block_size; // B12
}
else { // lower triangular: B21 = A21 * B11
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
bgm = dest; // B11
cgm = dest + current_size; // B21
}
// Runs the generic C = A * B matrix multiplication
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = -B11 * B12
agm = dest; // B11
cgm = dest + current_size*block_size; // B12
bgm = cgm; // B12, okay to overwrite
}
else { // lower triangular: B21 = -B22 * B21
agm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size; // B21
bgm = cgm; // B21, okay to overwrite
}
// Runs the generic C = A * B matrix multiplication
const int lda = block_size;
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// =================================================================================================
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -73,6 +73,22 @@ R"(
#define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#endif
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int id_one = get_global_id(0);
const int id_two = get_global_id(1);
if (id_one < m && id_two < n) {
dest[id_two*ld + id_one + offset] = value;
}
}
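// Note (host-side launch sketch): to fill an m x n = 100 x 50 matrix, the global size is rounded
// up to multiples of the 8 x 8 work-group size, i.e. (104, 56); out-of-range work-items are
// masked off by the bounds check above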
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -24,19 +24,15 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
inline void _TransposePadMatrix(__local real* tile,
const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int do_conjugate) {
// Loop over the work per thread
#pragma unroll
@ -56,7 +52,9 @@ void TransposePadMatrix(const int src_one, const int src_two,
if (id_src_two < src_two && id_src_one < src_one) {
value = src[id_src_two*src_ld + id_src_one + src_offset];
}
tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two;
tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value;
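// Example: assuming PADTRA_TILE = 8, PADTRA_WPT = 2 and PADTRA_PAD = 1 (all tuner-dependent),
// the row stride is 2*8 + 1 = 17, so (tile_id1, tile_id0) = (3, 5) maps to element 3*17 + 5 = 56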
}
}
@ -75,7 +73,9 @@ void TransposePadMatrix(const int src_one, const int src_two,
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two;
real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0];
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
@ -83,25 +83,38 @@ void TransposePadMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// =================================================================================================
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
inline void _TransposeMatrix(__local real* tile,
const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loop over the work per thread
#pragma unroll
@ -117,7 +130,9 @@ void TransposeMatrix(const int src_one, const int src_two,
// Loads data into the local memory if the thread IDs are within bounds of the source matrix.
if ((id_src_one < src_one) && (id_src_two < src_two)) {
real value = src[id_src_two*src_ld + id_src_one + src_offset];
tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two;
tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value;
}
}
}
@ -145,7 +160,9 @@ void TransposeMatrix(const int src_one, const int src_two,
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two;
real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0];
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
@ -154,6 +171,65 @@ void TransposeMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, upper, lower, diagonal_imag_zero);
}
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest,
const int do_conjugate) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, 0, 0, 0);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -0,0 +1,70 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the batched version of the non-direct GEMM kernel. See part 1 for information
// about the non-batched version of the kernel.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Main entry point of the kernel. This is the regular full version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas,
const __constant real_arg* arg_betas,
const __global realM* restrict agm, const int a_one, const int a_two,
const __global realN* restrict bgm, const int b_one, const int b_two,
__global realM* cgm, const int c_one, const int c_two) {
const int batch = get_group_id(2);
const real alpha = GetRealArg(arg_alphas[batch]);
const real beta = GetRealArg(arg_betas[batch]);
// Sets the offsets
const int a_offset = batch * a_one * a_two;
const int b_offset = batch * b_one * b_two;
const int c_offset = batch * c_one * c_two;
const __global realM* restrict agm_ = &agm[a_offset / VWM];
const __global realN* restrict bgm_ = &bgm[b_offset / VWN];
__global realM* restrict cgm_ = &cgm[c_offset / VWM];
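// Example: with batch = 2, a_one * a_two = 1024 and VWM = 4, a_offset is 2048 scalar elements,
// so agm_ starts at vector element 2048 / 4 = 512 of the realM array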
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm_, cpm, kSizeM, alpha, beta);
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,110 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the batched version of the direct GEMM kernels. See part 1 for information
// about the non-batched version of the kernel.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)];
#if VWMD == 1
alm[kg*(WGD + PADA) + mg] = avec;
#elif VWMD == 2
@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)];
#if VWND == 1
blm[kg*(WGD + PADB) + ng] = bvec;
#elif VWND == 2

View File

@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
// Loads data: off-chip --> local (matrix A and B)
if (a_ld % VWMD == 0) {
if (a_ld % VWMD == 0 && a_offset % VWMD == 0) {
GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
else {
GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
if (b_ld % VWND == 0) {
if (b_ld % VWND == 0 && b_offset % VWND == 0) {
GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
}
else {

View File

@ -21,22 +21,75 @@
namespace clblast {
// =================================================================================================
// For each kernel, this map contains the list of routines in which it is used
const std::vector<std::string> Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"};
const std::vector<std::string> Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"};
const std::vector<std::string> Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"};
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TBMV", "TPMV", "TRMV", "TRSV"};
const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"};
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"};
const std::vector<std::string> Routine::routines_trsm = {"TRSM"};
const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = {
{"Xaxpy", routines_axpy},
{"Xdot", routines_dot},
{"Xgemv", routines_gemv},
{"XgemvFast", routines_gemv},
{"XgemvFastRot", routines_gemv},
{"Xtrsv", routines_gemv},
{"Xger", routines_ger},
{"Copy", routines_gemm_syrk},
{"Pad", routines_gemm_syrk},
{"Transpose", routines_gemm_syrk},
{"Padtranspose", routines_gemm_syrk},
{"Xgemm", routines_gemm_syrk},
{"XgemmDirect", routines_gemm},
{"KernelSelection", routines_gemm},
{"Invert", routines_trsm},
};
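// Example look-up: routines_by_kernel.at("Xtrsv") yields the gemv routine list above, since the
// Xtrsv kernel shares its tuning database with the other level-2 matrix-vector routines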
// =================================================================================================
// The constructor does all the heavy work; errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase,
const std::vector<std::string> &kernel_names, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
kernel_names_(kernel_names),
queue_(queue),
event_(event),
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
db_(kernel_names) {
InitDatabase(userDatabase);
InitProgram(source);
}
void Routine::InitDatabase(const std::vector<Database::DatabaseEntry> &userDatabase) {
for (const auto &kernel_name : kernel_names_) {
// Queries the cache to see whether or not the kernel parameter database is already there
bool has_db;
db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, kernel_name },
&has_db);
if (has_db) { continue; }
// Builds the parameter database for this device and routine set and stores it in the cache
db_(kernel_name) = Database(device_, kernel_name, precision_, userDatabase);
DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, kernel_name },
Database{ db_(kernel_name) });
}
}
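// Note: the Get-then-Store pattern above builds each (precision, device, kernel) parameter
// database at most once; subsequent Routine constructions are served from the cache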
void Routine::InitProgram(std::initializer_list<const char *> source) {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
bool has_program;
program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), precision_, routine_name_ },
&has_program);
if (has_program) { return; }
// Sets the build options from an environment variable (if set)
auto options = std::vector<std::string>();
@ -47,33 +100,36 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
bool has_binary;
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name_ },
&has_binary);
if (has_binary) {
program_ = Program(device_, context_, binary);
program_.Build(device_, options);
ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
Program{ program_ });
return;
}
// Otherwise, the kernel will be compiled and the program will be built. Both the binary and
// the program will then be added to the cache.
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) ||
(precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) {
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
// Collects the parameters for this device in the form of defines, and adds the precision
auto source_string = db_.GetDefines();
auto source_string = std::string{""};
for (const auto &kernel_name : kernel_names_) {
source_string += db_(kernel_name).GetDefines();
}
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
@ -114,21 +170,23 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
program_ = Program(context_, source_string);
try {
program.Build(device_, options);
program_.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
program_.GetBuildInfo(device_).c_str());
}
throw;
}
// Stores the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name_ },
program_.GetIR());
ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
Program{ program_ });
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
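
For reference, BinaryCache, ProgramCache, and DatabaseCache above all share one shape: a process-wide singleton map, queried with Get(key, &hit) and populated with Store(key, value). The following is a minimal sketch of that pattern only, with simplified key handling; the real caches in src/cache.hpp key on combinations of precision, routine or kernel name, and device or context, and store binaries, Program objects, or Database objects.

#include <map>
#include <mutex>

// Minimal sketch of the Get/Store cache pattern (not the actual CLBlast class)
template <typename Key, typename Value>
class Cache {
 public:
  static Cache &Instance() { static Cache instance; return instance; }

  // Returns a copy of the cached value and reports whether it was a hit
  Value Get(const Key &key, bool *in_cache) const {
    std::lock_guard<std::mutex> lock(mutex_);
    const auto iter = contents_.find(key);
    if (iter == contents_.end()) { *in_cache = false; return Value(); }
    *in_cache = true;
    return iter->second;
  }

  // Stores a value under the given key (first writer wins)
  void Store(const Key &key, Value &&value) {
    std::lock_guard<std::mutex> lock(mutex_);
    contents_.emplace(key, std::move(value));
  }

 private:
  mutable std::mutex mutex_;
  std::map<Key, Value> contents_;
};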

View File

@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include <unordered_map>
#include "utilities/utilities.hpp"
#include "cache.hpp"
@ -35,18 +36,39 @@ class Routine {
// Base class constructor. The user database optionally overrides the built-in database.
// All heavy preparation work is done inside this constructor.
// NOTE: the caller must provide the same userDatabase for each combination of device, precision,
// and routine list; otherwise the caching logic will break.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase,
const std::vector<Database::DatabaseEntry> &userDatabase,
std::initializer_list<const char *> source);
// Lists of kernel names used by each routine (kernel-routine look-ups)
static const std::vector<std::string> routines_axpy;
static const std::vector<std::string> routines_dot;
static const std::vector<std::string> routines_ger;
static const std::vector<std::string> routines_gemv;
static const std::vector<std::string> routines_gemm;
static const std::vector<std::string> routines_gemm_syrk;
static const std::vector<std::string> routines_trsm;
static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel;
private:
// Initializes program_, fetching the cached program or building a new one
void InitProgram(std::initializer_list<const char *> source);
// Initializes db_, fetching the cached database or building a new one
void InitDatabase(const std::vector<Database::DatabaseEntry> &userDatabase);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name
// The routine's name and the corresponding kernels
const std::string routine_name_;
const std::vector<std::string> kernel_names_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;
@ -57,8 +79,11 @@ class Routine {
// OpenCL device properties
const std::string device_name_;
// Compiled program (either retrieved from the cache or compiled in the slow path)
Program program_;
// Connection to the databases with all the device-specific parameters
const Database db_;
Databases db_;
};
// =================================================================================================
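
To make the new db_ member above concrete: Databases is, in essence, a per-kernel collection of parameter databases, written and read per kernel via operator() and queried per parameter via operator[]. The sketch below captures that interface under the simplifying assumption that each kernel's database is a plain string-to-value map; the real class stores full Database objects and searches them in order.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Rough sketch of a per-kernel database collection (simplified)
class DatabasesSketch {
 public:
  using Parameters = std::unordered_map<std::string, size_t>;

  explicit DatabasesSketch(const std::vector<std::string> &kernel_names)
      : kernel_names_(kernel_names) {}

  // Accesses the parameter set of one kernel, e.g. db_("Xgemv")
  Parameters &operator()(const std::string &kernel_name) {
    return databases_[kernel_name];
  }

  // Retrieves a single parameter, searching all kernels' databases
  size_t operator[](const std::string &key) const {
    for (const auto &database : databases_) {
      const auto iter = database.second.find(key);
      if (iter != database.second.end()) { return iter->second; }
    }
    throw std::runtime_error("Database parameter not found: " + key);
  }

 private:
  std::vector<std::string> kernel_names_;
  std::unordered_map<std::string, Parameters> databases_;
};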

View File

@ -19,8 +19,8 @@
#include <string>
#include <vector>
#include "clblast.h"
#include "clpp11.hpp"
#include "clblast.h"
#include "database/database.hpp"
namespace clblast {
@ -33,11 +33,52 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// =================================================================================================
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
kernel.SetArgument(2, static_cast<int>(ld));
kernel.SetArgument(3, static_cast<int>(offset));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, GetRealArg(constant_value));
auto local = std::vector<size_t>{8, 8};
auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillVector");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(inc));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
auto local = std::vector<size_t>{64};
auto global = std::vector<size_t>{Ceil(n, 64)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
const Databases &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
@ -155,6 +196,70 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
}
// Batched version of the above
template <typename T>
void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
const Databases &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const Buffer<int> &src_offsets,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const Buffer<int> &dest_offsets,
const Buffer<T> &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
// Determines the right kernel
auto kernel_name = std::string{};
if (do_transpose) {
kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched";
}
else {
kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched";
}
// Retrieves the kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, src_offsets());
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, dest_offsets());
kernel.SetArgument(9, dest());
if (do_pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
// Launches the kernel. Global and local thread sizes are based on parameters
// in the database.
if (do_transpose) {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
batch_count
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"], 1};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]),
batch_count
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"], 1};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
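
As a worked example of the launch-size computations above (and of the rounding in FillMatrix/FillVector): assuming Ceil(x, y) rounds x up to the next multiple of y and CeilDiv(x, y) divides rounding up, as defined in utilities.hpp, a 1000x1000 destination with the hypothetical parameter values PADTRA_WPT=4 and PADTRA_TILE=16 over a batch of 8 yields a 256x256x8 global grid of 16x16x1 work-groups, one z-slice per batch entry.

#include <cstddef>
#include <iostream>
#include <vector>

// Assumed semantics of the rounding helpers used above
size_t CeilDiv(const size_t x, const size_t y) { return (x + y - 1) / y; }
size_t Ceil(const size_t x, const size_t y) { return CeilDiv(x, y) * y; }

int main() {
  const size_t wpt = 4, tile = 16;  // hypothetical tuning values
  const size_t dest_one = 1000, dest_two = 1000, batch_count = 8;
  const std::vector<size_t> global{Ceil(CeilDiv(dest_one, wpt), tile),  // 256
                                   Ceil(CeilDiv(dest_two, wpt), tile),  // 256
                                   batch_count};                        // 8
  const std::vector<size_t> local{tile, tile, 1};
  std::cout << global[0] << " x " << global[1] << " x " << global[2] << std::endl;
  return 0;
}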
// =================================================================================================
} // namespace clblast

View File

@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n,
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
auto kernel1 = Kernel(program_, "Xamax");
auto kernel2 = Kernel(program_, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n,
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
auto kernel1 = Kernel(program_, "Xasum");
auto kernel2 = Kernel(program_, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -44,19 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
(y_offset == 0) && (y_inc == 1) &&
IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
(y_offset == 0) && (y_inc == 1) &&
IsMultiple(n, db_["WPT"]*db_["VW"]);
const auto use_fastest_kernel = use_faster_kernel &&
IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
// If possible, run one of the faster versions of the kernel
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
(use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
if (use_faster_kernel || use_fastest_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
@ -74,13 +76,18 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
}
// Launches the kernel
if (use_fast_kernel) {
if (use_fastest_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else if (use_faster_kernel) {
auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
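
The three kernel tiers above differ only in their divisibility requirements on n (on top of zero offsets and unit increments): XaxpyFaster needs n to be a multiple of WPT*VW, and XaxpyFastest additionally a multiple of WGS*WPT*VW. A small self-contained check of that selection logic, with hypothetical tuning values WGS=64, WPT=4, VW=2:

#include <cstddef>
#include <iostream>

bool IsMultiple(const size_t a, const size_t b) { return (a % b) == 0; }

int main() {
  const size_t WGS = 64, WPT = 4, VW = 2;  // hypothetical tuning values
  for (const size_t n : {1024, 1000, 999}) {
    const bool faster = IsMultiple(n, WPT * VW);                   // n % 8 == 0
    const bool fastest = faster && IsMultiple(n, WGS * WPT * VW);  // n % 512 == 0
    const auto name = fastest ? "XaxpyFastest" : faster ? "XaxpyFaster" : "Xaxpy";
    std::cout << "n=" << n << " -> " << name << std::endl;
  }
  return 0;  // prints: XaxpyFastest, XaxpyFaster, Xaxpy
}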

View File

@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n,
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
auto kernel1 = Kernel(program_, "Xdot");
auto kernel2 = Kernel(program_, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n,
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
auto kernel1 = Kernel(program_, "Xnrm2");
auto kernel2 = Kernel(program_, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -22,9 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
#include "../../kernels/level2/xtrsv.opencl"
}) {
}
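
A note on the constructor above: each #include'd .opencl file contributes one C++11 raw string literal, and the Routine base class concatenates the resulting initializer_list<const char *> into a single OpenCL source string. A sketch of that mechanism with a made-up kernel fragment (the FillVector signature here is illustrative, not the actual CLBlast kernel):

#include <initializer_list>
#include <iostream>
#include <string>

// A kernel source fragment as a raw string literal, as the .opencl files provide it
const char *kFragment = R"(
__kernel void FillVector(const int n, const int inc, const int offset,
                         __global float *dest, const float value) {
  const int id = get_global_id(0);
  if (id < n) { dest[id*inc + offset] = value; }
}
)";

// Concatenates all fragments into one source string (sketch)
std::string Concatenate(std::initializer_list<const char *> source) {
  auto result = std::string{};
  for (const char *fragment : source) { result += fragment; }
  return result;
}

int main() {
  std::cout << Concatenate({kFragment}) << std::endl;
  return 0;
}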
@ -69,14 +70,14 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative (row-major) layout
auto a_altlayout = (layout == Layout::kRowMajor);
const auto a_altlayout = (layout == Layout::kRowMajor);
auto a_one = (a_altlayout) ? n : m;
auto a_two = (a_altlayout) ? m : n;
const auto a_two = (a_altlayout) ? m : n;
// Swap m and n if the matrix is transposed
auto a_transposed = (a_transpose != Transpose::kNo);
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
const auto a_transposed = (a_transpose != Transpose::kNo);
const auto m_real = (a_transposed) ? n : m;
const auto n_real = (a_transposed) ? m : n;
// Special adjustments for banded matrices
if (kl != 0 || ku != 0) {
@ -84,10 +85,10 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
auto a_rotated = a_transposed ^ a_altlayout;
const auto a_rotated = a_transposed ^ a_altlayout;
// In case of complex data-types, the transpose can also become a conjugate transpose
auto a_conjugate = (a_transpose == Transpose::kConjugate);
const auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
@ -106,8 +107,8 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
IsMultiple(a_ld, db_["VW3"]);
// If possible, run the fast version (rotated or non-rotated) of the kernel
auto kernel_name = "Xgemv";
auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto kernel_name = std::string{"Xgemv"};
const auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto global_size = m_ceiled / db_["WPT1"];
auto local_size = db_["WGS1"];
if (fast_kernel) {
@ -122,8 +123,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));

View File

@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
auto kernel = Kernel(program_, "Xger");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));

View File

@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
auto kernel = Kernel(program_, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));

View File

@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
auto kernel = Kernel(program_, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));

View File

@ -52,9 +52,9 @@ void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, k, 0);

View File

@ -52,9 +52,9 @@ void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
ap_buffer, ap_offset, n,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, true, 0, 0);

View File

@ -52,9 +52,9 @@ void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, 0, 0);

View File

@ -0,0 +1,161 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv class (see the header for information about the class).
//
// =================================================================================================
#include "routines/level2/xtrsv.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
// =================================================================================================
template <typename T>
void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }
// Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
(a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
// Retrieves the kernel from the compiled binary
const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, a_buffer());
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, static_cast<int>(a_ld));
kernel.SetArgument(4, b_buffer());
kernel.SetArgument(5, static_cast<int>(b_offset));
kernel.SetArgument(6, static_cast<int>(b_inc));
kernel.SetArgument(7, x_buffer());
kernel.SetArgument(8, static_cast<int>(x_offset));
kernel.SetArgument(9, static_cast<int>(x_inc));
kernel.SetArgument(10, static_cast<int>(is_transposed));
kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
kernel.SetArgument(12, static_cast<int>(do_conjugate));
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{1};
auto event = Event();
RunKernel(kernel, queue_, device_, global, local, event.pointer());
event.WaitForCompletion();
}
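
The kernel choice above follows from one observation: an upper-triangular system is solved backwards (last unknown first), but transposing flips the triangle, so a transposed upper-triangular system is solved forwards. A compact restatement of just that selection, with enums simplified from clblast.h:

#include <iostream>
#include <string>

enum class Triangle { kUpper, kLower };
enum class Transpose { kNo, kYes, kConjugate };

// Mirrors the is_upper selection in Xtrsv::Substitution (sketch)
std::string SubstitutionKernel(const Triangle triangle, const Transpose a_transpose) {
  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
  return is_upper ? "trsv_backward" : "trsv_forward";
}

int main() {
  std::cout << SubstitutionKernel(Triangle::kUpper, Transpose::kNo) << std::endl;   // backward
  std::cout << SubstitutionKernel(Triangle::kUpper, Transpose::kYes) << std::endl;  // forward
  return 0;
}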
// =================================================================================================
// The main routine
template <typename T>
void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
// Creates a copy of B to avoid overwriting input while computing output
// TODO: Give x a zero offset and unit increment by creating custom copy-to and copy-from kernels
const auto x_offset = b_offset;
const auto x_inc = b_inc;
const auto x_size = n*x_inc + x_offset;
auto x_buffer = Buffer<T>(context_, x_size);
b_buffer.CopyTo(queue_, x_size, x_buffer);
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
fill_vector_event.WaitForCompletion();
// Derives properties based on the arguments
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
(layout != Layout::kColMajor && a_transpose != Transpose::kNo));
// Loops over the blocks
auto col = n; // the initial column position
for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);
// Sets the next column position
col = (is_upper) ? col - block_size : i;
// Sets the offsets for upper or lower triangular
const auto extra_offset_a = (is_transposed) ?
(is_upper ? col + (col+block_size)*a_ld : col) :
(is_upper ? col+block_size + col*a_ld : col*a_ld);
const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
const auto extra_offset_b = col*x_inc;
// Runs the GEMV routine to compute x' = A * x
if (i > 0) {
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
a_buffer, a_offset + extra_offset_a, a_ld,
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
x_buffer, x_offset + extra_offset_b, x_inc);
}
// Runs the triangular substitution for the block size
Substitution(layout, triangle, a_transpose, diagonal, block_size,
a_buffer, a_offset + col + col*a_ld, a_ld,
b_buffer, b_offset + col*b_inc, b_inc,
x_buffer, x_offset + col*x_inc, x_inc);
}
// Retrieves the results
x_buffer.CopyTo(queue_, x_size, b_buffer);
}
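
To illustrate the loop above: in the upper-triangular case the blocks are processed from the bottom-right corner of the matrix upwards, and each iteration after the first runs a GEMV update against the already-solved unknowns before the small substitution. A sketch of just the column bookkeeping, with hypothetical n=10 and TRSV_BLOCK_SIZE=4:

#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
  const size_t n = 10, block = 4;  // hypothetical sizes
  auto col = n;  // the initial column position
  for (auto i = size_t{0}; i < n; i += block) {
    const auto block_size = std::min(block, n - i);
    col = col - block_size;  // upper-triangular: walk towards the top-left
    std::cout << "substitute rows [" << col << ", " << col + block_size << ")"
              << ", GEMV over " << i << " solved unknowns" << std::endl;
  }
  return 0;  // blocks [6,10), [2,6), [0,2) with GEMV over 0, 4, 8 unknowns
}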
// =================================================================================================
// Compiles the templated class
template class Xtrsv<half>;
template class Xtrsv<float>;
template class Xtrsv<double>;
template class Xtrsv<float2>;
template class Xtrsv<double2>;
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv routine. It uses a blocked algorithm: small triangular forward
// and backward substitutions on the diagonal parts of the matrix are combined with larger GEMV
// computations on the remainder of the matrix.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRSV_H_
#define CLBLAST_ROUTINES_XTRSV_H_
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrsv: public Xgemv<T> {
public:
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::device_;
using Xgemv<T>::db_;
using Xgemv<T>::program_;
using Xgemv<T>::DoGemv;
// Constructor
Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
// Templated-precision implementation of the routine
void DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
// Performs forward or backward substitution on a small triangular matrix
void Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRSV_H_
#endif

View File

@ -33,10 +33,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_symmetric.opencl"
#include "../../kernels/level3/convert_triangular.opencl"
#include "../../kernels/level3/convert_hermitian.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
, // separated in two parts to prevent C1091 in MSVC 2013
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
@ -103,19 +104,19 @@ void Xgemm<T>::DoGemm(const Layout layout,
// Selects which version of GEMM to run
const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
if (do_gemm_direct) { // for small sizes (single kernel)
return GemmDirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
GemmDirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
}
else { // for larger sizes (pre/post-processing plus a very fast kernel)
return GemmIndirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
a_one, a_two, a_want_rotated,
b_one, b_two, b_want_rotated,
c_one, c_two, c_want_rotated);
GemmIndirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
a_one, a_two, a_want_rotated,
b_one, b_two, b_want_rotated,
c_one, c_two, c_want_rotated);
}
}
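
The selection above is a single flop-volume threshold on m*n*k. Assuming a hypothetical database value XGEMM_MIN_INDIRECT_SIZE of 512^3 (the real value is a per-device tuning parameter), the switch behaves as follows:

#include <cstddef>
#include <iostream>

int main() {
  const size_t min_indirect_size = 512 * 512 * 512;  // hypothetical threshold
  for (const size_t s : {64, 256, 512, 1024}) {
    const bool direct = (s * s * s < min_indirect_size);
    std::cout << "m=n=k=" << s << ": "
              << (direct ? "GemmDirect" : "GemmIndirect") << std::endl;
  }
  return 0;
}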
@ -126,16 +127,16 @@ void Xgemm<T>::DoGemm(const Layout layout,
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@ -150,9 +151,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
@ -178,7 +176,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
a_one_i, a_two_i, a_one_i, 0, a_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_do_transpose, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@ -189,7 +187,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
b_one_i, b_two_i, b_one_i, 0, b_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, b_do_transpose, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@ -200,13 +198,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
c_one_i, c_two_i, c_one_i, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_do_transpose, false);
eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
auto kernel = Kernel(program, "Xgemm");
auto kernel = Kernel(program_, "Xgemm");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_ceiled));
@ -236,7 +234,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_do_transpose, false);
}
}
@ -247,21 +245,18 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
// The direct version of GEMM, requiring just one kernel and no pre- or post-processing kernels.
template <typename T>
void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate) {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate) {
// Retrieves the proper XgemmDirect kernel from the compiled binary
const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
(b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
auto kernel = Kernel(program, name);
auto kernel = Kernel(program_, name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m));

View File

@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the Hermitian matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));

View File

@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
using Xgemm<T>::program_;
using Xgemm<T>::db_;
using Xgemm<T>::DoGemm;

View File

@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
}
@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
}
@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
}
@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
}
@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}

View File

@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}

View File

@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));

Some files were not shown because too many files have changed in this diff