Merge pull request #58 from CNugteren/development

Update to version 0.7.0
This commit is contained in:
Cedric Nugteren 2016-05-08 21:25:50 +02:00
commit d91356a6b7
187 changed files with 12794 additions and 3069 deletions

3
.gitignore vendored
View file

@ -2,4 +2,5 @@ build
stash
.*
*.pyc
*.db
*.db
cl.hpp

View file

@ -1,4 +1,23 @@
Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Changed the enum parameters to match the raw values of the cblas standard
- Fixed the cache of previously compiled binaries and added a function to fill or clear it
- Various minor fixes and enhancements
- Added a preliminary version of the API documentation
- Added additional sample programs
- Added tuned parameters for various devices (see README)
- Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2
* SASUM/DASUM/ScASUM/DzASUM
* SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
* iSAMAX/iDAMAX/iCAMAX/iZAMAX
* iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
* iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines)
Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
- Added tuned parameters for various devices (see README)

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 6)
set(clblast_VERSION_MINOR 7)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -66,13 +66,22 @@ else ()
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
set(FLAGS "${FLAGS} -Wno-deprecated-declarations")
endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
# C compiler settings (for the sample)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(CFLAGS "/Ox")
else ()
set(CFLAGS "-O3 -std=c99")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
# ==================================================================================================
# Package scripts location
@ -90,11 +99,13 @@ if(TUNERS)
endif()
endif()
# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
if(TESTS)
find_package(clBLAS)
if(NOT CLBLAS_FOUND)
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
find_package(CBLAS)
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
endif()
endif()
@ -109,8 +120,8 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
@ -120,7 +131,8 @@ set(PRECISIONS 32 64 3232 6464)
# ==================================================================================================
# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc src/clblast_c.cc)
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
src/utilities.cc src/clblast_c.cc)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
endforeach()
@ -156,6 +168,9 @@ endif()
# This section contains all the code related to the examples
if(SAMPLES)
# Downloads the cl.hpp file from Khronos
file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
@ -204,11 +219,33 @@ endif()
# ==================================================================================================
# Down from here is all test (performance and correctness) related. Note that these tests require
# the presence of the clBLAS library to act as a reference.
# the presence of clBLAS and/or a BLAS library to act as a reference.
if(TESTS)
# Adds new include directories for the reference clBLAS
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
# Sets the specifics for the reference BLAS libraries
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
endif()
endif()
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
endif()
endif()
# Sets the include directories
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
@ -228,7 +265,7 @@ if(TESTS)
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
endforeach()
@ -258,7 +295,7 @@ if(TESTS)
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()

155
README.md
View file

@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
- Intel OpenCL
- Beignet
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
- OpenBLAS
- BLIS
- Accelerate
An example of an out-of-source build (starting from the root of the CLBlast folder):
mkdir build
@ -76,7 +84,7 @@ Or alternatively the plain C version:
#include <clblast_c.h>
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone example programs are included in `samples/`.
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
Using the tuners (optional)
@ -95,6 +103,8 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Tesla K40m
* AMD GPUs:
- Tahiti
- Hawaii
- Pitcairn
- R9 M370X
* Intel GPUs:
- Iris
@ -128,16 +138,16 @@ In summary, tuning the entire library for your device can be done as follows (st
make
Compiling the tests (optional)
Compiling the correctness and performance tests (optional)
-------------
To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
cmake -DTESTS=ON ..
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
Performance remarks
@ -161,64 +171,77 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xROTG | | | - | - | |
| xROTMG | | | - | - | |
| xROT | | | - | - | |
| xROTM | | | - | - | |
| xSWAP | ✔ | ✔ | ✔ | ✔ | |
| xSCAL | ✔ | ✔ | ✔ | ✔ | +CS +ZD |
| xCOPY | ✔ | ✔ | ✔ | ✔ | |
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
| xDOT | ✔ | ✔ | - | - | |
| xDOTU | - | - | ✔ | ✔ | |
| xDOTC | - | - | ✔ | ✔ | |
| xNRM2 | | | - | - | +SC +DZ |
| xASUM | | | - | - | +SC +DZ |
| IxAMAX | | | | | |
| Level-1 | S | D | C | Z |
| ---------|---|---|---|---|
| xSWAP | ✔ | ✔ | ✔ | ✔ |
| xSCAL | ✔ | ✔ | ✔ | ✔ |
| xCOPY | ✔ | ✔ | ✔ | ✔ |
| xAXPY | ✔ | ✔ | ✔ | ✔ |
| xDOT | ✔ | ✔ | - | - |
| xDOTU | - | - | ✔ | ✔ |
| xDOTC | - | - | ✔ | ✔ |
| xNRM2 | ✔ | ✔ | ✔ | ✔ |
| xASUM | ✔ | ✔ | ✔ | ✔ |
| IxAMAX | ✔ | ✔ | ✔ | ✔ |
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
| xGBMV | ✔ | ✔ | ✔ | ✔ | |
| xHEMV | - | - | ✔ | ✔ | |
| xHBMV | - | - | ✔ | ✔ | |
| xHPMV | - | - | ✔ | ✔ | |
| xSYMV | ✔ | ✔ | - | - | |
| xSBMV | ✔ | ✔ | - | - | |
| xSPMV | ✔ | ✔ | - | - | |
| xTRMV | ✔ | ✔ | ✔ | ✔ | |
| xTBMV | ✔ | ✔ | ✔ | ✔ | |
| xTPMV | ✔ | ✔ | ✔ | ✔ | |
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
| xGER | ✔ | ✔ | - | - | |
| xGERU | - | - | ✔ | ✔ | |
| xGERC | - | - | ✔ | ✔ | |
| xHER | - | - | ✔ | ✔ | |
| xHPR | - | - | ✔ | ✔ | |
| xHER2 | - | - | ✔ | ✔ | |
| xHPR2 | - | - | ✔ | ✔ | |
| xSYR | ✔ | ✔ | - | - | |
| xSPR | ✔ | ✔ | - | - | |
| xSYR2 | ✔ | ✔ | - | - | |
| xSPR2 | ✔ | ✔ | - | - | |
| Level-2 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMV | ✔ | ✔ | ✔ | ✔ |
| xGBMV | ✔ | ✔ | ✔ | ✔ |
| xHEMV | - | - | ✔ | ✔ |
| xHBMV | - | - | ✔ | ✔ |
| xHPMV | - | - | ✔ | ✔ |
| xSYMV | ✔ | ✔ | - | - |
| xSBMV | ✔ | ✔ | - | - |
| xSPMV | ✔ | ✔ | - | - |
| xTRMV | ✔ | ✔ | ✔ | ✔ |
| xTBMV | ✔ | ✔ | ✔ | ✔ |
| xTPMV | ✔ | ✔ | ✔ | ✔ |
| xGER | ✔ | ✔ | - | - |
| xGERU | - | - | ✔ | ✔ |
| xGERC | - | - | ✔ | ✔ |
| xHER | - | - | ✔ | ✔ |
| xHPR | - | - | ✔ | ✔ |
| xHER2 | - | - | ✔ | ✔ |
| xHPR2 | - | - | ✔ | ✔ |
| xSYR | ✔ | ✔ | - | - |
| xSPR | ✔ | ✔ | - | - |
| xSYR2 | ✔ | ✔ | - | - |
| xSPR2 | ✔ | ✔ | - | - |
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
| xHEMM | - | - | ✔ | ✔ | |
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
| xHERK | - | - | ✔ | ✔ | |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
| xHER2K | - | - | ✔ | ✔ | |
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
| xTRSM | | | | | |
| Level-3 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMM | ✔ | ✔ | ✔ | ✔ |
| xSYMM | ✔ | ✔ | ✔ | ✔ |
| xHEMM | - | - | ✔ | ✔ |
| xSYRK | ✔ | ✔ | ✔ | ✔ |
| xHERK | - | - | ✔ | ✔ |
| xSYR2K | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ |
| xTRMM | ✔ | ✔ | ✔ | ✔ |
In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
| Additional | S | D | C | Z |
| -----------|---|---|---|---|
| xSUM | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ |
| IxMIN | ✔ | ✔ | ✔ | ✔ |
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
| Unsupported | S | D | C | Z |
| ------------|---|---|---|---|
| xROTG | | | - | - |
| xROTMG | | | - | - |
| xROT | | | - | - |
| xROTM | | | - | - |
| xTRSV | | | | |
| xTBSV | | | | |
| xTPSV | | | | |
| xTRSM | | | | |
Contributing
@ -226,28 +249,28 @@ Contributing
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Such contributions should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
The contributing authors so far are:
The contributing authors (code, pull requests, testing) so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl)
* [Anton Lokhmotov](https://github.com/psyhtest)
* [Dragan Djuric](https://github.com/blueberry)
* [Hugh Perkins](https://github.com/hughperkins)
Tuning and testing on a variety of OpenCL devices was made possible by:
* [TU/e ES research group](http://www.es.ele.tue.nl/)
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [Dividiti](http://www.dividiti.com)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
Support us
-------------
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
To-do list before release of version 1.0
-------------
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
- Enable correctness and performance testing against a CPU-based BLAS library
- Test in multi-threaded environments
- Add API documentation

View file

@ -0,0 +1,75 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
# CBLAS_LIBRARIES The Netlib BLAS library
#
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CBLAS_HINTS
${CBLAS_ROOT}
$ENV{CBLAS_ROOT}
)
set(CBLAS_PATHS
/usr
/usr/local
/usr/local/opt
/System/Library/Frameworks
)
# Finds the include directories
find_path(CBLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
include inc include/x86_64 include/x64
openblas/include include/blis blis/include blis/include/blis
Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS include header cblas.h"
)
mark_as_advanced(CBLAS_INCLUDE_DIRS)
# Finds the library
find_library(CBLAS_LIBRARIES
NAMES cblas blas mkl blis openblas accelerate
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
openblas/lib blis/lib lib/atlas-base
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS library"
)
mark_as_advanced(CBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CBLAS_INCLUDE_DIRS)
message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
endif()
if(NOT CBLAS_LIBRARIES)
message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
endif()
# Determines whether or not BLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
# ==================================================================================================

2434
doc/clblast.md Normal file

File diff suppressed because it is too large Load diff

View file

@ -73,11 +73,11 @@ enum class StatusCode {
};
// Matrix layout and transpose types
enum class Layout { kRowMajor, kColMajor };
enum class Transpose { kNo, kYes, kConjugate };
enum class Side { kLeft, kRight };
enum class Triangle { kUpper, kLower };
enum class Diagonal { kUnit, kNonUnit };
enum class Layout { kRowMajor = 101, kColMajor = 102 };
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
enum class Triangle { kUpper = 121, kLower = 122 };
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@ -87,26 +87,60 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Generate modified givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Apply givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const T cos,
const T sin,
cl_command_queue* queue, cl_event* event = nullptr);
// Apply modified givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
template <typename T>
StatusCode Swap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
template <typename T>
StatusCode Copy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
template <typename T>
@ -114,7 +148,7 @@ StatusCode Axpy(const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two vectors: SDOT/DDOT
template <typename T>
@ -122,7 +156,7 @@ StatusCode Dot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
@ -130,7 +164,7 @@ StatusCode Dotu(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
@ -138,7 +172,49 @@ StatusCode Dotc(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
template <typename T>
StatusCode Sum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
template <typename T>
StatusCode Max(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
template <typename T>
StatusCode Min(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
@ -153,7 +229,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
template <typename T>
@ -164,7 +240,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
@ -175,7 +251,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
@ -186,7 +262,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
@ -197,7 +273,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
template <typename T>
@ -208,7 +284,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
template <typename T>
@ -219,7 +295,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
template <typename T>
@ -230,7 +306,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
@ -238,7 +314,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
@ -246,7 +322,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
@ -254,7 +330,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
@ -262,7 +338,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
@ -270,7 +346,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
@ -278,7 +354,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 matrix update: SGER/DGER
template <typename T>
@ -288,7 +364,7 @@ StatusCode Ger(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
@ -298,7 +374,7 @@ StatusCode Geru(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
@ -308,7 +384,7 @@ StatusCode Gerc(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
@ -317,7 +393,7 @@ StatusCode Her(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
@ -326,7 +402,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
@ -336,7 +412,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
@ -346,7 +422,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-1 matrix update: SSYR/DSYR
template <typename T>
@ -355,7 +431,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-1 matrix update: SSPR/DSPR
template <typename T>
@ -364,7 +440,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-2 matrix update: SSYR2/DSYR2
template <typename T>
@ -374,7 +450,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
template <typename T>
@ -384,7 +460,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
@ -399,7 +475,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
@ -410,7 +486,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
@ -421,7 +497,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
template <typename T>
@ -431,7 +507,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
@ -441,7 +517,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
template <typename T>
@ -452,7 +528,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
@ -463,7 +539,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
template <typename T>
@ -472,7 +548,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
@ -481,9 +557,20 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode ClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode FillCache(const cl_device_id device);
// =================================================================================================
} // namespace clblast
// CLBLAST_CLBLAST_H_

File diff suppressed because it is too large Load diff

100
include/internal/cache.h Normal file
View file

@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the caching functionality of compiled binaries and programs.
//
// =================================================================================================
#ifndef CLBLAST_CACHE_H_
#define CLBLAST_CACHE_H_
#include <string>
#include <vector>
#include <mutex>
#include "internal/utilities.h"
namespace clblast {
namespace cache {
// =================================================================================================
// The cache of compiled OpenCL binaries, along with some meta-data
struct BinaryCache {
  std::string binary;        // compiled program binary, stored as an opaque byte-string
  std::string device_name;   // name of the OpenCL device this binary was built for
  Precision precision;       // floating-point precision the kernel was compiled with
  std::string routine_name_; // name of the CLBlast routine this binary implements

  // Finds out whether the properties match. Declared 'const' since it only
  // inspects the stored meta-data and never modifies the cache entry.
  bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
                    const std::string &ref_routine) const {
    return (device_name == ref_device &&
            precision == ref_precision &&
            routine_name_ == ref_routine);
  }
};
// The actual cache, implemented as a vector of the above data-type, and its mutex
// NOTE(review): 'static' at namespace scope in a header gives every translation unit
// that includes this file its OWN copy of the cache and mutex. That is only correct
// if this header is included from a single .cc file -- verify against the build.
static std::vector<BinaryCache> binary_cache_;
static std::mutex binary_cache_mutex_;
// =================================================================================================
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
  Program program;            // the compiled OpenCL program object
  ContextPointer context_ptr; // pointer identifying the context the program belongs to
  Precision precision;        // floating-point precision the program was compiled with
  std::string routine_name_;  // name of the CLBlast routine this program implements

  // Finds out whether the properties match. Declared 'const' since it only
  // inspects the stored meta-data and never modifies the cache entry.
  bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
                    const std::string &ref_routine) const {
    return (context_ptr == ref_context &&
            precision == ref_precision &&
            routine_name_ == ref_routine);
  }
};
// The actual cache, implemented as a vector of the above data-type, and its mutex
// NOTE(review): as with the binary cache above, 'static' in a header means one copy
// per translation unit -- safe only if a single .cc file includes this header.
static std::vector<ProgramCache> program_cache_;
static std::mutex program_cache_mutex_;
// =================================================================================================
// Stores the compiled binary or program in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name);
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name);
// Queries the cache and retrieves a matching binary or program. Assumes that the match is
// available, throws otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name);
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name);
// =================================================================================================
// Clears the cache of stored binaries
StatusCode ClearCache();
// =================================================================================================
} // namespace cache
} // namespace clblast
// CLBLAST_CACHE_H_
#endif

View file

@ -78,11 +78,16 @@ class Event {
// Regular constructor
explicit Event(): event_(nullptr) { }
// Waits for completion of this event.
// Blocks the calling thread via clWaitForEvents until the event has finished;
// any OpenCL error is converted into an exception by CheckError.
void WaitForCompletion() const {
CheckError(clWaitForEvents(1, &event_));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
CheckError(clWaitForEvents(1, &event_));
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
@ -95,10 +100,14 @@ class Event {
// Accessor to the private data-member
cl_event& operator()() { return event_; }
cl_event* pointer() { return &event_; }
private:
cl_event event_;
};
// Pointer to an OpenCL event
using EventPointer = cl_event*;
// =================================================================================================
// C++11 version of 'cl_platform_id'
@ -260,10 +269,14 @@ class Context {
// Accessor to the private data-member
const cl_context& operator()() const { return *context_; }
cl_context* pointer() const { return &(*context_); }
private:
std::shared_ptr<cl_context> context_;
};
// Pointer to an OpenCL context
using ContextPointer = cl_context*;
// =================================================================================================
// Enumeration of build statuses of the run-time compilation process
@ -274,7 +287,7 @@ class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
@ -285,6 +298,22 @@ class Program {
CheckError(status);
}
// Binary-based constructor with memory management.
// Builds a Program directly from a previously compiled device binary, so the
// expensive source-compilation step can be skipped for cached kernels.
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary), // 'source_' doubles as the binary storage in this code path
source_ptr_(&source_[0]) {
auto status1 = CL_SUCCESS; // per-device binary-load status (binary_status argument)
auto status2 = CL_SUCCESS; // overall error code of the create call (errcode_ret)
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
}
// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
@ -313,7 +342,7 @@ class Program {
return result;
}
// Retrieves an intermediate representation of the compiled program
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
@ -329,7 +358,7 @@ class Program {
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};
@ -468,31 +497,33 @@ class Buffer {
}
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
@ -601,17 +632,37 @@ class Kernel {
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
0, nullptr, &(event())));
0, nullptr, event));
}
// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
waitForEventsPlain.push_back(waitEvent());
}
// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
event));
}
// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
0, nullptr, event));
}
// Accessor to the private data-member

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::CopySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
@ -129,6 +133,8 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
@ -181,8 +187,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -124,7 +128,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
@ -131,8 +135,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -183,6 +189,8 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -137,6 +141,8 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // ARM GPUs

View file

@ -18,54 +18,38 @@ const Database::DatabaseEntry Database::XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -77,54 +61,38 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
}
@ -136,47 +104,32 @@ const Database::DatabaseEntry Database::XdotDouble = {
"Xdot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
}
@ -188,47 +141,32 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -60,12 +62,12 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -100,7 +104,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -119,12 +123,12 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -222,12 +230,12 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
@ -71,8 +73,10 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel CPUs
@ -119,6 +123,8 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
@ -164,6 +170,8 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -65,8 +67,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -111,8 +115,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -138,7 +144,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
}
},
}
@ -151,6 +157,8 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}

View file

@ -0,0 +1,34 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides macro's to define the public API. This is needed when building a Windows DLL.
// Note: this is only used for the C++ interface, the C interface has its own definition included in
// the header file itself.
//
// =================================================================================================
#ifndef CLBLAST_PUBLIC_API_H_
#define CLBLAST_PUBLIC_API_H_
namespace clblast {
// =================================================================================================
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API
#endif
// =================================================================================================
} // namespace clblast
// CLBLAST_PUBLIC_API_H_
#endif

View file

@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include "internal/cache.h"
#include "internal/utilities.h"
#include "internal/database.h"
@ -30,30 +31,11 @@ template <typename T>
class Routine {
public:
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
std::string device_name;
Precision precision;
std::string routine_name_;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
// The actual cache, implemented as a vector of the above data-type
static std::vector<ProgramCache> program_cache_;
// Helper functions which check for errors in the status code
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor
explicit Routine(Queue &queue, Event &event, const std::string &name,
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
@ -63,7 +45,12 @@ class Routine {
// Runs a kernel given the global and local thread sizes
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local);
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents);
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event);
// Tests for valid inputs of matrices A, B, and C
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
@ -75,17 +62,22 @@ class Routine {
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
const size_t offset, const size_t data_size);
// Tests for valid inputs of vectors X and Y
// Tests for valid inputs of vector X and Y
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
// Tests for valid inputs of other vectors
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t data_size);
StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
const size_t offset, const size_t data_size);
// Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments.
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@ -95,12 +87,30 @@ class Routine {
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false);
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
// The first assumes that the program is available in the cache and will throw an exception
// otherwise.
const Program& GetProgramFromCache() const;
bool ProgramIsInCache() const;
// Stores a newly compiled binary/program into the cache
void StoreBinaryToCache(const std::string& binary) const {
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
}
void StoreProgramToCache(const Program& program) const {
cache::StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Queries the cache and retrieve either a matching binary/program or a boolean whether a match
// exists. The first assumes that the binary/program is available in the cache and will throw an
// exception otherwise.
std::string GetBinaryFromCache() const {
return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
}
Program GetProgramFromCache() const {
return cache::GetProgramFromCache(context_, precision_, routine_name_);
}
bool BinaryIsInCache() const {
return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
}
bool ProgramIsInCache() const {
return cache::ProgramIsInCache(context_, precision_, routine_name_);
}
// Non-static variable for the precision. Note that the same variable (but static) might exist in
// a derived class.
@ -112,7 +122,7 @@ class Routine {
// The OpenCL objects, accessible only from derived classes
Queue queue_;
Event event_;
EventPointer event_;
const Context context_;
const Device device_;

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XAMAX_H_
#define CLBLAST_ROUTINES_XAMAX_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xamax: level-1 routine computing the index of the element with the maximum
// absolute value (BLAS ixAMAX). Declaration only: the implementation lives in
// the corresponding source file, not visible here.
template <typename T>
class Xamax: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorIndex;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine. Writes the resulting
// index into 'imax_buffer' at 'imax_offset'; reads n strided elements of x.
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XAMAX_H_
#endif

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XASUM_H_
#define CLBLAST_ROUTINES_XASUM_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xasum: level-1 routine computing the sum of absolute values of a vector
// (BLAS xASUM). Declaration only: the implementation lives in the
// corresponding source file, not visible here.
template <typename T>
class Xasum: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine. Writes the scalar
// result into 'asum_buffer' at 'asum_offset'; reads n strided elements of x.
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XASUM_H_
#endif

View file

@ -28,6 +28,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY");
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,

View file

@ -28,6 +28,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xcopy(Queue &queue, Event &event, const std::string &name = "COPY");
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,

View file

@ -28,6 +28,7 @@ class Xdot: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
@ -37,7 +38,7 @@ class Xdot: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xdot(Queue &queue, Event &event, const std::string &name = "DOT");
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
StatusCode DoDot(const size_t n,

View file

@ -28,7 +28,7 @@ class Xdotc: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC");
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
StatusCode DoDotc(const size_t n,

View file

@ -28,7 +28,7 @@ class Xdotu: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU");
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
StatusCode DoDotu(const size_t n,

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmax routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XMAX_H_
#define CLBLAST_ROUTINES_XMAX_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xmax: the non-absolute variant of the xAMAX routine, locating the index of
// the (signed) maximum element. It re-uses the Xamax machinery entirely.
template <typename T>
class Xmax: public Xamax<T> {
public:
// Implementation inherited from the absolute-value base class
using Xamax<T>::DoAmax;
// Constructor: forwards to Xamax, differing only in the routine name
Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
    Xamax<T>(queue, event, name) {
}
// Delegates directly to DoAmax; the non-absolute behaviour is selected inside
// the OpenCL kernel by a pre-processor macro derived from the routine name.
StatusCode DoMax(const size_t n,
                 const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XMAX_H_
#endif

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmin routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XMIN_H_
#define CLBLAST_ROUTINES_XMIN_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xmin: the non-absolute minimum variant of the xAMAX routine, locating the
// index of the (signed) minimum element. It re-uses the Xamax machinery.
template <typename T>
class Xmin: public Xamax<T> {
public:
// Implementation inherited from the absolute-value base class
using Xamax<T>::DoAmax;
// Constructor: forwards to Xamax, differing only in the routine name
Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
    Xamax<T>(queue, event, name) {
}
// Delegates directly to DoAmax; the minimum-seeking behaviour is selected
// inside the OpenCL kernel by a pre-processor macro derived from the name.
StatusCode DoMin(const size_t n,
                 const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XMIN_H_
#endif

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XNRM2_H_
#define CLBLAST_ROUTINES_XNRM2_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xnrm2: level-1 routine computing the Euclidean norm of a vector
// (BLAS xNRM2). Declaration only: the implementation lives in the
// corresponding source file, not visible here.
template <typename T>
class Xnrm2: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine. Writes the scalar
// result into 'nrm2_buffer' at 'nrm2_offset'; reads n strided elements of x.
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XNRM2_H_
#endif

View file

@ -28,13 +28,14 @@ class Xscal: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xscal(Queue &queue, Event &event, const std::string &name = "SCAL");
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsum routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSUM_H_
#define CLBLAST_ROUTINES_XSUM_H_
#include "internal/routine.h"
#include "internal/routines/level1/xasum.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xsum: the non-absolute variant of the xASUM routine, summing the elements of
// a vector without taking absolute values. It re-uses the Xasum machinery.
template <typename T>
class Xsum: public Xasum<T> {
public:
// Implementation inherited from the absolute-value base class
using Xasum<T>::DoAsum;
// Constructor: forwards to Xasum, differing only in the routine name
Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
    Xasum<T>(queue, event, name) {
}
// Delegates directly to DoAsum; the non-absolute behaviour is selected inside
// the OpenCL kernel by a pre-processor macro derived from the routine name.
StatusCode DoSum(const size_t n,
                 const Buffer<T> &sum_buffer, const size_t sum_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSUM_H_
#endif

View file

@ -28,6 +28,7 @@ class Xswap: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xswap: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xswap(Queue &queue, Event &event, const std::string &name = "SWAP");
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
StatusCode DoSwap(const size_t n,

View file

@ -30,7 +30,7 @@ class Xgbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,

View file

@ -28,6 +28,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -37,7 +38,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV");
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,

View file

@ -28,6 +28,7 @@ class Xger: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -36,7 +37,7 @@ class Xger: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xger(Queue &queue, Event &event, const std::string &name = "GER");
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
StatusCode DoGer(const Layout layout,

View file

@ -28,7 +28,7 @@ class Xgerc: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
StatusCode DoGerc(const Layout layout,

View file

@ -28,7 +28,7 @@ class Xgeru: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
StatusCode DoGeru(const Layout layout,

View file

@ -30,7 +30,7 @@ class Xhbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
StatusCode DoHbmv(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xhemv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
StatusCode DoHemv(const Layout layout, const Triangle triangle,

View file

@ -28,6 +28,7 @@ class Xher: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
@ -36,7 +37,7 @@ class Xher: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher(Queue &queue, Event &event, const std::string &name = "HER");
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
// Translates alpha of type 'U' into type 'T'
T GetAlpha(const U alpha);

View file

@ -28,6 +28,7 @@ class Xher2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -37,7 +38,7 @@ class Xher2: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
StatusCode DoHer2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xhpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
StatusCode DoHpmv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xhpr: public Xher<T,U> {
using Xher<T,U>::DoHer;
// Constructor
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xhpr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xsbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
StatusCode DoSbmv(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xspmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
StatusCode DoSpmv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xspr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
StatusCode DoSpr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xspr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
StatusCode DoSpr2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xsymv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
StatusCode DoSymv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xsyr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
StatusCode DoSyr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xsyr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
StatusCode DoSyr2(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
StatusCode DoTbmv(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
StatusCode DoTpmv(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtrmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
StatusCode DoTrmv(const Layout layout, const Triangle triangle,

View file

@ -28,6 +28,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -38,7 +39,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM");
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,

View file

@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM");
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -30,6 +30,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -40,7 +41,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K");
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,

View file

@ -30,6 +30,7 @@ class Xherk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -39,7 +40,7 @@ class Xherk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xherk(Queue &queue, Event &event, const std::string &name = "HERK");
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,

View file

@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM");
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -30,6 +30,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -40,7 +41,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K");
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,

View file

@ -32,6 +32,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -41,7 +42,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK");
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,

View file

@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM");
Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
// Templated-precision implementation of the routine
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -35,6 +35,9 @@ using double2 = std::complex<double>;
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
// Caught an unknown error
constexpr auto kUnknownError = -999;
// =================================================================================================
// The routine-specific arguments in string form
@ -61,6 +64,9 @@ constexpr auto kArgBOffset = "offb";
constexpr auto kArgCOffset = "offc";
constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgNrm2Offset = "offnrm2";
constexpr auto kArgAsumOffset = "offasum";
constexpr auto kArgImaxOffset = "offimax";
constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta";
@ -69,12 +75,14 @@ constexpr auto kArgFraction = "fraction";
// The client-specific arguments in string form
constexpr auto kArgCompareclblas = "clblas";
constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
// The client-specific arguments in string form
constexpr auto kArgFullTest = "full_test";
constexpr auto kArgVerbose = "verbose";
// The common arguments in string form
constexpr auto kArgPlatform = "platform";
@ -113,6 +121,9 @@ struct Arguments {
size_t c_offset = 0;
size_t ap_offset = 0;
size_t dot_offset = 0;
size_t nrm2_offset = 0;
size_t asum_offset = 0;
size_t imax_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
size_t x_size = 1;
@ -121,16 +132,15 @@ struct Arguments {
size_t b_size = 1;
size_t c_size = 1;
size_t ap_size = 1;
size_t dot_size = 1;
size_t scalar_size = 1;
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
int compare_clblas = 1;
int compare_cblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
// Tester-specific arguments
bool full_test = false;
// Common arguments
size_t platform_id = 0;
size_t device_id = 0;
@ -149,7 +159,7 @@ struct Buffers {
Buffer<T> b_mat;
Buffer<T> c_mat;
Buffer<T> ap_mat;
Buffer<T> dot;
Buffer<T> scalar;
};
// =================================================================================================

133
samples/cache.c Normal file
View file

@ -0,0 +1,133 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster
// repeated kernel execution. The cache can be pre-initialized or cleared.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// Forward declaration
void run_example_routine(const cl_device_id device);
// =================================================================================================
// Example use of the CLBlast kernel cache
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  if (num_platforms == 0) {  // robustness fix: avoid indexing an empty platform list
    fprintf(stderr, "No OpenCL platforms found\n");
    return 1;
  }
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  if (platforms == NULL) { return 1; }  // robustness fix: malloc was unchecked
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  if (num_devices == 0) {  // robustness fix: avoid indexing an empty device list
    fprintf(stderr, "No OpenCL devices found\n");
    free(platforms);
    return 1;
  }
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  if (devices == NULL) { free(platforms); return 1; }  // robustness fix: malloc was unchecked
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Run the routine multiple times in a row: after the first time the binary is already in the
  // cache and compilation is no longer needed.
  printf("Starting caching sample with an empty cache\n");
  run_example_routine(device);
  run_example_routine(device);
  run_example_routine(device);

  // Clearing the cache makes CLBlast re-compile the kernel once
  printf("Clearing cache\n");
  CLBlastClearCache();
  run_example_routine(device);
  run_example_routine(device);

  // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by
  // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from
  // pre-compiled kernels and thus execute at maximum speed.
  printf("Clearing cache\n");
  CLBlastClearCache();
  printf("Filling cache (this might take a while)\n");
  CLBlastFillCache(device);
  run_example_routine(device);

  // Clean-up
  free(platforms);
  free(devices);
  return 0;
}
// =================================================================================================
// Runs an example routine and reports the time
// Runs one SASUM call on the given device and reports the wall-clock time, demonstrating that
// repeated calls are faster once the compiled binary is in the CLBlast cache.
void run_example_routine(const cl_device_id device) {

  // Example SASUM arguments
  const size_t n = 1024*128;

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  float* host_input = (float*)malloc(sizeof(float)*n);
  float* host_output = (float*)malloc(sizeof(float)*1);
  for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; }
  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }

  // Copy the data-structures to the device
  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Start the timer
  clock_t start = clock();

  // Calls an example routine
  StatusCode status = CLBlastSasum(n,
                                   device_output, 0,
                                   device_input, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event afterwards: it was
  // previously leaked on every invocation of this function.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Retrieves the execution time
  clock_t diff = clock() - start;
  double time_ms = diff * 1000.0f / (double)CLOCKS_PER_SEC;

  // Routine completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed routine with status %d in %.3lf ms\n", status, time_ms);

  // Clean-up
  free(host_input);
  free(host_output);
  clReleaseMemObject(device_input);
  clReleaseMemObject(device_output);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
}
// =================================================================================================

106
samples/dgemv.c Normal file
View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of
// the C API to the CLBlast library.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// =================================================================================================
// Example use of the double-precision routine DGEMV
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example DGEMV arguments: y = alpha * A * x + beta * y, with A an m-by-n row-major matrix
  const size_t m = 128;
  const size_t n = 289;
  const double alpha = 0.7;
  const double beta = 0.0;
  const size_t a_ld = n;  // row-major: leading dimension equals the number of columns

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  double* host_a = (double*)malloc(sizeof(double)*m*n);
  double* host_x = (double*)malloc(sizeof(double)*n);
  double* host_y = (double*)malloc(sizeof(double)*m);
  for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; }
  for (size_t i=0; i<n; ++i) { host_x[i] = -8.199; }
  for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; }

  // Copy the data-structures to the device
  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL);
  cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL);
  cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);

  // Call the DGEMV routine.
  StatusCode status = CLBlastDgemv(kRowMajor, kNo,
                                   m, n,
                                   alpha,
                                   device_a, 0, a_ld,
                                   device_x, 0, 1,
                                   beta,
                                   device_y, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event, which was leaked before.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed DGEMV with status %d\n", status);

  // Clean-up
  free(platforms);
  free(devices);
  free(host_a);
  free(host_x);
  free(host_y);
  clReleaseMemObject(device_a);
  clReleaseMemObject(device_x);
  clReleaseMemObject(device_y);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}
// =================================================================================================

96
samples/sasum.c Normal file
View file

@ -0,0 +1,96 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of
// the C API to the CLBlast library.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// =================================================================================================
// Example use of the single-precision routine SASUM
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example SASUM arguments: sums the absolute values of the n input elements
  const size_t n = 1000;
  const float input_value = -1.5f;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  float* host_input = (float*)malloc(sizeof(float)*n);
  float* host_output = (float*)malloc(sizeof(float)*1);
  for (size_t i=0; i<n; ++i) { host_input[i] = input_value; }
  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }

  // Copy the data-structures to the device
  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Call the SASUM routine.
  StatusCode status = CLBlastSasum(n,
                                   device_output, 0,
                                   device_input, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event, which was leaked before.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Copies the result back to the host (host_output stays 0.0f when the routine failed)
  clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]);

  // Clean-up
  free(platforms);
  free(devices);
  free(host_input);
  free(host_output);
  clReleaseMemObject(device_input);
  clReleaseMemObject(device_output);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}
// =================================================================================================

View file

@ -15,6 +15,7 @@
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@ -47,11 +48,11 @@ int main(void) {
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_platform_id platform = platforms[platform_id];
// Initializes the OpenCL device (note: example for GPU devices only)
// Initializes the OpenCL device
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_device_id device = devices[device_id];
// Creates the OpenCL context, queue, and an event
@ -89,7 +90,7 @@ int main(void) {
clWaitForEvents(1, &event);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed with status %d\n", status);
printf("Completed SGEMM with status %d\n", status);
// Clean-up
free(platforms);

View file

@ -8,8 +8,8 @@
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
// requires the Khronos C++ OpenCL API header file (not included). The example uses C++ features,
// but CLBlast can also be used using the regular C-style OpenCL API.
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
// features, but CLBlast can also be used using the regular C-style OpenCL API.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
@ -22,7 +22,7 @@
// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
#include <cl.hpp>
#include "cl.hpp"
// Includes the CLBlast library
#include <clblast.h>
@ -52,16 +52,16 @@ int main() {
if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
auto platform = platforms[platform_id];
// Initializes the OpenCL device (note: example for GPU devices only)
// Initializes the OpenCL device
auto devices = std::vector<cl::Device>();
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
auto device = devices[device_id];
// Creates the OpenCL context, queue, and an event
auto context = cl::Context({device});
auto queue = cl::CommandQueue(context, device);
auto event = cl::Event();
auto event = cl_event{nullptr};
// Populate host matrices with some example data
auto host_a = std::vector<float>(m*k);
@ -84,24 +84,23 @@ int main() {
// Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
auto queue_plain = queue();
auto event_plain = event();
auto status = Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
alpha,
device_a(), 0, a_ld,
device_b(), 0, b_ld,
beta,
device_c(), 0, c_ld,
&queue_plain, &event_plain);
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
alpha,
device_a(), 0, a_ld,
device_b(), 0, b_ld,
beta,
device_c(), 0, c_ld,
&queue_plain, &event);
// Record the execution time
event.wait();
clWaitForEvents(1, &event);
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
// Example completed. See "clblast.h" for status codes (0 -> success).
printf("Completed in %.3lf ms with status %d\n", time_ms, status);
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
return 0;
}

View file

@ -34,9 +34,9 @@ DEVICENAME_DEFAULT = "default"
# Attributes
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
# OpenCL vendor names and their short name
VENDOR_NAMES = { "device_vendor": {
@ -95,9 +95,18 @@ def RemoveDuplicates(df):
# Drops every database row whose "device" column equals the given device name.
def RemoveEntriesByDevice(df, devicename):
    keep_rows = df["device"] != devicename
    return df[keep_rows]
# Drops every database row whose "kernel_family" column equals the given family name.
def RemoveEntriesByKernelFamily(df, familyname):
    keep_rows = df["kernel_family"] != familyname
    return df[keep_rows]
# Selects the database rows for which the given field equals the given value.
def GetEntriesByField(df, field, value):
    selection = df[field] == value
    return df[selection]
# Updates the database in place: for every row matching the boolean 'condition' mask,
# the column 'field' is overwritten with 'value'. Returns the same (mutated) DataFrame
# object so the call can be written in assignment style.
# Example usage:
# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152")
def UpdateDatabase(df, condition, field, value):
    df.loc[condition, field] = value
    return df
# Fixes the problem that some vendors use multiple different names
def SanitizeVendorNames(df):
df = df.replace(VENDOR_NAMES)
@ -120,7 +129,7 @@ def CalculateDefaults(df):
dfdefault = pd.DataFrame()
# Defaults per type/vendor
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device"] = DEVICENAME_DEFAULT
@ -129,8 +138,14 @@ def CalculateDefaults(df):
default_values["time"] = 0.0
dfdefault = dfdefault.append(default_values, ignore_index=True)
# Checks for mis-matched arguments
groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
if len(dfgroup) != 1:
print("[WARNING] Entries for a single kernel with multiple argument values")
# Defaults in general
groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device_vendor"] = VENDOR_DEFAULT
@ -273,7 +288,6 @@ for file_json in glob.glob(glob_json):
new_size = len(database.index)
print("with "+str(new_size-old_size)+" new items")
# Stores the modified database back to disk
if len(glob.glob(glob_json)) >= 1:
print("## Storing the database to disk...")

View file

@ -22,7 +22,8 @@ D2CL = "cl_double2"
# Structure holding data-type and precision information
class DataType():
def __init__(self, name, template, scalars, buffertype):
def __init__(self, precision_name, name, template, scalars, buffertype):
self.precision_name = precision_name
self.name = name
self.template = template
self.alpha_cpp = scalars[0]
@ -57,5 +58,10 @@ class DataType():
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
# Current scalar is complex
# Returns whether the requested scalar ("alpha" or "beta") uses a complex C++ type;
# any other scalar name yields False.
def IsComplex(self, scalar):
    complex_types = [FLT2, DBL2]
    if scalar == "alpha":
        return self.alpha_cpp in complex_types
    if scalar == "beta":
        return self.beta_cpp in complex_types
    return False
# ==================================================================================================

View file

@ -8,15 +8,17 @@
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and clBLAS wrapper):
# API interface and implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cc
# clblast_c.h
# clblast_c.cc
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
# test/correctness/routines/levelX/xYYYY.cc
# test/performance/routines/levelX/xYYYY.cc
# It also produces the API documentation found in doc/clblast.md
#
# ==================================================================================================
@ -31,75 +33,89 @@ from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
# ==================================================================================================
# Regular data-types
S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
# Special cases
Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output
Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
# C++ template data-types
T = DataType("typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
# ==================================================================================================
# Populates a list of routines
routines = [
[ # Level 1: vector-vector
#Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"),
#Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"),
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"),
Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"),
Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"),
Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"),
Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"),
Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"),
Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"),
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
],
[ # Level 3: matrix-matrix
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"),
Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"),
Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"),
Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"),
Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"),
Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"),
Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"),
Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"),
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
]]
# ==================================================================================================
@ -151,7 +167,7 @@ def clblast_h(routines):
result = ""
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
result += routine.RoutineHeaderCPP(12)+";\n"
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
return result
# The C++ API implementation (.cc)
@ -161,10 +177,9 @@ def clblast_cc(routines):
indent1 = " "*(20 + routine.Length())
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
if routine.implemented:
result += routine.RoutineHeaderCPP(12)+" {\n"
result += routine.RoutineHeaderCPP(12, "")+" {\n"
result += " auto queue_cpp = Queue(*queue);\n"
result += " auto event_cpp = Event(*event);\n"
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n"
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n"
result += " auto status = routine.SetUp();\n"
result += " if (status != StatusCode::kSuccess) { return status; }\n"
result += " return routine.Do"+routine.name.capitalize()+"("
@ -175,8 +190,8 @@ def clblast_cc(routines):
result += " return StatusCode::kNotImplemented;\n"
result += "}\n"
for flavour in routine.flavours:
indent2 = " "*(23 + routine.Length() + len(flavour.template))
result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">("
indent2 = " "*(34 + routine.Length() + len(flavour.template))
result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">("
result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)])
result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
return result
@ -189,7 +204,7 @@ def clblast_c_h(routines):
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
for flavour in routine.flavours:
result += routine.RoutineHeaderC(flavour, 20)+";\n"
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
return result
# The C API implementation (.cc)
@ -200,7 +215,7 @@ def clblast_c_cc(routines):
for flavour in routine.flavours:
template = "<"+flavour.template+">" if routine.NoScalars() else ""
indent = " "*(26 + routine.Length() + len(template))
result += routine.RoutineHeaderC(flavour, 20)+" {\n"
result += routine.RoutineHeaderC(flavour, 20, "")+" {\n"
result += " auto status = clblast::"+routine.name.capitalize()+template+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)])
result += ",\n"+indent+"queue, event);"
@ -213,22 +228,68 @@ def clblast_c_cc(routines):
def wrapper_clblas(routines):
result = ""
for routine in routines:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapper(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n}\n"
if routine.has_tests:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n}\n"
return result
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
def wrapper_cblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
for flavour in routine.flavours:
indent = " "*(10 + routine.Length())
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
arguments = routine.ArgumentsWrapperC(flavour)
# Double-precision scalars
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# Special case for scalar outputs
assignment = ""
postfix = ""
endofline = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
elif output_buffer in routine.IndexBuffers():
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
indent += " "*len(assignment)
else:
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
if (flavour.name in ["Sc","Dz"]):
assignment = assignment+".real("
endofline += ")"
else:
assignment = assignment+" = "
indent += " "*len(assignment)
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+endofline+");"
result += "\n}\n"
return result
# ==================================================================================================
@ -246,9 +307,10 @@ files = [
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
]
header_lines = [84, 63, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
header_lines = [84, 71, 93, 22, 29, 41]
footer_lines = [17, 71, 19, 14, 6, 6]
# Checks whether the command-line arguments are valid; exists otherwise
for f in files:
@ -282,6 +344,8 @@ for i in xrange(0,len(files)):
body += clblast_c_cc(routines[level-1])
if i == 4:
body += wrapper_clblas(routines[level-1])
if i == 5:
body += wrapper_cblas(routines[level-1])
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
@ -291,57 +355,117 @@ for i in xrange(0,len(files)):
# Outputs all the correctness-test implementations
for level in [1,2,3]:
for routine in routines[level-1]:
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"correctness/testblas.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
not_first = "false"
for flavour in routine.flavours:
body += " clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
not_first = "true"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
if routine.has_tests:
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"correctness/testblas.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
not_first = "false"
for flavour in routine.flavours:
body += " clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
not_first = "true"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# Outputs all the performance-test implementations
for level in [1,2,3]:
for routine in routines[level-1]:
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"performance/client.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.name == precision:
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv); break;\n"
found = True
if not found:
body += " throw std::runtime_error(\"Unsupported precision mode\");\n"
body += " }\n"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
if routine.has_tests:
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"performance/client.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].precision_name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv); break;\n"
found = True
if not found:
body += " throw std::runtime_error(\"Unsupported precision mode\");\n"
body += " }\n"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# ==================================================================================================
# Outputs the API documentation
filename = path_clblast+"/doc/clblast.md"
with open(filename, "w") as f:
# Outputs the header
f.write("CLBlast: API reference\n")
f.write("================\n")
f.write("\n\n")
# Loops over the routines
for level in [1,2,3]:
for routine in routines[level-1]:
if routine.implemented:
# Routine header
f.write("x"+routine.name.upper()+": "+routine.description+"\n")
f.write("-------------\n")
f.write("\n")
f.write(routine.details+"\n")
f.write("\n")
# Routine API
f.write("C++ API:\n")
f.write("```\n")
f.write(routine.RoutineHeaderCPP(12, "")+"\n")
f.write("```\n")
f.write("\n")
f.write("C API:\n")
f.write("```\n")
for flavour in routine.flavours:
f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n")
f.write("```\n")
f.write("\n")
# Routine arguments
f.write("Arguments to "+routine.name.upper()+":\n")
f.write("\n")
for argument in routine.ArgumentsDoc():
f.write("* "+argument+"\n")
f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n")
f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.\n")
f.write("\n")
# Routine requirements
if len(routine.RequirementsDoc()) > 0:
f.write("Requirements for "+routine.name.upper()+":\n")
f.write("\n")
for requirement in routine.RequirementsDoc():
f.write("* "+requirement+"\n")
f.write("\n")
# Routine footer
f.write("\n\n")
# ==================================================================================================

View file

@ -28,7 +28,7 @@ def OptionToCLBlast(x):
}[x]
# As above, but for clBLAS data-types
def OptionToWrapper(x):
def OptionToWrapperCL(x):
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
@ -39,16 +39,38 @@ def OptionToWrapper(x):
'diagonal': "clblasDiag",
}[x]
# Buffers without 'ld' or 'inc' parameter
NO_LD_INC = ["dot","ap"]
# As above, but for CBLAS data-types
def OptionToWrapperC(x):
return {
'layout': "CBLAS_ORDER",
'a_transpose': "CBLAS_TRANSPOSE",
'b_transpose': "CBLAS_TRANSPOSE",
'ab_transpose': "CBLAS_TRANSPOSE",
'side': "CBLAS_SIDE",
'triangle': "CBLAS_UPLO",
'diagonal': "CBLAS_DIAG",
}[x]
# Translates an option name to a documentation string
def OptionToDoc(x):
return {
'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'side': "The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).",
'triangle': "The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.",
}[x]
# ==================================================================================================
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
class Routine():
def __init__(self, implemented, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description):
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
self.name = name
self.template = template
@ -60,6 +82,26 @@ class Routine():
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
self.details = details
self.requirements = requirements
# List of scalar buffers: single-element result buffers that appear before the
# alpha scalar in the argument order (reduction outputs such as dot products,
# norms, sums, and arg-max/arg-min indices).
def ScalarBuffersFirst(self):
return ["dot","nrm2","asum","sum","imax","imin"]
# Scalar buffers that appear at the end of the argument order; these are used
# by the rotation-generation routines (ROTG: sa/sb/sc/ss, ROTMG: sd1/sd2/sx1/
# sy1/sparam) -- see the corresponding C wrappers elsewhere in this commit.
def ScalarBuffersSecond(self):
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
# List of scalars other than alpha and beta (the ROT routine's cos/sin).
def OtherScalars(self):
return ["cos","sin"]
# List of buffers with unsigned int type (index results of the ixAMAX/ixMAX/
# ixMIN routines) -- BufferCladuc uses this to pick the buffer element type.
def IndexBuffers(self):
return ["imax","imin"]
# List of buffers without 'inc' or 'ld': all scalar buffers plus 'ap', the
# packed-matrix buffer, which has no leading dimension.
def BuffersWithoutLdInc(self):
return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
# Retrieves the number of characters in the routine's name
def Length(self):
@ -87,6 +129,12 @@ class Routine():
return ["ap","a","b","c"]
return ["y","c"]
# Distinguish between vectors and matrices
# Buffers that are mathematically vectors (documented with a stride/increment).
def BuffersVector(self):
return ["x","y"]
# Buffers that are mathematically matrices (documented with a leading
# dimension); 'ap' is the packed-matrix variant.
def BuffersMatrix(self):
return ["a","b","c","ap"]
# ==============================================================================================
# Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
@ -94,7 +142,7 @@ class Routine():
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
@ -104,21 +152,32 @@ class Routine():
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but as vectors
# Renders a buffer argument as a host-side std::vector reference, used by the
# CBLAS wrapper which operates on host memory instead of cl_mem. Input buffers
# get a 'const' prefix; the ld/inc argument is omitted for buffers listed in
# BuffersWithoutLdInc().
def BufferDefVector(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with Claduc buffers
def BufferCladuc(self, name):
if (name in self.inputs) or (name in self.outputs):
a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"]
buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype
a = ["Buffer<"+buffertype+">("+name+"_buffer)"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with a static cast for clBLAS wrapper
def BufferWrapper(self, name):
def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
@ -130,16 +189,47 @@ class Routine():
return [", ".join(a+b+c)]
return []
# As above but with a static cast for CBLAS wrapper
# Renders a buffer argument as CBLAS expects it: a raw host pointer with the
# offset folded in (CBLAS has no separate offset argument).
def BufferWrapperC(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
# Special case: 'sy1' is passed by value (a single element), not by pointer.
if name == "sy1":
a = [name+"_buffer["+name+"_offset]"]
elif flavour.precision_name in ["C","Z"]:
# Complex data: reinterpret the element address as a pointer to the
# underlying real type; buffertype[:-1] strips a trailing character
# (presumably e.g. "float2" -> "float" -- TODO confirm buffertype naming).
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
else:
# Plain data: address of the first element, offset applied here.
a = ["&"+name+"_buffer["+name+"_offset]"]
c = []
if (name in ["x","y"]):
# CBLAS increments are plain ints, while this codebase uses size_t.
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
elif (name in ["a","b","c"]):
c = [name+"_"+self.Postfix(name)]
return [", ".join(a+c)]
return []
# As above, but only data-types
def BufferType(self, name):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem"]
b = ["const size_t"]
c = ["const size_t"] if (name not in NO_LD_INC) else []
c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# Retrieves the documentation of the buffers
# Produces markdown bullet lines for a buffer argument: the cl_mem handle, its
# offset, and (when applicable) its ld/inc argument. Matrices are documented
# with a leading dimension, vectors with a stride/increment.
def BufferDoc(self, name):
prefix = "const " if (name in self.inputs) else ""
inout = "input" if (name in self.inputs) else "output"
if (name in self.inputs) or (name in self.outputs):
math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector"
incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment "
a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."]
b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."]
c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+"."] if (name not in self.BuffersWithoutLdInc()) else []
return a+b+c
return []
# ==============================================================================================
# Retrieves the name of a scalar (alpha/beta)
@ -168,6 +258,14 @@ class Routine():
return [name]
return []
# Retrieves the use of a scalar for CBLAS (alpha/beta)
# Complex scalars are passed as a pointer via '<name>_array.data()' (the array
# is presumably prepared by the calling template code -- confirm at call site);
# real scalars are passed by value.
def ScalarUseWrapperC(self, name, flavour):
if name in self.scalars:
if flavour.IsComplex(name):
return [name+"_array.data()"]
return [name]
return []
# Retrieves the definition of a scalar (alpha/beta)
def ScalarDef(self, name, flavour):
if name in self.scalars:
@ -192,6 +290,14 @@ class Routine():
return ["const "+flavour.beta_cpp]
return []
# Retrieves the documentation of a scalar
# 'alpha' is documented with the template's alpha type; every other scalar
# (i.e. 'beta') falls through to the beta type.
def ScalarDoc(self, name):
if name in self.scalars:
if name == "alpha":
return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."]
return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."]
return []
# ==============================================================================================
# Retrieves a list of comma-separated sizes (m, n, k)
@ -212,6 +318,13 @@ class Routine():
return [", ".join(["const size_t" for s in self.sizes])]
return []
# Retrieves the documentation of the sizes
# One markdown bullet per size argument (e.g. m/n/k), each documented as a
# plain integer size.
def SizesDoc(self):
if self.sizes:
definitions = ["`const size_t "+s+"`: Integer size argument." for s in self.sizes]
return definitions
return []
# ==============================================================================================
# Retrieves a list of options
@ -235,9 +348,16 @@ class Routine():
return []
# As above, but now using clBLAS data-types
def OptionsDefWrapper(self):
def OptionsDefWrapperCL(self):
if self.options:
definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# As above, but now using CBLAS data-types
# Renders the option arguments (layout, transposes, side, ...) using the CBLAS
# enum type names from OptionToWrapperC, joined into one comma-separated string.
def OptionsDefWrapperC(self):
if self.options:
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
@ -248,72 +368,129 @@ class Routine():
return [", ".join(definitions)]
return []
# Retrieves the documentation of the options
# Combines the CLBlast enum type name (OptionToCLBlast, defined elsewhere in
# this file) with the per-option description from OptionToDoc.
def OptionsDoc(self):
if self.options:
definitions = ["`const "+OptionToCLBlast(o)+"`: "+OptionToDoc(o) for o in self.options]
return definitions
return []
# ==============================================================================================
# Retrieves a combination of all the argument names, with Claduc casts
def ArgumentsCladuc(self, flavour, indent):
return (self.Options() + self.Sizes() + self.BufferCladuc("dot") +
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) +
self.Scalar("alpha") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) +
self.Scalar("beta") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument names, with CLBlast casts
def ArgumentsCast(self, flavour, indent):
return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") +
return (self.OptionsCast(indent) + self.Sizes() +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUse("alpha", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
self.ScalarUse("beta", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
# As above, but for the clBLAS wrapper
def ArgumentsWrapper(self, flavour):
return (self.Options() + self.Sizes() + self.BufferWrapper("dot") +
def ArgumentsWrapperCL(self, flavour):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUseWrapper("alpha", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
self.ScalarUseWrapper("beta", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
# As above, but for the CBLAS wrapper
# Note: unlike the clBLAS variant there are no leading scalar buffers here, and
# buffer offsets are folded into the host pointers by BufferWrapperC.
def ArgumentsWrapperC(self, flavour):
return (self.Options() + self.Sizes() +
self.ScalarUseWrapperC("alpha", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarUseWrapperC("beta", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument definitions
def ArgumentsDef(self, flavour):
return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") +
return (self.OptionsDef() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDef("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
self.ScalarDef("beta", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
# As above, but clBLAS wrapper plain datatypes
def ArgumentsDefWrapper(self, flavour):
return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") +
def ArgumentsDefWrapperCL(self, flavour):
return (self.OptionsDefWrapperCL() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# As above, but CBLAS wrapper plain datatypes
# Buffer arguments are declared as host-side std::vector references (see
# BufferDefVector), since the CBLAS wrapper works on host memory.
def ArgumentsDefWrapperC(self, flavour):
return (self.OptionsDefWrapperC() + self.SizesDef() +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsType(self, flavour):
return (self.OptionsType() + self.SizesType() + self.BufferType("dot") +
return (self.OptionsType() + self.SizesType() +
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarType("alpha", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) +
self.ScalarType("beta", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarType(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsDoc(self):
return (self.OptionsDoc() + self.SizesDoc() +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDoc("alpha") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) +
self.ScalarDoc("beta") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()])))
# ==============================================================================================
# Retrieves a list of routine requirements for documentation
# NOTE(review): always returns [] and does not consult self.requirements,
# which the constructor stores -- possibly a placeholder; confirm intent.
def RequirementsDoc(self):
return []
# ==============================================================================================
# Retrieves the C++ templated definition for a routine
def RoutineHeaderCPP(self, spaces):
def RoutineHeaderCPP(self, spaces, default_event):
indent = " "*(spaces + self.Length())
result = "template <"+self.template.name+">\n"
result += "StatusCode "+self.name.capitalize()+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")"
return result
# As above, but now without variable names
@ -326,15 +503,15 @@ class Routine():
return result
# As above, but now for C
def RoutineHeaderC(self, flavour, spaces):
def RoutineHeaderC(self, flavour, spaces, extra_qualifier):
indent = " "*(spaces + self.Length())
result = "StatusCode CLBlast"+flavour.name+self.name+"("
result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
return result
# As above, but now for the clBLAS wrapper
def RoutineHeaderWrapper(self, flavour, def_only, spaces):
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
indent = " "*(spaces + self.Length() + len(template))
result = ""
@ -344,9 +521,16 @@ class Routine():
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX"+self.name+template+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
# As above, but now for the CBLAS wrapper
# Builds the 'void cblasX<name>(...)' header with arguments wrapped/indented
# to align under the opening parenthesis.
# NOTE(review): 'def_only' is accepted for signature symmetry with
# RoutineHeaderWrapperCL but is unused here -- confirm this is intentional.
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
indent = " "*(spaces + self.Length())
result = "void cblasX"+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
return result
# ==================================================================================================

113
src/cache.cc Normal file
View file

@ -0,0 +1,113 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the caching functionality of compiled binaries and programs.
//
// =================================================================================================
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

#include "internal/cache.h"
namespace clblast {
namespace cache {
// =================================================================================================
// Stores the compiled binary or IR in the cache, keyed on device name,
// precision, and routine name.
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
                        const Precision &precision, const std::string &routine_name) {
  // RAII lock: the mutex is released even if push_back throws (e.g.
  // std::bad_alloc), unlike the manual lock()/unlock() pair it replaces.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
}
// Stores the compiled program in the cache, keyed on the OpenCL context
// pointer, precision, and routine name.
void StoreProgramToCache(const Program &program, const Context &context,
                         const Precision &precision, const std::string &routine_name) {
  // RAII lock: released on every exit path, including a throwing push_back.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name});
}
// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
// otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
                                      const std::string &routine_name) {
  // RAII lock replaces the manual lock()/unlock() pairs: it is released on
  // every exit path, including the throw below, and is exception-safe.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  for (auto &cached_binary: binary_cache_) {
    if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
      // NOTE(review): as in the original code, the returned reference is only
      // valid while no other thread grows binary_cache_ afterwards.
      return cached_binary.binary;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
                                   const std::string &routine_name) {
  // RAII lock: exception-safe replacement for the manual lock()/unlock() pairs.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
      // NOTE(review): as in the original code, the returned reference is only
      // valid while no other thread grows program_cache_ afterwards.
      return cached_program.program;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
                     const std::string &routine_name) {
  // RAII lock: one release point instead of an unlock on each return path.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  for (auto &cached_binary: binary_cache_) {
    if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
      return true;
    }
  }
  return false;
}
// Queries the cache to see whether or not the compiled kernel is already there
bool ProgramIsInCache(const Context &context, const Precision &precision,
                      const std::string &routine_name) {
  // RAII lock: one release point instead of an unlock on each return path.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
      return true;
    }
  }
  return false;
}
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode ClearCache() {
  { // Scoped so the two mutexes are never held at the same time (matches the
    // original lock ordering, but now exception-safe via RAII).
    std::lock_guard<std::mutex> lock(binary_cache_mutex_);
    binary_cache_.clear();
  }
  {
    std::lock_guard<std::mutex> lock(program_cache_mutex_);
    program_cache_.clear();
  }
  return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace cache
} // namespace clblast

File diff suppressed because it is too large Load diff

View file

@ -13,9 +13,7 @@
#include <string>
extern "C" {
#include "clblast_c.h"
}
#include "clblast_c.h"
#include "clblast.h"
#include "internal/utilities.h"
@ -27,6 +25,118 @@ using double2 = clblast::double2;
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// ROTG: generates the parameters of a Givens plane rotation. Each C function
// below is a thin ABI wrapper: it forwards to the templated C++ API and casts
// the returned clblast::StatusCode to the C-visible StatusCode enum.
StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotg<float>(sa_buffer, sa_offset,
sb_buffer, sb_offset,
sc_buffer, sc_offset,
ss_buffer, ss_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotg<double>(sa_buffer, sa_offset,
sb_buffer, sb_offset,
sc_buffer, sc_offset,
ss_buffer, ss_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// ROTMG: generates the parameters of a modified Givens rotation (sparam).
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
sd2_buffer, sd2_offset,
sx1_buffer, sx1_offset,
sy1_buffer, sy1_offset,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
sd2_buffer, sd2_offset,
sx1_buffer, sx1_offset,
sy1_buffer, sy1_offset,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// ROT: applies a plane rotation with the given cos/sin to vectors x and y.
// (The template argument of clblast::Rot is deduced from the scalar types.)
StatusCode CLBlastSrot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const float cos,
const float sin,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rot(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
cos,
sin,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const double cos,
const double sin,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rot(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
cos,
sin,
queue, event);
return static_cast<StatusCode>(status);
}
// ROTM: applies a modified Givens rotation (parameters in sparam) to x and y.
StatusCode CLBlastSrotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotm<float>(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotm<double>(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// SWAP
StatusCode CLBlastSswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -281,6 +391,258 @@ StatusCode CLBlastZdotc(const size_t n,
return static_cast<StatusCode>(status);
}
// NRM2: Euclidean norm of vector x, written to nrm2_buffer at nrm2_offset.
// Thin C-ABI wrappers around the templated clblast::Nrm2<T>; the S/D/Sc/Dz
// prefixes select float, double, float2, and double2 respectively.
StatusCode CLBlastSnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<float>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<double>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<float2>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDznrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<double2>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// ASUM: sum of absolute values of vector x, written to asum_buffer.
StatusCode CLBlastSasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// SUM: non-absolute version of ASUM (plain sum of the elements of x), a
// CLBlast extension to the BLAS standard (see changelog). Thin C-ABI wrappers.
StatusCode CLBlastSsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<float>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<double>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<float2>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDzsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<double2>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// AMAX: index of the element of x with the largest absolute value, written
// to imax_buffer (an index result, not an element value).
StatusCode CLBlastiSamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiDamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiCamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// MAX
StatusCode CLBlastiSmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<float>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiDmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<double>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiCmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<float2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiZmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<double2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// MIN
StatusCode CLBlastiSmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Min<float>(n,
imin_buffer, imin_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// iDMIN: index of the minimum (non-absolute) element of a double-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiDmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<double>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// iCMIN: index of the minimum (non-absolute) element of a complex single-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiCmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<float2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// iZMIN: index of the minimum (non-absolute) element of a complex double-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiZmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<double2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
@ -2022,3 +2384,15 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
}
// =================================================================================================
// Clears the cache of stored binaries
// C wrapper that empties the library's cache of previously compiled binaries.
StatusCode CLBlastClearCache() {
  const auto result = clblast::ClearCache();
  return static_cast<StatusCode>(result);
}
// Fills the cache with binaries for a specific device
// C wrapper that fills the cache with binaries for the given device.
StatusCode CLBlastFillCache(const cl_device_id device) {
  const auto result = clblast::FillCache(device);
  return static_cast<StatusCode>(result);
}
// =================================================================================================

View file

@ -40,6 +40,7 @@ R"(
typedef float16 real16;
#define ZERO 0.0f
#define ONE 1.0f
#define SMALLEST -1.0e37f
// Double-precision
#elif PRECISION == 64
@ -50,6 +51,7 @@ R"(
typedef double16 real16;
#define ZERO 0.0
#define ONE 1.0
#define SMALLEST -1.0e37
// Complex single-precision
#elif PRECISION == 3232
@ -64,6 +66,7 @@ R"(
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0f
#define ONE 1.0f
#define SMALLEST -1.0e37f
// Complex Double-precision
#elif PRECISION == 6464
@ -78,6 +81,16 @@ R"(
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0
#define ONE 1.0
#define SMALLEST -1.0e37
#endif
// Single-element version of a complex number
#if PRECISION == 3232
typedef float singlereal;
#elif PRECISION == 6464
typedef double singlereal;
#else
typedef real singlereal;
#endif
// =================================================================================================
@ -109,6 +122,13 @@ R"(
#define SetToOne(a) a = ONE
#endif
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
#else
#define AbsoluteValue(value) value = fabs(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y

View file

@ -0,0 +1,140 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xamax kernel. It implements an index of absolute max computation using
// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation.
// Each thread strides over the vector collecting a running maximum and its index; the
// work-group then tree-reduces those candidates in local memory and emits one
// (value, index) pair per work-group into maxgm/imaxgm.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction. The non-absolute MAX/MIN variants
// start from SMALLEST so that negative inputs can still win; the absolute variant starts
// from ZERO since fabs(x) >= 0.
#if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version
singlereal max = SMALLEST;
#else
singlereal max = ZERO;
#endif
unsigned int imax = 0;
int id = wgid*WGS1 + lid;
while (id < n) {
const int x_index = id*x_inc + x_offset;
// For complex data only the real component is compared
#if PRECISION == 3232 || PRECISION == 6464
singlereal x = xgm[x_index].x;
#else
singlereal x = xgm[x_index];
#endif
#if defined(ROUTINE_MAX) // non-absolute maximum version
// nothing special here
#elif defined(ROUTINE_MIN) // non-absolute minimum version
x = -x; // negate so the maximum search yields the minimum
#else
x = fabs(x); // absolute (ixAMAX) version
#endif
// NOTE(review): '>=' makes ties resolve to the last candidate scanned, whereas BLAS
// ixAMAX specifies the first occurrence -- confirm this tie-breaking is intended.
if (x >= max) {
max = x;
// Stores the raw buffer index (offset and stride folded in), not the logical 0-based
// vector index -- presumably accounted for by the caller; verify.
imax = id*x_inc + x_offset;
}
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
maxlm[lid] = max;
imaxlm[lid] = imax;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory: tree reduction, halving the stride each step
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
maxgm[wgid] = maxlm[0];
imaxgm[wgid] = imaxlm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects maxgm/imaxgm to hold 2*WGS2 per-group
// (value, index) candidates produced by the main Xamax kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data: each thread keeps the
// larger of candidates lid and lid+WGS2
if (maxgm[lid + WGS2] >= maxgm[lid]) {
maxlm[lid] = maxgm[lid + WGS2];
imaxlm[lid] = imaxgm[lid + WGS2];
}
else {
maxlm[lid] = maxgm[lid];
imaxlm[lid] = imaxgm[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: only the winning index is written; the value itself is discarded
if (lid == 0) {
imax[imax_offset] = imaxlm[0];
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,111 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xasum kernel. It implements an absolute sum computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation.
// Each thread strides over the vector accumulating (absolute) values; the work-group then
// tree-reduces the partial sums in local memory and writes one value per group to 'output'.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction
real acc;
SetToZero(acc);
int id = wgid*WGS1 + lid;
while (id < n) {
real x = xgm[id*x_inc + x_offset];
#if defined(ROUTINE_SUM) // non-absolute version
#else
// xASUM semantics: take the component-wise absolute value before accumulating
AbsoluteValue(x);
#endif
Add(acc, acc, x);
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
lm[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory: tree reduction, halving the stride each step
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
output[wgid] = lm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects 'input' to hold 2*WGS2 per-group
// partial sums produced by the main Xasum kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
Add(lm[lid], input[lid], input[lid + WGS2]);
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Computes the absolute value and stores the final result. For complex data the partial sums
// hold sum(|Re|) in .x and sum(|Im|) in .y (AbsoluteValue/Add are component-wise), so adding
// them yields the BLAS |Re|+|Im| ASUM definition in the real component.
if (lid == 0) {
#if PRECISION == 3232 || PRECISION == 6464
asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
#else
asum[asum_offset] = lm[0];
#endif
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -30,7 +30,8 @@ __kernel void Xaxpy(const int n, const real alpha,
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
real xvalue = xgm[id*x_inc + x_offset];
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
}
}
@ -45,7 +46,9 @@ __kernel void XaxpyFast(const int n, const real alpha,
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
}
}

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared,
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation.
// Each thread accumulates x * conjugate(x) (the squared magnitude) over a strided slice of the
// vector; the work-group then tree-reduces the partial sums and writes one value per group.
// The square root is applied later, in the epilogue kernel.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs multiplication and the first steps of the reduction
real acc;
SetToZero(acc);
int id = wgid*WGS1 + lid;
while (id < n) {
real x1 = xgm[id*x_inc + x_offset];
real x2 = x1;
COMPLEX_CONJUGATE(x2);
MultiplyAdd(acc, x1, x2); // acc += x * conj(x)
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
lm[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
output[wgid] = lm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects 'input' to hold 2*WGS2 per-group
// partial sums of squares produced by the main Xnrm2 kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
Add(lm[lid], input[lid], input[lid + WGS2]);
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Computes the square root and stores the final result. For complex data x*conj(x) is
// real-valued, so the squared norm lives entirely in the .x component.
if (lid == 0) {
#if PRECISION == 3232 || PRECISION == 6464
nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number
#else
nrm2[nrm2_offset] = sqrt(lm[0]);
#endif
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -29,8 +29,9 @@ __kernel void Xscal(const int n, const real alpha,
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offset];
real result;
Multiply(result, alpha, xgm[id*x_inc + x_offset]);
Multiply(result, alpha, xvalue);
xgm[id*x_inc + x_offset] = result;
}
}
@ -45,8 +46,9 @@ __kernel void XscalFast(const int n, const real alpha,
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV result;
result = MultiplyVector(result, alpha, xgm[id]);
result = MultiplyVector(result, alpha, xvalue);
xgm[id] = result;
}
}

View file

@ -11,18 +11,17 @@
//
// =================================================================================================
#include <string>
#include <vector>
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// The cache of compiled OpenCL programs
template <typename T>
std::vector<typename Routine<T>::ProgramCache> Routine<T>::program_cache_;
// Constructor: not much here, because no status codes can be returned
template <typename T>
Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
@ -43,65 +42,81 @@ Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
template <typename T>
StatusCode Routine<T>::SetUp() {
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
// be built and added to the cache.
if (!ProgramIsInCache()) {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache()) { return StatusCode::kSuccess; }
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE-754-compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
// Compiles the kernel
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache()) {
try {
auto program = Program(context_, source_string);
auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
auto options = std::vector<std::string>();
auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
// Store the compiled program in the cache
program_cache_.push_back({program, device_name_, precision_, routine_name_});
program.Build(device_, options);
StoreProgramToCache(program);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
// program will be added to the cache.
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE-754-compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
// Compiles the kernel
try {
auto program = Program(context_, source_string);
auto options = std::vector<std::string>();
auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary);
StoreProgramToCache(program);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
@ -111,7 +126,8 @@ StatusCode Routine<T>::SetUp() {
// Enqueues a kernel, waits for completion, and checks for errors
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local) {
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
// Tests for validity of the local thread sizes
if (local.size() > max_work_item_dimensions_) {
@ -135,18 +151,21 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue_, global, local, event_);
kernel.Launch(queue_, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
// Waits for completion of the kernel
try {
queue_.Finish(event_);
} catch (...) { return StatusCode::kKernelRunError; }
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// Convenience overload of RunKernel: enqueues the kernel with an empty event waiting list,
// forwarding to the full overload above.
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
                                 const std::vector<size_t> &local, EventPointer event) {
  std::vector<Event> no_dependencies;
  return RunKernel(kernel, global, local, event, no_dependencies);
}
// =================================================================================================
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
@ -156,7 +175,7 @@ StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
@ -170,7 +189,7 @@ StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
} catch (...) { return StatusCode::kInvalidMatrixB; }
@ -184,7 +203,7 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
} catch (...) { return StatusCode::kInvalidMatrixC; }
@ -212,7 +231,7 @@ StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, cons
const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
try {
auto required_size = (n*inc + offset)*data_size;
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
} catch (...) { return StatusCode::kInvalidVectorX; }
@ -226,7 +245,7 @@ StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, cons
const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
try {
auto required_size = (n*inc + offset)*data_size;
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
} catch (...) { return StatusCode::kInvalidVectorY; }
@ -248,11 +267,25 @@ StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, co
return StatusCode::kSuccess;
}
// Tests an index vector for validity: checks for a valid OpenCL buffer and for a sufficient
// buffer size (n elements of data_size bytes, starting at 'offset' elements).
// NOTE(review): on failure this reuses the 'Dot' status codes (kInsufficientMemoryDot /
// kInvalidVectorDot); no index-specific codes appear to exist -- confirm this is intentional.
template <typename T>
StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
const size_t offset, const size_t data_size) {
try {
// Required size in bytes for the requested elements plus the leading offset
auto required_size = (n + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
} catch (...) { return StatusCode::kInvalidVectorDot; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Copies or transposes a matrix and pads/unpads it with zeros
template <typename T>
StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@ -334,13 +367,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
dest_two / db_["TRA_WPT"]};
auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
else {
@ -348,13 +381,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
dest_two / db_["COPY_WPT"]};
auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
return status;
@ -363,29 +396,6 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
// =================================================================================================
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
template <typename T>
const Program& Routine<T>::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
return cached_program.program;
}
}
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}
// Queries the cache to see whether or not the compiled kernel is already there
template <typename T>
bool Routine<T>::ProgramIsInCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
}
return false;
}
// =================================================================================================
// Compiles the templated class
template class Routine<float>;
template class Routine<double>;

View file

@ -0,0 +1,112 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xamax.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The routine registers itself under the
// "Xdot" database entry, presumably sharing the reduction work-group parameters (WGS1/WGS2)
// with the dot-product routine -- TODO confirm against the tuning database.
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// The OpenCL kernel source is embedded at compile time via a C++11 raw string literal
source_string_ =
#include "../../kernels/level1/xamax.opencl"
;
}
// =================================================================================================
// The main routine: computes the index of the absolute-maximum element of vector x and
// stores it (as an unsigned int) at imax_offset in imax_buffer. Returns kSuccess on
// completion, a validation status code on bad arguments, or kInvalidKernel on any
// exception thrown during kernel creation or launch.
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
                            const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output index buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate value/index pairs,
  // and the epilogue kernel reduces those to the final result
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xamax");
    auto epilogue_kernel = Kernel(program, "XamaxEpilogue");

    // Temporary buffers holding the intermediate values and indices of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partial_values = Buffer<T>(context_, partials_size);
    auto partial_indices = Buffer<unsigned int>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partial_values());
    main_kernel.SetArgument(5, partial_indices());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partial_values());
    epilogue_kernel.SetArgument(1, partial_indices());
    epilogue_kernel.SetArgument(2, imax_buffer());
    epilogue_kernel.SetArgument(3, static_cast<int>(imax_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xamax<float>;
template class Xamax<double>;
template class Xamax<float2>;
template class Xamax<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xasum.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument.
// Maps each supported template instantiation onto the matching precision enum value.
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The {"Xdot"} list selects which set of
// tuning parameters to load from the database -- DoAsum below reads db_["WGS1"]/db_["WGS2"],
// i.e. this routine shares the Xdot reduction parameters.
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// Embeds the OpenCL kernel source at compile time: the included .opencl file is expected
// to expand to a string literal, so the '#include' must stay between the '=' and the ';'.
source_string_ =
#include "../../kernels/level1/xasum.opencl"
;
}
// =================================================================================================
// The main routine: computes the absolute-value sum of vector x and stores the scalar
// result at asum_offset in asum_buffer. Returns kSuccess on completion, a validation
// status code on bad arguments, or kInvalidKernel on any exception thrown during kernel
// creation or launch.
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
                            const Buffer<T> &asum_buffer, const size_t asum_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate partial sums,
  // and the epilogue kernel reduces those to the final result
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xasum");
    auto epilogue_kernel = Kernel(program, "XasumEpilogue");

    // Temporary buffer holding the intermediate results of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partials_buffer = Buffer<T>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partials_buffer());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partials_buffer());
    epilogue_kernel.SetArgument(1, asum_buffer());
    epilogue_kernel.SetArgument(2, static_cast<int>(asum_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xasum<float>;
template class Xasum<double>;
template class Xasum<float2>;
template class Xasum<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -29,7 +29,7 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name):
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
// Retrieves the Xaxpy kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -89,19 +89,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name):
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
// Retrieves the Xcopy kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -87,19 +87,16 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, Event &event, const std::string &name):
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xdot.opencl"
@ -59,7 +59,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Retrieves the Xdot kernels from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
@ -78,11 +78,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
status = RunKernel(kernel1, global1, local1);
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
@ -92,12 +97,9 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2);
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdotc<T>::Xdotc(Queue &queue, Event &event, const std::string &name):
Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdotu<T>::Xdotu(Queue &queue, Event &event, const std::string &name):
Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xnrm2.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument.
// Maps each supported template instantiation onto the matching precision enum value.
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The {"Xdot"} list selects which set of
// tuning parameters to load from the database -- DoNrm2 below reads db_["WGS1"]/db_["WGS2"],
// i.e. this routine shares the Xdot reduction parameters.
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// Embeds the OpenCL kernel source at compile time: the included .opencl file is expected
// to expand to a string literal, so the '#include' must stay between the '=' and the ';'.
source_string_ =
#include "../../kernels/level1/xnrm2.opencl"
;
}
// =================================================================================================
// The main routine: computes the Euclidean norm of vector x and stores the scalar result
// at nrm2_offset in nrm2_buffer. Returns kSuccess on completion, a validation status code
// on bad arguments, or kInvalidKernel on any exception thrown during kernel creation or
// launch.
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
                            const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate partial results,
  // and the epilogue kernel reduces those to the final norm
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xnrm2");
    auto epilogue_kernel = Kernel(program, "Xnrm2Epilogue");

    // Temporary buffer holding the intermediate results of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partials_buffer = Buffer<T>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partials_buffer());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partials_buffer());
    epilogue_kernel.SetArgument(1, nrm2_buffer());
    epilogue_kernel.SetArgument(2, static_cast<int>(nrm2_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xnrm2<float>;
template class Xnrm2<double>;
template class Xnrm2<float2>;
template class Xnrm2<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -29,7 +29,7 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name):
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -60,7 +60,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
// Retrieves the Xscal kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -81,19 +81,16 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name):
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
// Retrieves the Xswap kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -87,19 +87,16 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}

View file

@ -29,7 +29,7 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
@ -136,7 +136,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Retrieves the Xgemv kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -162,12 +162,9 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xger<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
@ -66,7 +66,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Retrieves the Xgemv kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments
@ -89,12 +89,9 @@ StatusCode Xger<T>::DoGer(const Layout layout,
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}

Some files were not shown because too many files have changed in this diff Show more