Merge pull request #127 from CNugteren/development

Update to version 0.10.0
pull/156/head 0.10.0
Cedric Nugteren 2016-11-27 15:59:21 +01:00 committed by GitHub
commit e52f9a9ff2
278 changed files with 17037 additions and 8161 deletions

View File

@ -1,4 +1,21 @@
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
- Fixed a bug in the TRMM routine that would overwrite input data before consuming everything
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times
- Added an option to build a static version of the library
- Made it possible to use the command-line environmental vars everywhere and without re-running CMake
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
Version 0.9.0
- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
- Significantly improved the performance of rotated GEMV computations

View File

@ -18,14 +18,16 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 9)
set(clblast_VERSION_MINOR 10)
set(clblast_VERSION_PATCH 0)
# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@ -64,12 +66,24 @@ elseif(MSVC)
endif()
endif()
# DLL Settings
if(MSVC)
if(BUILD_SHARED_LIBS)
add_definitions(" /DCLBLAST_DLL")
endif()
endif(MSVC)
# C++ compiler settings
if(MSVC)
set(FLAGS "/Ox")
set(FLAGS "${FLAGS} /wd4715")
else()
set(FLAGS "-O3 -std=c++11")
set(FLAGS "-std=c++11")
if(VERBOSE)
set(FLAGS "${FLAGS} -O1 -g")
else()
set(FLAGS "${FLAGS} -O3")
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
@ -134,9 +148,13 @@ endif()
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@ -151,12 +169,16 @@ set(PRECISIONS 32 64 3232 6464 16)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/utilities.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/utilities.cpp
)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
endif()
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
endforeach()
@ -171,7 +193,12 @@ foreach(ROUTINE ${LEVELX_ROUTINES})
endforeach()
# Creates and links the library
add_library(clblast SHARED ${SOURCES})
if(BUILD_SHARED_LIBS)
add_library(clblast SHARED ${SOURCES})
else(BUILD_SHARED_LIBS)
add_library(clblast STATIC ${SOURCES})
endif()
target_link_libraries(clblast ${OPENCL_LIBRARIES})
# Includes directories: CLBlast and OpenCL
@ -183,7 +210,9 @@ target_include_directories(clblast PUBLIC
# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
if(MSVC)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
if(BUILD_SHARED_LIBS)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
endif()
endif()
# Installs the library
@ -191,19 +220,19 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
if(NETLIB)
install(FILES include/clblast_netlib_c.h DESTINATION include)
endif()
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
# ==================================================================================================
# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
set(DEVICEPLATFORM )
if(DEFINED ENV{CLBLAST_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
endif()
if(DEFINED ENV{CLBLAST_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
# Install pkg-config file on Linux
if(UNIX)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
"${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
DESTINATION lib/pkgconfig)
endif()
# ==================================================================================================
@ -239,7 +268,7 @@ if(TUNERS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON )
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
# Adds tuning executables
@ -255,7 +284,7 @@ if(TUNERS)
set(ALLTUNERSDEPENDS )
foreach(KERNEL ${KERNELS})
foreach(PRECISION ${PRECISIONS})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION} ${DEVICEPLATFORM})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION})
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
@ -272,9 +301,10 @@ if(CLIENTS OR TESTS)
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
find_package(Threads)
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
@ -283,7 +313,7 @@ if(CLIENTS OR TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
@ -301,7 +331,7 @@ if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(CLIENTS_COMMON )
if(MSVC)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cpp)
@ -309,7 +339,7 @@ if(CLIENTS)
# Adds CLBlast's interface include paths because we can't link to CLBlast here
target_include_directories(test_performance_common PRIVATE
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
${clblast_SOURCE_DIR} ${REF_INCLUDES})
set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
endif()
@ -348,7 +378,7 @@ if(TESTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TESTS_COMMON )
if(MSVC)
set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
test/correctness/tester.cpp test/correctness/testblas.cpp)
else()
# Creates the common correctness-tests objects (requires CMake 2.8.8)
@ -356,7 +386,7 @@ if(TESTS)
test/correctness/tester.cpp test/correctness/testblas.cpp)
target_include_directories(test_correctness_common PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
${clblast_SOURCE_DIR} ${REF_INCLUDES})
set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
endif()
@ -381,14 +411,14 @@ if(TESTS)
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )
foreach(ROUTINE ${ROUTINES})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE})
set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})

20
CONTRIBUTING.md 100644
View File

@ -0,0 +1,20 @@
CLBlast: Contributing guidelines
================
For information about the CLBlast library, see the [README](README.md) file instead.
Tuning results
-------------
A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available to post new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one of the tuners on your device (or all perhaps?), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue (archived in a .ZIP file).
Code improvements and additions
-------------
Pull requests are welcome as long as they:
* Contain unit additions or modifications
* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use an indentation width of 2 spaces and a max-width of 100 characters.
* Are made against the `development` branch.

207
LICENSE
View File

@ -1,14 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
Copyright (c) 2015 Cedric Nugteren
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
1. Definitions.
http://www.apache.org/licenses/LICENSE-2.0
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2015 Cedric Nugteren
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
- MSVC (Visual Studio) 2013 or newer
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK
@ -74,6 +74,10 @@ A custom installation folder can be specified when calling CMake:
cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..
Building a static version of the library instead of a shared one (.dylib/.so/.dll) can be done by disabling the `BUILD_SHARED_LIBS` option when calling CMake. For example:
cmake -DBUILD_SHARED_LIBS=OFF ..
Using the library
-------------
@ -90,6 +94,12 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
cmake -DSAMPLES=ON ..
Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
Using the tuners (optional)
-------------
@ -105,8 +115,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX Titan
- GeForce GTX Titan X
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
@ -115,10 +126,12 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Oland
- Pitcairn
- Tahiti
- Tonga
* Intel GPUs:
- HD Graphics 530
- HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics 5500 BroadWell U-Processor GT2
- HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics IvyBridge M GT2
- HD Graphics Skylake ULT GT2
- Iris
- Iris Pro
@ -134,9 +147,9 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
cmake -DTUNERS=ON ..
Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.6.0 or higher).
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
@ -168,7 +181,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. Further options can be supplied through the `CLBLAST_ARGUMENTS` environmental variable (e.g. export CLBLAST_ARGUMENTS="-full_test -cblas 1 -clblas 0" on a UNIX system).
Compiling the performance tests/clients (optional)
@ -277,11 +290,11 @@ The `samples/haxpy.c` example shows how to use these convenience functions when
Contributing
-------------
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
The contributing authors (code, pull requests, testing) so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl) - main author
* [Cedric Nugteren](http://cnugteren.github.io) - main author
* [Anton Lokhmotov](https://github.com/psyhtest)
* [Dragan Djuric](https://github.com/blueberry)
* [Marco Hutter](http://marco-hutter.de/)
@ -289,6 +302,7 @@ The contributing authors (code, pull requests, testing) so far are:
* [Gian-Carlo Pascutto](https://github.com/gcp)
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
* [Shehzan Mohammed](https://shehzan10.github.io)
Tuning and testing on a variety of OpenCL devices was made possible by:
@ -296,9 +310,10 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
Support us
-------------
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://cnugteren.github.io).

10
clblast.pc.in 100644
View File

@ -0,0 +1,10 @@
# pkg-config template for CLBlast. The @VAR@ placeholders are substituted by
# CMake's configure_file(... @ONLY) at configure time (see CMakeLists.txt),
# producing clblast.pc which is installed into lib/pkgconfig on UNIX systems.
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
includedir=${prefix}/include
libdir=${exec_prefix}/lib
# Package metadata plus the compile and link flags consumers of the library need.
Name: CLBlast
Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
Libs: -L${libdir} -lclblast
Cflags: -I${includedir}

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -27,8 +27,8 @@
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#ifdef COMPILING_DLL
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
@ -46,14 +46,34 @@ enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CL_SUCCESS
kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kInvalidValue = -30, // CL_INVALID_VALUE
kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
kInvalidBinary = -42, // CL_INVALID_BINARY
kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
kInvalidProgram = -44, // CL_INVALID_PROGRAM
kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
kInvalidKernel = -48, // CL_INVALID_KERNEL
kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
kInvalidEvent = -58, // CL_INVALID_EVENT
kInvalidOperation = -59, // CL_INVALID_OPERATION
kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
@ -75,13 +95,14 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types

File diff suppressed because it is too large Load Diff

View File

@ -25,6 +25,11 @@
#include <CL/opencl.h>
#endif
// MSVC 2013 doesn't fully support C99
#ifdef _MSC_VER
#define inline __inline
#endif
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,

View File

@ -0,0 +1,920 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer
// copies automatically and running on the default OpenCL platform and device. For full control over
// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
//
// =================================================================================================
#ifndef CLBLAST_CLBLAST_NETLIB_C_H_
#define CLBLAST_CLBLAST_NETLIB_C_H_
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
// The C interface
#ifdef __cplusplus
extern "C" {
#endif
// =================================================================================================
// Matrix layout and transpose types. The numeric values (101..142) match the Netlib CBLAS
// numbering, which is what makes the CBLAS aliases defined below direct drop-in replacements.
typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
CLBlastLayoutColMajor = 102 } CLBlastLayout;
typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
CLBlastTransposeConjugate = 113 } CLBlastTranspose;
typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
CLBlastTriangleLower = 122 } CLBlastTriangle;
typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
// For full compatibility with CBLAS: alias the CLBlast enums and constants to the standard
// CBLAS type and constant names, so code written against a CBLAS header compiles unchanged.
typedef CLBlastLayout CBLAS_ORDER;
typedef CLBlastTranspose CBLAS_TRANSPOSE;
typedef CLBlastTriangle CBLAS_UPLO;
typedef CLBlastDiagonal CBLAS_DIAG;
typedef CLBlastSide CBLAS_SIDE;
#define CblasRowMajor CLBlastLayoutRowMajor
#define CblasColMajor CLBlastLayoutColMajor
#define CblasNoTrans CLBlastTransposeNo
#define CblasTrans CLBlastTransposeYes
#define CblasConjTrans CLBlastTransposeConjugate
#define CblasUpper CLBlastTriangleUpper
#define CblasLower CLBlastTriangleLower
#define CblasNonUnit CLBlastDiagonalNonUnit
#define CblasUnit CLBlastDiagonalUnit
#define CblasLeft CLBlastSideLeft
#define CblasRight CLBlastSideRight
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate Givens plane rotation: SROTG/DROTG
void PUBLIC_API cblas_srotg(float* sa,
float* sb,
float* sc,
float* ss);
void PUBLIC_API cblas_drotg(double* sa,
double* sb,
double* sc,
double* ss);
// Generate modified Givens plane rotation: SROTMG/DROTMG
void PUBLIC_API cblas_srotmg(float* sd1,
float* sd2,
float* sx1,
const float sy1,
float* sparam);
void PUBLIC_API cblas_drotmg(double* sd1,
double* sd2,
double* sx1,
const double sy1,
double* sparam);
// Apply Givens plane rotation: SROT/DROT
void PUBLIC_API cblas_srot(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
const float cos,
const float sin);
void PUBLIC_API cblas_drot(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
const double cos,
const double sin);
// Apply modified Givens plane rotation: SROTM/DROTM
void PUBLIC_API cblas_srotm(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
float* sparam);
void PUBLIC_API cblas_drotm(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
double* sparam);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
void PUBLIC_API cblas_sswap(const int n,
float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_dswap(const int n,
double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_cswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
void PUBLIC_API cblas_sscal(const int n,
const float alpha,
float* x, const int x_inc);
void PUBLIC_API cblas_dscal(const int n,
const double alpha,
double* x, const int x_inc);
void PUBLIC_API cblas_cscal(const int n,
const void* alpha,
void* x, const int x_inc);
void PUBLIC_API cblas_zscal(const int n,
const void* alpha,
void* x, const int x_inc);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
void PUBLIC_API cblas_scopy(const int n,
const float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_dcopy(const int n,
const double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_ccopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zcopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
void PUBLIC_API cblas_saxpy(const int n,
const float alpha,
const float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_daxpy(const int n,
const double alpha,
const double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_caxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zaxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc);
// Dot product of two vectors: SDOT/DDOT/HDOT
float PUBLIC_API cblas_sdot(const int n,
const float* x, const int x_inc,
const float* y, const int y_inc);
double PUBLIC_API cblas_ddot(const int n,
const double* x, const int x_inc,
const double* y, const int y_inc);
// Dot product of two complex vectors: CDOTU/ZDOTU
void PUBLIC_API cblas_cdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
void PUBLIC_API cblas_zdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
void PUBLIC_API cblas_cdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
void PUBLIC_API cblas_zdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
float PUBLIC_API cblas_snrm2(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dnrm2(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scnrm2(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dznrm2(const int n,
const void* x, const int x_inc);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
float PUBLIC_API cblas_sasum(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dasum(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scasum(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dzasum(const int n,
const void* x, const int x_inc);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
float PUBLIC_API cblas_ssum(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dsum(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scsum(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dzsum(const int n,
const void* x, const int x_inc);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
int PUBLIC_API cblas_isamax(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idamax(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icamax(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izamax(const int n,
const void* x, const int x_inc);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
int PUBLIC_API cblas_ismax(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idmax(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icmax(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izmax(const int n,
const void* x, const int x_inc);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
int PUBLIC_API cblas_ismin(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idmin(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icmin(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izmin(const int n,
const void* x, const int x_inc);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* ap,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* ap,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* ap,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* ap,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* ap,
float* x, const int x_inc);
void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* ap,
double* x, const int x_inc);
void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* ap,
float* x, const int x_inc);
void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* ap,
double* x, const int x_inc);
void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
// General rank-1 matrix update: SGER/DGER/HGER
void PUBLIC_API cblas_sger(const CLBlastLayout layout,
const int m, const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dger(const CLBlastLayout layout,
const int m, const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
// General rank-1 complex matrix update: CGERU/ZGERU
void PUBLIC_API cblas_cgeru(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zgeru(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
void PUBLIC_API cblas_cgerc(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zgerc(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// Hermitian rank-1 matrix update: CHER/ZHER
void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const void* x, const int x_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const void* x, const int x_inc,
void* a, const int a_ld);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const void* x, const int x_inc,
void* ap);
void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const void* x, const int x_inc,
void* ap);
// Hermitian rank-2 matrix update: CHER2/ZHER2
void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap);
void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap);
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
double* a, const int a_ld);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
float* ap);
void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
double* ap);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* ap);
void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* ap);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
// NOTE(review): the real-precision variants (s/d) take alpha/beta by value,
// while the complex variants (c/z) receive them through const void* pointers.
void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
// Complex-only routines; all scalar and matrix arguments are passed as void pointers.
void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
// Only matrix A is read; matrix C is updated in place (non-const pointer).
void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
// NOTE(review): unlike SYRK, alpha and beta here are real-valued (float/double)
// even though the matrices themselves are complex (void pointers).
void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const float alpha,
const void* a, const int a_ld,
const float beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const double alpha,
const void* a, const int a_ld,
const double beta,
void* c, const int c_ld);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
// Reads both A and B (sharing one transpose flag, 'ab_transpose'); updates C in place.
void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
// NOTE(review): alpha is complex (void pointer) but beta is real-valued here.
void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const float beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const double beta,
void* c, const int c_ld);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
// Matrix B is both input and output (non-const pointer): it is overwritten with the result.
void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// As with TRMM, the solution overwrites the right-hand side matrix B in place.
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
// Reads matrix A and writes the scaled/optionally-transposed result into matrix B.
void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// =================================================================================================
// Close of the extern "C" block (opened earlier in this header) and the include guard.
#ifdef __cplusplus
} // extern "C"
#endif
// CLBLAST_CLBLAST_NETLIB_C_H_
#endif

View File

@ -106,14 +106,16 @@ void run_example_routine(const cl_device_id device) {
clock_t start = clock();
// Calls an example routine
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Retrieves the execution time
clock_t diff = clock() - start;

View File

@ -74,18 +74,20 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
// Call the DGEMV routine.
StatusCode status = CLBlastDgemv(kRowMajor, kNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed DGEMV with status %d\n", status);

View File

@ -71,14 +71,16 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
// Call the HAXPY routine.
StatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

View File

@ -67,14 +67,16 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
// Call the SASUM routine.
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

View File

@ -77,18 +77,21 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
// Call the SGEMM routine.
StatusCode status = CLBlastSgemm(kRowMajor, kNo, kNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
CLBlastTransposeNo, CLBlastTransposeNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);

View File

@ -95,8 +95,10 @@ int main() {
&queue_plain, &event);
// Record the execution time
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == clblast::StatusCode::kSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();

View File

@ -0,0 +1,69 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
// recommended if you want full control over performance: it will internally copy buffers from and
// to the OpenCL device.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (Netlib CBLAS interface)
#include <clblast_netlib_c.h>
// =================================================================================================
// Example use of the single-precision routine SGEMM
int main(void) {
  // Problem dimensions and scalars for the SGEMM example
  const int m = 128;
  const int n = 64;
  const int k = 512;
  const float alpha = 0.7f;
  const float beta = 1.0f;

  // Leading dimensions of the row-major matrices
  const int a_ld = k;
  const int b_ld = n;
  const int c_ld = n;

  // Allocate the host matrices
  const int a_elems = m * k;
  const int b_elems = n * k;
  const int c_elems = m * n;
  float* a_host = (float*)malloc(a_elems * sizeof(float));
  float* b_host = (float*)malloc(b_elems * sizeof(float));
  float* c_host = (float*)malloc(c_elems * sizeof(float));

  // Fill the matrices with some example data
  for (int idx = 0; idx < a_elems; ++idx) { a_host[idx] = 12.193f; }
  for (int idx = 0; idx < b_elems; ++idx) { b_host[idx] = -8.199f; }
  for (int idx = 0; idx < c_elems; ++idx) { c_host[idx] = 0.0f; }

  // Run SGEMM through the Netlib CBLAS-compatible interface; buffers are
  // copied to and from the OpenCL device internally by the library.
  cblas_sgemm(CLBlastLayoutRowMajor,
              CLBlastTransposeNo, CLBlastTransposeNo,
              m, n, k,
              alpha,
              a_host, a_ld,
              b_host, b_ld,
              beta,
              c_host, c_ld);

  // Example completed
  printf("Completed SGEMM\n");

  // Release the host memory
  free(a_host);
  free(b_host);
  free(c_host);
  return 0;
}
// =================================================================================================

View File

@ -18,7 +18,7 @@ import database.bests as bests
import database.defaults as defaults
# Server storing a copy of the database
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json"
# OpenCL vendor names and their short name
VENDOR_TRANSLATION_TABLE = {

View File

@ -54,19 +54,20 @@ def get_cpp_header(family):
//
// This file populates the database with best-found tuning parameters for the '%s' kernels.
//\n"""
% family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator())
% family.title() + get_cpp_separator() + \
"\n\nnamespace clblast {\n" + "namespace database {\n" + get_cpp_separator())
def get_cpp_footer():
"""Retrieves the C++ footer"""
return "\n} // namespace clblast\n"
return "\n} // namespace database\n" + "} // namespace clblast\n"
def get_cpp_precision(family, precision):
"""Retrieves the C++ code for the start of a new precision"""
precision_string = precision_to_string(precision)
camelcase_name = family.title().replace("_", "")
return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
return("\n\nconst Database::DatabaseEntry %s%s = {\n \"%s\", Precision::k%s, {\n"
% (camelcase_name, precision_string, camelcase_name, precision_string))

View File

@ -5,6 +5,8 @@
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import ast
from collections import defaultdict
import clblast
import bests
@ -137,6 +139,10 @@ def get_smallest_best_parameters(group):
return min_parameters
def get_parameter_names(section):
    """Collects the parameter set of every result entry in a database section."""
    names = []
    for entry in section["results"]:
        names.append(entry["parameters"])
    return names
def get_common_best_parameters(group, group_identifier, verbose):
"""Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case
not every device was tuned with the same parameters. In that case it falls back to the above method to retrieve
@ -154,19 +160,48 @@ def get_common_best_parameters(group, group_identifier, verbose):
result["relative_performance"] = minimum_time / result["time"]
# Determine which parameters are available for all devices
common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section
common_parameters = get_parameter_names(group[0]) # Parameters of the first section
for i in range(1, num_devices):
section_parameters = [result["parameters"] for result in group[i]["results"]]
section_parameters = get_parameter_names(group[i])
common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters
# Fall back to another method in case there are no shared entries at all across devices
if len(common_parameters) == 0:
if verbose:
print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices)
smallest_best_parameters = get_smallest_best_parameters(group)
print("[database] No common kernels for: " + str(group_identifier) + " across all %d devices " % num_devices)
# Computes the amount of devices with shared parameters
parameters_count = defaultdict(int)
for i in range(0, num_devices):
for parameters in get_parameter_names(group[i]):
parameters_count[str(parameters)] += 1
num_devices_common = max(parameters_count.values())
# Fall back method in case there are no shared entries at all across devices
if num_devices_common == 1:
print("[database] Warning: No common kernels for: " + str(group_identifier) + " at all")
smallest_best_parameters = get_smallest_best_parameters(group)
if verbose:
print("[database] " + str(group_identifier))
return smallest_best_parameters
# Checks if perhaps there are many more shared parameters with a bit fewer devices
num_parameters_common = defaultdict(int)
for count in parameters_count.values():
if count != 1:
num_parameters_common[str(count)] += 1
if num_parameters_common[str(num_devices_common - 1)] > num_parameters_common[str(num_devices_common)]:
num_devices_common -= 1
if verbose:
print("[database] " + str(group_identifier))
return smallest_best_parameters
print("[database] Found %d common kernels for: " % num_parameters_common[str(num_devices_common)] +
str(group_identifier) + " across %d out of %d devices " % (num_devices_common, num_devices))
# Populates the common parameters
for parameters_string in parameters_count.keys():
count = parameters_count[parameters_string]
if count == num_devices_common:
parameters = ast.literal_eval(parameters_string)
common_parameters.append(parameters)
# Removes entries with parameters which are not common
common_results = []

180
scripts/generator/generator.py 100644 → 100755
View File

@ -12,6 +12,8 @@
# clblast.cpp
# clblast_c.h
# clblast_c.cpp
# clblast_netlib_c.h
# clblast_netlib_c.cpp
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
@ -29,9 +31,18 @@ import generator.doc as doc
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
HEADER_LINES = [96, 73, 97, 22, 29, 41]
FOOTER_LINES = [17, 75, 19, 14, 6, 6]
FILES = [
"/include/clblast.h",
"/src/clblast.cpp",
"/include/clblast_c.h",
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -48,70 +59,105 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas
cld_m = "The value of `c_ld` must be at least `m`."
cld_n = "The value of `c_ld` must be at least `n`."
# Helper functions to compute vector and matrix sizes
def size_helper(condition, size_one, size_two, multiplier):
    """Builds a C ternary-expression string that picks between two buffer sizes,
    each scaled by 'multiplier', based on 'condition'."""
    return "({}) ? {} * {} : {} * {}".format(condition, size_one, multiplier,
                                             size_two, multiplier)
def layout_transpose_condition(prefix):
    """Builds the C condition string for when a matrix with transpose flag
    '<prefix>_transpose' is effectively transposed given the storage layout."""
    template = ("(layout == CLBlastLayoutColMajor && {0}_transpose != CLBlastTransposeNo) || "
                "(layout == CLBlastLayoutRowMajor && {0}_transpose == CLBlastTransposeNo)")
    return template.format(prefix)
# Different possibilities for the vector and matrix sizes
xn = "n * x_inc"
xm = "m * x_inc"
yn = "n * y_inc"
ym = "m * y_inc"
an = "n * a_ld"
apn = "((n*(n+1)) / 2)"
cn = "n * c_ld"
xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc")
ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc")
amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld")
amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld")
amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld")
ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld")
ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld")
bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld")
bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld")
bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld")
bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld")
cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld")
ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld")
bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld")
# ==================================================================================================
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
@ -124,33 +170,23 @@ def main(argv):
cl_args = parser.parse_args(argv)
library_root = cl_args.clblast_root
# Sets all the files the output
files = [
library_root + "/include/clblast.h",
library_root + "/src/clblast.cpp",
library_root + "/include/clblast_c.h",
library_root + "/src/clblast_c.cpp",
library_root + "/test/wrapper_clblas.hpp",
library_root + "/test/wrapper_cblas.hpp",
]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
if not os.path.isfile(f):
for f in FILES:
if not os.path.isfile(library_root + f):
print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
sys.exit()
# Iterates over all regular files to output
for i in range(0, len(files)):
for i in range(0, len(FILES)):
# Stores the header and the footer of the original file
with open(files[i]) as f:
with open(library_root + FILES[i]) as f:
original = f.readlines()
file_header = original[:HEADER_LINES[i]]
file_footer = original[-FOOTER_LINES[i]:]
# Re-writes the body of the file
with open(files[i], "w") as f:
with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
for level in levels:
@ -168,6 +204,10 @@ def main(argv):
body += cpp.wrapper_clblas(routine)
if i == 5:
body += cpp.wrapper_cblas(routine)
if i == 6:
body += cpp.clblast_netlib_c_h(routine)
if i == 7:
body += cpp.clblast_netlib_c_cc(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))

View File

@ -45,17 +45,18 @@ def clblast_h(routine):
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
indent1 = " " * (20 + routine.length())
indent1 = " " * (15 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " auto status = routine.SetUp();" + NL
result += " if (status != StatusCode::kSuccess) { return status; }" + NL
result += " return routine.Do" + routine.name.capitalize() + "("
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
result += " } catch (...) { return DispatchException(); }" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
@ -72,7 +73,7 @@ def clblast_c_h(routine):
"""The C API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
return result
@ -81,12 +82,89 @@ def clblast_c_cc(routine):
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (26 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 20, "") + " {" + NL
result += " auto status = clblast::" + routine.name.capitalize() + template + "("
indent = " " * (16 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 27, "") + " {" + NL
result += " try {" + NL
result += " return static_cast<CLBlastStatusCode>(" + NL
result += " clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event);"
result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
result += "," + NL + indent + "queue, event)" + NL
result += " );" + NL
result += " } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
result += "}" + NL
return result
def clblast_netlib_c_h(routine):
    """Generates the Netlib CBLAS-compatible API header (.h) section for one routine."""
    # One comment line describing the routine, then one declaration per precision.
    header = NL + "// " + routine.description + ": " + routine.short_names() + NL
    # The Netlib CBLAS API only exists for the four standard precisions.
    declarations = [
        routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL
        for flavour in routine.flavours
        if flavour.precision_name in ["S", "D", "C", "Z"]
    ]
    return header + "".join(declarations)
def clblast_netlib_c_cc(routine):
    """The Netlib CBLAS API implementation (.cpp): for each CBLAS-supported
    precision, emits a C function that sets up OpenCL, copies host data into
    device buffers, calls the CLBlast C++ API, and copies results back."""
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:
        # There is a version available in CBLAS
        if flavour.precision_name in ["S", "D", "C", "Z"]:
            template = "<" + flavour.template + ">" if routine.no_scalars() else ""
            # NOTE(review): name_postfix is computed but never used in this body;
            # presumably the "_sub" naming for dotu/dotc is handled elsewhere -- confirm.
            name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else ""
            indent = " " * (21 + routine.length() + len(template))
            result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
            # Initialize OpenCL
            result += " auto device = get_device();" + NL
            result += " auto context = clblast::Context(device);" + NL
            result += " auto queue = clblast::Queue(context, device);" + NL
            # Set alpha and beta
            result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour))
            # Copy data structures to the device
            for i, name in enumerate(routine.inputs + routine.outputs):
                result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL
            for i, name in enumerate(routine.inputs + routine.outputs):
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.create_buffer(name, buffer_type) + NL
                # By-value scalar arguments are wrapped in a one-element host array first
                if name in routine.scalar_buffers_second_non_pointer():
                    result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
            for name in routine.inputs + routine.outputs:
                if name not in routine.scalar_buffers_first():
                    prefix = "" if name in routine.outputs else "const "
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + routine.write_buffer(name, prefix + buffer_type) + NL
            # The function call
            result += " auto queue_cl = queue();" + NL
            result += " auto s = clblast::" + routine.name.capitalize() + template + "("
            result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)])
            result += "," + NL + indent + "&queue_cl);" + NL
            # Error handling
            result += " if (s != clblast::StatusCode::kSuccess) {" + NL
            result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL
            result += " }" + NL
            # Copy back and clean-up
            for name in routine.outputs:
                # Scalar results need a host-side array to read back into
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL
            for name in routine.outputs:
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.read_buffer(name, buffer_type) + NL
            for name in routine.outputs:
                # Scalar-returning routines return the value directly; complex
                # (non-index) results return the real part only
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    result += " return " + name + "[0]"
                    if flavour.buffer_type in ["float2", "double2"]:
                        if name not in routine.index_buffers():
                            result += ".real()"
                    result += ";" + NL
            result += "}" + NL
    return result
@ -218,8 +296,9 @@ def performance_test(routine, level_string):
result += "using double2 = clblast::double2;" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);" + NL
default = convert.precision_to_full_name(routine.flavours[0].precision_name)
result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL
result += " switch(clblast::GetPrecision(command_line_args, clblast::Precision::k" + default + ")) {" + NL
for precision in ["H", "S", "D", "C", "Z"]:
result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":"
found = False

View File

@ -54,6 +54,22 @@ class DataType:
return self.beta_cl + "{{beta.real(), beta.imag()}}"
return "beta"
def use_alpha_clblast(self):
    """Transforms a Netlib CBLAS 'alpha' parameter to CLBlast style"""
    # Complex scalars cross the C boundary as void*; rebuild them here as a
    # two-element brace-initializer of the CLBlast complex type.
    cast_types = {D_FLOAT2: "float", D_DOUBLE2: "double"}
    if self.alpha_cpp in cast_types:
        c_type = cast_types[self.alpha_cpp]
        components = ["reinterpret_cast<const " + c_type + "*>(alpha)[" + str(i) + "]" for i in (0, 1)]
        return self.alpha_cpp + "{" + ", ".join(components) + "}"
    return "alpha"
def use_beta_clblast(self):
    """As use_alpha_clblast, but for the 'beta' scalar instead"""
    # Same void* -> brace-initializer unpacking as for alpha.
    cast_types = {D_FLOAT2: "float", D_DOUBLE2: "double"}
    if self.beta_cpp in cast_types:
        c_type = cast_types[self.beta_cpp]
        components = ["reinterpret_cast<const " + c_type + "*>(beta)[" + str(i) + "]" for i in (0, 1)]
        return self.beta_cpp + "{" + ", ".join(components) + "}"
    return "beta"
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
if self.buffer_type != self.beta_cpp:
@ -65,6 +81,10 @@ class DataType:
return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or
(scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2]))
def is_non_standard(self):
    """True when the buffer type is not a plain C scalar type (half or complex)."""
    non_standard = (D_HALF, D_FLOAT2, D_DOUBLE2)
    return any(self.buffer_type == candidate for candidate in non_standard)
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)

View File

@ -32,7 +32,7 @@ def generate(routine):
result += "C API:" + NL
result += "```" + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 20, "") + NL
result += routine.routine_header_c(flavour, 27, "") + NL
result += "```" + NL + NL
# Routine arguments

View File

@ -13,7 +13,8 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
inputs, outputs, buffer_sizes, scalars, scratch,
description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
@ -24,6 +25,7 @@ class Routine:
self.options = options
self.inputs = inputs
self.outputs = outputs
self.buffer_sizes = buffer_sizes
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
@ -40,6 +42,11 @@ class Routine:
"""List of scalar buffers"""
return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]
@staticmethod
def scalar_buffers_second_non_pointer():
    """Scalar buffers that are passed by value rather than as pointers in the Netlib API."""
    by_value_buffers = ["sy1"]
    return by_value_buffers
@staticmethod
def other_scalars():
"""List of scalars other than alpha and beta"""
@ -65,6 +72,34 @@ class Routine:
"""Distinguish between vectors and matrices"""
return ["a", "b", "c", "ap"]
@staticmethod
def routines_scalar_no_return():
    # Routines whose Netlib CBLAS form does not return its scalar result
    # directly: these get a "_sub" name postfix and write through an output
    # argument instead (see clblast_netlib_c_cc).
    return ["dotu", "dotc"]
@staticmethod
def set_size(name, size):
"""Sets the size of a buffer"""
return "const auto " + name + "_size = " + size + ";"
@staticmethod
def create_buffer(name, template):
"""Creates a new CLCudaAPI buffer"""
return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);"
def write_buffer(self, name, template):
    """Emits a C++ line copying host data into the device buffer for 'name'."""
    # By-value scalars were wrapped into a one-element '<name>_vec' array first.
    suffix = "_vec" if name in self.scalar_buffers_second_non_pointer() else ""
    host_pointer = "reinterpret_cast<{0}*>({1}{2})".format(template, name, suffix)
    return "{0}_buffer.Write(queue, {0}_size, {1});".format(name, host_pointer)
@staticmethod
def read_buffer(name, template):
"""Reads from a CLCudaAPI buffer"""
data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");"
def non_index_inputs(self):
"""Lists of input/output buffers not index (integer)"""
buffers = self.inputs[:] # make a copy
@ -85,6 +120,11 @@ class Routine:
"""List of buffers without 'inc' or 'ld'"""
return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"]
def get_buffer_type(self, name, flavour):
    # Returns the element type backing buffer 'name': index (integer) buffers
    # are plain 'int', everything else uses the flavour's buffer type.
    if name in self.index_buffers():
        return "int"
    return flavour.buffer_type
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
@ -133,6 +173,15 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_zero_offset(self, name):
    """Retrieves the buffer argument list for 'name' with its offset hard-coded to zero."""
    if name not in self.inputs and name not in self.outputs:
        return []
    parts = [name + "_buffer()", "0"]
    # The ld/inc argument is omitted for buffers that do not carry one (e.g. packed 'ap').
    if name not in self.buffers_without_ld_inc():
        parts.append(name + "_" + self.postfix(name))
    return [", ".join(parts)]
def buffer_def(self, name):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
@ -163,6 +212,17 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_def_pointer(self, name, flavour):
    """Retrieves the definition of buffer 'name' as a plain C pointer (Netlib CBLAS style)."""
    if name not in self.inputs and name not in self.outputs:
        return []
    # Inputs are read-only; non-standard element types (half/complex) pass as void*.
    qualifier = "const " if name in self.inputs else ""
    element_type = "void" if flavour.is_non_standard() else flavour.buffer_type
    star = "" if name in self.scalar_buffers_second_non_pointer() else "*"
    pieces = [qualifier + element_type + star + " " + name]
    if name not in self.buffers_without_ld_inc():
        pieces.append("const int " + name + "_" + self.postfix(name))
    return [", ".join(pieces)]
def buffer_clcudaapi(self, name):
"""As above but with CLCudaAPI buffers"""
if name in self.inputs or name in self.outputs:
@ -238,6 +298,12 @@ class Routine:
return [name]
return []
def scalar_cpp(self, name):
"""As above, but with _cpp as a suffix"""
if name in self.scalars:
return [name + "_cpp"]
return []
def scalar_half_to_float(self, name):
"""As above, but converts from float to half"""
if name in self.scalars:
@ -288,6 +354,16 @@ class Routine:
return ["const " + flavour.beta_cpp + " " + name]
return []
def scalar_def_void(self, name, flavour):
    """Retrieves the definition of a scalar (alpha/beta), rendered as a void
    pointer for complex flavours (the C API cannot pass complex by value)."""
    if name not in self.scalars:
        return []
    if name == "alpha":
        data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp
    else:
        data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp
    return ["const " + data_type + " " + name]
def scalar_type(self, name, flavour):
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
@ -304,6 +380,16 @@ class Routine:
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return []
def scalar_create_cpp(self, flavour):
    """Emits C++ statements converting the alpha/beta arguments of the C API
    into typed '<name>_cpp' constants, one statement per known scalar."""
    statements = []
    for scalar in self.scalars:
        if scalar not in ("alpha", "beta"):
            continue  # only alpha and beta have conversion expressions
        value = flavour.use_alpha_clblast() if scalar == "alpha" else flavour.use_beta_clblast()
        statements.append("const auto " + scalar + "_cpp = " + value + ";")
    return statements
def sizes_list(self):
"""Retrieves a list of comma-separated sizes (m, n, k)"""
if self.sizes:
@ -316,6 +402,12 @@ class Routine:
return [", ".join(["const size_t " + s for s in self.sizes])]
return []
def sizes_def_netlib(self):
    """Retrieves the definition of the sizes (m/n/k) for the Netlib CBLAS API,
    which uses plain 'int' rather than 'size_t'."""
    if not self.sizes:
        return []
    return [", ".join("const int " + size for size in self.sizes)]
def sizes_type(self):
"""Retrieves the types of the sizes (m,n,k)"""
if self.sizes:
@ -349,6 +441,13 @@ class Routine:
return [", ".join(definitions)]
return []
def options_def_c(self):
    """Retrieves the definitions of the layout/transpose/etc. options for the
    C API, using the 'CLBlast'-prefixed enum type names to avoid clashes."""
    if not self.options:
        return []
    definitions = ["const CLBlast" + convert.option_to_clblast(option) + " " + option
                   for option in self.options]
    return [", ".join(definitions)]
def options_def_wrapper_clblas(self):
"""As above, but now using clBLAS data-types"""
if self.options:
@ -421,6 +520,17 @@ class Routine:
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
def arguments_netlib(self, flavour, indent):
"""As above, but for the Netlib CBLAS API"""
# Argument order mirrors the other arguments_* helpers: options, sizes, then
# the alpha / buffers / beta sequence. All buffer offsets are hard-coded to
# zero, since Netlib CBLAS has no offset arguments, and alpha/beta use the
# '_cpp' converted constants produced by scalar_create_cpp.
return (self.options_cast(indent) + self.sizes_list() +
list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) +
self.scalar_cpp("alpha") +
list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) +
self.scalar_cpp("beta") +
list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
def arguments_wrapper_clblas(self, flavour):
"""As above, but for the clBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
@ -453,6 +563,30 @@ class Routine:
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_netlib(self, flavour):
"""As above, but for the Netlib CBLAS API"""
# Buffers become plain C pointers and alpha/beta become void pointers for
# complex flavours (see buffer_def_pointer / scalar_def_void).
result=(self.options_def_c() + self.sizes_def_netlib() +
self.scalar_def_void("alpha", flavour) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_void("beta", flavour) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
# Unlike arguments_def_c, the scalar output buffers are appended at the END
# and only for routines in routines_scalar_no_return(): for the remaining
# routines the scalar result is the function's return value instead
# (see routine_header_netlib).
if self.name in self.routines_scalar_no_return():
result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
return result
def arguments_def_c(self, flavour):
"""As above, but for the C API"""
# Same argument order as the C++ API, but using the CLBlast-prefixed enum
# option types (options_def_c) so the generated header stays clash-free.
return (self.options_def_c() + self.sizes_def() +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) +
self.scalar_def("alpha", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) +
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
return (self.options_def_wrapper_clblas() + self.sizes_def() +
@ -523,11 +657,30 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
def routine_header_netlib(self, flavour, spaces, extra_qualifier):
"""As above, but now for the original Netlib CBLAS API"""
# Determine the return type: 'void' by default, but
# - routines that output an index buffer return an 'int'
# - routines that output a scalar return it by value, unless listed in
#   routines_scalar_no_return(), in which case they keep 'void' and get a
#   '_sub' name suffix (Netlib's cblas_*_sub convention)
return_type = "void"
for output in self.outputs:
if output in self.index_buffers():
return_type = "int"
break
if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return():
# Strips the '2' from the buffer type -- NOTE(review): presumably maps
# a complex type name back to its real-valued base; confirm against
# the flavour definitions
return_type = flavour.buffer_type.replace("2", "")
break
# Indent continuation lines to line up with the opening parenthesis
indent = " " * (spaces + len(return_type) + self.length())
routine_name = self.name
if self.name in self.routines_scalar_no_return():
routine_name += "_sub"
indent += " "
result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
return result
def routine_header_wrapper_clblas(self, flavour, def_only, spaces):
"""As above, but now for the clBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""

View File

@ -17,7 +17,6 @@ purplish = "#550077" # [ 85, 0,119] lumi=26
blueish = "#4765b1" # [ 71,101,177] lumi=100
redish = "#d67568" # [214,117,104] lumi=136
greenish = "#9bd4ca" # [155,212,202] lumi=199
colourset = c(blueish, redish, greenish, purplish)
# Sets the graph markers (circles, triangles, etc.)
pchs = c(15, 18, 17, 12)
@ -31,11 +30,14 @@ options("width"=170)
# ==================================================================================================
# Constants
num_runs <- 4
# Settings
num_runs <- 5
num_runs_short <- 50
xtics_subset_threshold <- 100
xtics_subset_stepsize <- 8
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv -cblas 0"
library_names <- c("CLBlast", "clBLAS")
# Command-line arguments
command_line <- commandArgs(trailingOnly=TRUE)
@ -50,6 +52,19 @@ device_id <- command_line[2]
devices_values <- c(platform_id, device_id)
devices_string <- paste(devices, devices_values, collapse=" ")
# Filters the raw client output: only lines containing a ";" are kept, since
# the benchmark clients print their results as ";"-separated CSV (consumed by
# read.csv below); other status/log lines are dropped. Returns NULL when no
# lines match (result_string starts out as c(), i.e. NULL).
filter_string <- function(raw_result_string) {
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <-
c(result_string, line)
}
}
return(result_string)
}
# ==================================================================================================
# The main function
@ -65,12 +80,28 @@ main <- function(routine_name, precision, test_names, test_values,
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./clblast_client_", routine_name, sep="")
# Display
library_names <- c("CLBlast", "clBLAS")
if (precision == 16) { library_names <- c("CLBlast FP16", "CLBlast FP32", "clBLAS FP32"); }
colourset <- c(blueish, redish)
if (precision == 16) { colourset <- c(blueish, purplish, redish); }
# Configures the outputfile
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
file_name <- paste(display_name, ".pdf", sep="")
if (length(test_names) == 6) {
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
else { # length(test_names) == 2
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 1))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
# Loops over the test-cases
for (test_id in 1:length(test_names)) {
@ -84,19 +115,32 @@ main <- function(routine_name, precision, test_names, test_values,
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
# Filter the string: only lines containing a ";" can be valid lines
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <-
c(result_string, line)
}
}
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")
# For half-precision: also runs the FP32 version for comparison
if (precision == 16) {
params_string <- gsub("-precision 16", "-precision 32", params_string)
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db_32 <- read.csv(text=result_string, sep=";")
stopifnot(nrow(command_db) == nrow(command_db_32))
# Combines the results
command_db["ms_FP32_1"] = command_db_32$ms_1
command_db["GFLOPS_FP32_1"] = command_db_32$GFLOPS_1
command_db["GBs_FP32_1"] = command_db_32$GBs_1
command_db["ms_FP32_2"] = command_db_32$ms_2
command_db["GFLOPS_FP32_2"] = command_db_32$GFLOPS_2
command_db["GBs_FP32_2"] = command_db_32$GBs_2
}
# Append the results to the final dataframe
if (command_id == 1) {
db <- command_db
@ -120,22 +164,36 @@ main <- function(routine_name, precision, test_names, test_values,
# Plots the graph with GFLOPS on the Y-axis
if (metric_gflops) {
plot_graph(xdata=xdata, ydata=list(db$GFLOPS_1, db$GFLOPS_2), log_setting=log_scale,
if (precision == 16) {
ydata = list(db$GFLOPS_1, db$GFLOPS_FP32_1, db$GFLOPS_FP32_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_FP32_1), max(db$GFLOPS_FP32_2))
} else {
ydata = list(db$GFLOPS_1, db$GFLOPS_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=max(max(db$GFLOPS_1),max(db$GFLOPS_2)),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=50, experiment_names=library_names)
multiple=50, experiment_names=library_names, colourset=colourset)
# Plots the graph with GB/s on the Y-axis
} else {
plot_graph(xdata=xdata, ydata=list(db$GBs_1, db$GBs_2), log_setting=log_scale,
if (precision == 16) {
ydata = list(db$GBs_1, db$GBs_FP32_1, db$GBs_FP32_2)
ymax = max(max(db$GBs_1), max(db$GBs_FP32_1), max(db$GBs_FP32_2))
} else {
ydata = list(db$GBs_1, db$GBs_2)
ymax = max(max(db$GBs_1), max(db$GBs_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=max(max(db$GBs_1),max(db$GBs_2)),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=10, experiment_names=library_names)
multiple=10, experiment_names=library_names, colourset=colourset)
}
}
}
@ -147,7 +205,7 @@ plot_graph <- function(xdata, ydata, log_setting,
xmin, xmax, ymin, ymax,
xtics, xlabel, ylabel,
graph_title,
multiple, experiment_names) {
multiple, experiment_names, colourset) {
# Update the ymax to the next multiple of something
ymax <- multiple*ceiling(ymax/multiple)
@ -169,7 +227,12 @@ plot_graph <- function(xdata, ydata, log_setting,
main="", xlab="", ylab="",
ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
axis(side=2, las=2)
axis(side=1, at=xdata, labels=xtics, las=2)
if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth
subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
} else {
axis(side=1, at=xdata, labels=xtics, las=2)
}
title(xlab=xlabel, line=-1)
title(ylab=ylabel, line=2)
title(graph_title, line=-2)

View File

@ -0,0 +1,56 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
#
# ==================================================================================================
# Includes the common functions
# (locates this script's own path so common.r can be sourced relative to it)
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
# Client arguments, in order; the vectors in 'test_values' below supply one
# value per entry of this list
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"small matrices in steps of 16",
"small matrices in steps of 1"
)
# Defines the test-cases
# Each vector: m, n, k, layout, transA, transB, num_steps, step, runs, precision.
# NOTE(review): layout 102 / trans 111 presumably follow the CBLAS enum
# encoding (column-major, no-transpose) -- confirm against the client options.
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)),
list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision))
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", "")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
)
)

View File

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
)
)

View File

@ -1,121 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
// templated and thus header-only.
//
// =================================================================================================
#ifndef CLBLAST_BUFFER_TEST_H_
#define CLBLAST_BUFFER_TEST_H_
#include "clblast.h"
namespace clblast {
// =================================================================================================
// Tests matrix 'A' for validity: checks the leading dimension and whether the
// OpenCL buffer is large enough to hold the matrix at the given offset.
template <typename T>
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
// The leading dimension may not be smaller than the first dimension
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
}
// Tests matrix 'B' for validity: same checks as TestMatrixA, but returning the
// B-specific status codes.
template <typename T>
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
} catch (...) { return StatusCode::kInvalidMatrixB; }
return StatusCode::kSuccess;
}
// Tests matrix 'C' for validity: same checks as TestMatrixA, but returning the
// C-specific status codes.
template <typename T>
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
} catch (...) { return StatusCode::kInvalidMatrixC; }
return StatusCode::kSuccess;
}
// Tests matrix 'AP' for validity: a packed triangular matrix of order n, which
// stores n*(n+1)/2 elements; there is no leading-dimension check. Note that
// failures are reported with the 'A' status codes.
template <typename T>
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Tests vector 'X' for validity: the increment may not be zero and the buffer
// must hold (n - 1) strides plus one element, starting at the offset.
template <typename T>
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
} catch (...) { return StatusCode::kInvalidVectorX; }
return StatusCode::kSuccess;
}
// Tests vector 'Y' for validity: same checks as TestVectorX, but returning the
// Y-specific status codes.
template <typename T>
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
} catch (...) { return StatusCode::kInvalidVectorY; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Tests vector 'scalar' for validity: a unit-stride output vector of n scalar
// results (no increment argument).
template <typename T>
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
}
// Tests vector 'index' for validity. NOTE(review): failures deliberately(?)
// reuse the 'Scalar' status codes (kInsufficientMemoryScalar /
// kInvalidVectorScalar) -- there are no index-specific codes here.
template <typename T>
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace clblast
// CLBLAST_BUFFER_TEST_H_
#endif

View File

@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
}
}
binary_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
}
}
program_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
}
// Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode CacheClearAll() {
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -18,7 +18,7 @@
#include <vector>
#include <mutex>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode CacheClearAll();
void CacheClearAll();
// =================================================================================================
} // namespace clblast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -12,8 +12,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
// contains the following header copyright notice:
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
@ -41,30 +41,52 @@
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
#include <stdexcept> // std::runtime_error
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
// OpenCL
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Error occurred in the C++11 OpenCL header (this file)
inline void Error(const std::string &message) {
throw std::runtime_error("Internal OpenCL error: "+message);
}
// Represents a runtime error returned by an OpenCL API function. Wraps the raw
// cl_int status together with the name of the failing call.
class CLError : public ErrorCode<DeviceError, cl_int> {
public:
explicit CLError(cl_int status, const std::string &where):
ErrorCode(status,
where,
"OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
}
// Throws a CLError if 'status' indicates failure; used by the CheckError macro
static void Check(const cl_int status, const std::string &where) {
if (status != CL_SUCCESS) {
throw CLError(status, where);
}
}
// Destructor-safe variant: logs to stderr instead of throwing, since an
// exception escaping a destructor would terminate the program
static void CheckDtor(const cl_int status, const std::string &where) {
if (status != CL_SUCCESS) {
fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
}
}
};
// =================================================================================================
// Error occurred in OpenCL
inline void CheckError(const cl_int status) {
if (status != CL_SUCCESS) {
throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
}
}
#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
// Error occurred in OpenCL (no-exception version for destructors)
#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
// =================================================================================================
@ -81,7 +103,7 @@ class Event {
// Regular constructor with memory management
explicit Event():
event_(new cl_event, [](cl_event* e) {
if (*e) { CheckError(clReleaseEvent(*e)); }
if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
delete e;
}) {
*event_ = nullptr;
@ -92,19 +114,18 @@ class Event {
CheckError(clWaitForEvents(1, &(*event_)));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
// Retrieves the elapsed time of the last recorded event.
// (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
// However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
float GetElapsedTime() const {
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
auto time_end = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return (time_end - time_start) * 1.0e-6f;
const auto bytes = sizeof(cl_ulong);
auto time_start = cl_ulong{0};
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
auto time_end = cl_ulong{0};
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
@ -132,10 +153,14 @@ class Platform {
explicit Platform(const size_t platform_id) {
auto num_platforms = cl_uint{0};
CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
if (num_platforms == 0) { Error("no platforms found"); }
if (num_platforms == 0) {
throw RuntimeError("Platform: no platforms found");
}
if (platform_id >= num_platforms) {
throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
}
auto platforms = std::vector<cl_platform_id>(num_platforms);
CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
platform_ = platforms[platform_id];
}
@ -152,6 +177,17 @@ class Platform {
cl_platform_id platform_;
};
// Builds and returns a list containing every OpenCL platform available on this system.
// Throws (via CheckError / the Platform constructor) if the platform query fails.
inline std::vector<Platform> GetAllPlatforms() {
  auto platform_count = cl_uint{0};
  CheckError(clGetPlatformIDs(0, nullptr, &platform_count));
  auto result = std::vector<Platform>();
  result.reserve(static_cast<size_t>(platform_count));
  for (auto id = size_t{0}; id < static_cast<size_t>(platform_count); ++id) {
    result.emplace_back(id);
  }
  return result;
}
// =================================================================================================
// C++11 version of 'cl_device_id'
@ -164,11 +200,16 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -201,8 +242,8 @@ class Device {
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
cl_ulong LocalMemSize() const {
return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
unsigned long LocalMemSize() const {
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const {
@ -238,9 +279,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
Vendor() == "AuthenticAMD";; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@ -271,7 +314,8 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
return result;
}
};
@ -289,11 +333,11 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateContext");
}
// Accessor to the private data-member
@ -318,18 +362,18 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
CheckError(status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
@ -339,25 +383,16 @@ class Program {
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
}
// Compiles the device program and returns whether or not there where any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
void Build(const Device &device, std::vector<std::string> &options) {
options.push_back("-cl-std=CL1.1");
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
if (status == CL_BUILD_PROGRAM_FAILURE) {
return BuildStatus::kError;
}
else if (status == CL_INVALID_BINARY) {
return BuildStatus::kInvalid;
}
else {
CheckError(status);
return BuildStatus::kSuccess;
}
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Retrieves the warning/error message from the compiler (if any)
@ -405,24 +440,11 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
size_t ocl_version = device.VersionNumber();
if (ocl_version >= 200)
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
CheckError(status);
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
}
// Synchronizes the queue
@ -514,7 +536,7 @@ class Buffer {
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
auto status = CL_SUCCESS;
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateBuffer");
}
// As above, but now with read/write access as a default
@ -535,18 +557,24 @@ class Buffer {
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
@ -566,8 +594,12 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
@ -606,8 +638,7 @@ class Buffer {
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
auto bytes = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
@ -634,10 +665,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CheckError(status);
CLError::Check(status, "clCreateKernel");
}
// Sets a kernel argument at the indicated position
@ -658,17 +689,16 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
cl_ulong LocalMemUsage(const Device &device) const {
auto bytes = size_t{0};
unsigned long LocalMemUsage(const Device &device) const {
const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
return result;
return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
std::string GetFunctionName() {
std::string GetFunctionName() const {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
@ -689,6 +719,7 @@ class Kernel {
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
const std::vector<Event> &waitForEvents) {
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {

View File

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Ivan Shapovalov <intelfx@intelfx.name>
//
// This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the
// CLCudaAPI project. See 'clpp11.hpp' for more details.
//
// =================================================================================================
#ifndef CLBLAST_CXPP11_COMMON_H_
#define CLBLAST_CXPP11_COMMON_H_
#include <string> // std::string
#include <stdexcept> // std::runtime_error
namespace clblast {
// =================================================================================================
// Basic exception class: represents an error that happened inside our own code
// (as opposed to an error in the C++ runtime). 'Base' is the standard exception type
// to derive from (e.g. std::runtime_error or std::logic_error); all constructor
// arguments are forwarded to it unchanged.
template <typename Base>
class Error : public Base {
 public:
  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
  template <typename... Args>
  Error(Args&&... args):
      Base(std::forward<Args>(args)...) {
  }
};
// =================================================================================================
// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  // Strips a call string such as "clCreateBuffer(...)" down to the bare function name by
  // dropping everything from the first '(' onwards; if there is no '(', the input is returned
  // unchanged. Implemented with std::string::find rather than strchr: this header includes
  // <string> and <stdexcept> but not <cstring>, so the original strchr call relied on a
  // transitive include and was not guaranteed to compile.
  static std::string TrimCallString(const char *where) {
    const auto where_string = std::string(where);
    const auto paren_position = where_string.find('(');
    return where_string.substr(0, paren_position);  // npos -> returns the whole string
  }
};
// =================================================================================================
// Represents a generic runtime error (aka environmental problem)
class RuntimeError : public Error<std::runtime_error> {
 public:
  // Builds the what() message by prefixing the caller-supplied reason with a fixed tag
  explicit RuntimeError(const std::string &reason):
      Error<std::runtime_error>(std::string("Run-time error: ") + reason) {
  }
};
// =================================================================================================
// Represents a generic logic error (aka failed assertion)
class LogicError : public Error<std::logic_error> {
 public:
  // Builds the what() message by prefixing the caller-supplied reason with a fixed tag
  explicit LogicError(const std::string &reason):
      Error<std::logic_error>(std::string("Internal logic error: ") + reason) {
  }
};
// =================================================================================================
// Internal exception base class with a status field and a subclass-specific "details" field
// which can be used to recreate an exception. 'Base' is the exception type to derive from,
// 'Status' is the raw error-code type (e.g. an OpenCL status enum or integer).
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  // 'status' is the raw error code, 'details' is extra context kept verbatim (so the
  // exception can be reconstructed later), and 'reason' becomes the what() message.
  ErrorCode(Status status, const std::string &details, const std::string &reason):
      Base(reason),
      status_(status),
      details_(details) {
  }

  // Raw status code exactly as passed to the constructor
  Status status() const {
    return status_;
  }

  // Extra context string exactly as passed to the constructor
  const std::string& details() const {
    return details_;
  }

 private:
  const Status status_;
  const std::string details_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_CXPP11_COMMON_H_
#endif

View File

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "database/database.hpp"
#include "database/kernels/xaxpy.hpp"
@ -21,27 +21,42 @@
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
// Initializes the database
const std::vector<Database::DatabaseEntry> Database::database = {
XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
&database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
&database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
};
// The OpenCL device vendors
const std::string Database::kDeviceVendorAll = "default";
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string, std::string> Database::kVendorNames{
{ "Intel(R) Corporation", "Intel" },
{ "GenuineIntel", "Intel" },
{ "Advanced Micro Devices, Inc.", "AMD" },
{ "NVIDIA Corporation", "NVIDIA" },
};
// =================================================================================================
@ -49,7 +64,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision, const std::vector<DatabaseEntry> &overlay):
const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
parameters_{} {
// Finds information of the current device
@ -69,15 +84,15 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
for (auto &kernel: kernels) {
auto search_result = ParametersPtr{};
for (auto db: { &overlay, &database }) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
for (auto &db: { database, overlay}) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
}
}
if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
}
@ -100,17 +115,17 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
const std::vector<DatabaseEntry> &this_database) const {
const std::vector<const DatabaseEntry*> &this_database) const {
// Selects the right kernel
for (auto &db: this_database) {
if (db.kernel == this_kernel && db.precision == this_precision) {
if (db->kernel == this_kernel && db->precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
for (auto &vendor: db->vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
(vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.

View File

@ -21,11 +21,24 @@
#include <vector>
#include <unordered_map>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
// A special namespace to hold all the global constant variables (including the database entries)
namespace database {
// The OpenCL device types
const std::string kDeviceTypeCPU = "CPU";
const std::string kDeviceTypeGPU = "GPU";
const std::string kDeviceTypeAccelerator = "accelerator";
const std::string kDeviceTypeAll = "default";
} // namespace database
// =================================================================================================
// See comment at top of file for a description of the class
class Database {
public:
@ -36,54 +49,32 @@ class Database {
// Structures for content inside the database
struct DatabaseDevice {
const std::string name;
const Parameters parameters;
std::string name;
Parameters parameters;
};
struct DatabaseVendor {
const std::string type;
const std::string name;
const std::vector<DatabaseDevice> devices;
std::string type;
std::string name;
std::vector<DatabaseDevice> devices;
};
struct DatabaseEntry {
const std::string kernel;
const Precision precision;
const std::vector<DatabaseVendor> vendors;
std::string kernel;
Precision precision;
std::vector<DatabaseVendor> vendors;
};
// The OpenCL device types
static constexpr auto kDeviceTypeCPU = "CPU";
static constexpr auto kDeviceTypeGPU = "GPU";
static constexpr auto kDeviceTypeAccelerator = "accelerator";
static constexpr auto kDeviceTypeAll = "default";
// The OpenCL device vendors
static constexpr auto kDeviceVendorAll = "default";
static const std::string kDeviceVendorAll;
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string,std::string> kVendorNames {
{"Intel(R) Corporation", "Intel"},
{"GenuineIntel", "Intel"},
{"Advanced Micro Devices, Inc.", "AMD"},
{"NVIDIA Corporation", "NVIDIA"},
};
static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
static const std::vector<const DatabaseEntry*> database;
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
const Precision precision, const std::vector<DatabaseEntry> &overlay);
const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@ -95,7 +86,8 @@ class Database {
// Search method for a specified database, returning pointer (possibly a nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
const Precision this_precision,
const std::vector<const DatabaseEntry*> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;

View File

@ -0,0 +1,136 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
// some common default values.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
// Direct-vs-indirect GEMM switching point for half precision. Per this file's header comment,
// sizes below "XGEMM_MIN_INDIRECT_SIZE" use the direct single-kernel implementation, larger
// sizes the in-direct kernel with pre/post-processing. Values are written as products
// (e.g. 384*384*384), presumably an m*n*k volume -- confirm against the kernel-selection code.
const Database::DatabaseEntry KernelSelectionHalf = {
  "KernelSelection", Precision::kHalf, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for single precision (same thresholds and
// structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionSingle = {
  "KernelSelection", Precision::kSingle, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for complex single precision (same thresholds
// and structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionComplexSingle = {
  "KernelSelection", Precision::kComplexSingle, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for double precision (same thresholds and
// structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionDouble = {
  "KernelSelection", Precision::kDouble, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for complex double precision (same thresholds
// and structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionComplexDouble = {
  "KernelSelection", Precision::kComplexDouble, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::CopyHalf = {
const Database::DatabaseEntry CopyHalf = {
"Copy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::CopySingle = {
const Database::DatabaseEntry CopySingle = {
"Copy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::CopySingle = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyComplexSingle = {
const Database::DatabaseEntry CopyComplexSingle = {
"Copy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -128,7 +133,8 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@ -147,8 +153,9 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -165,7 +172,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
const Database::DatabaseEntry CopyDouble = {
"Copy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -174,6 +181,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
@ -205,18 +213,19 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@ -224,7 +233,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyComplexDouble = {
const Database::DatabaseEntry CopyComplexDouble = {
"Copy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -233,6 +242,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -264,9 +274,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -282,4 +293,5 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::PadHalf = {
const Database::DatabaseEntry PadHalf = {
"Pad", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::PadSingle = {
const Database::DatabaseEntry PadSingle = {
"Pad", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::PadSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadComplexSingle = {
const Database::DatabaseEntry PadComplexSingle = {
"Pad", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@ -134,7 +139,8 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
@ -154,13 +160,14 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Default
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
const Database::DatabaseEntry PadDouble = {
"Pad", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,7 +189,8 @@ const Database::DatabaseEntry Database::PadDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -216,6 +224,7 @@ const Database::DatabaseEntry Database::PadDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::PadDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadComplexDouble = {
const Database::DatabaseEntry PadComplexDouble = {
"Pad", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,7 +250,8 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeHalf = {
const Database::DatabaseEntry PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeSingle = {
const Database::DatabaseEntry PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -64,6 +66,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -87,6 +90,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
const Database::DatabaseEntry PadtransposeComplexSingle = {
"Padtranspose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -134,6 +139,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -157,6 +163,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeDouble = {
const Database::DatabaseEntry PadtransposeDouble = {
"Padtranspose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,6 +189,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -216,6 +224,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
const Database::DatabaseEntry PadtransposeComplexDouble = {
"Padtranspose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,6 +250,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeHalf = {
const Database::DatabaseEntry TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeSingle = {
const Database::DatabaseEntry TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -87,6 +90,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexSingle = {
const Database::DatabaseEntry TransposeComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
@ -134,7 +139,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@ -151,6 +157,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -159,7 +166,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@ -167,7 +174,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeDouble = {
const Database::DatabaseEntry TransposeDouble = {
"Transpose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -176,6 +183,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
@ -207,9 +215,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -226,7 +235,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexDouble = {
const Database::DatabaseEntry TransposeComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -235,7 +244,8 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -263,6 +273,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -278,4 +289,5 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyHalf = {
const Database::DatabaseEntry XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpySingle = {
const Database::DatabaseEntry XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
@ -95,7 +99,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
}
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
const Database::DatabaseEntry XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -134,10 +139,11 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -157,6 +163,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyDouble = {
const Database::DatabaseEntry XaxpyDouble = {
"Xaxpy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,6 +189,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
@ -213,18 +221,19 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
}
},
}
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
const Database::DatabaseEntry XaxpyComplexDouble = {
"Xaxpy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,6 +250,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XdotHalf = {
const Database::DatabaseEntry XdotHalf = {
"Xdot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotSingle = {
const Database::DatabaseEntry XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -41,6 +42,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
@ -55,7 +57,8 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",128} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
@ -68,7 +71,9 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
@ -84,7 +89,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
const Database::DatabaseEntry XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -92,7 +97,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -106,7 +112,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",256} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
@ -119,7 +126,9 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
@ -127,7 +136,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
}
@ -135,7 +144,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotDouble = {
const Database::DatabaseEntry XdotDouble = {
"Xdot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -143,7 +152,8 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -160,10 +170,12 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
{ "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
@ -176,7 +188,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexDouble = {
const Database::DatabaseEntry XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -184,6 +196,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
@ -201,7 +214,9 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
@ -216,4 +231,5 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,13 +12,20 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmHalf = {
const Database::DatabaseEntry XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -26,7 +33,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmSingle = {
const Database::DatabaseEntry XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -35,7 +42,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -57,10 +65,11 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel accelerators
@ -77,18 +86,19 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -96,7 +106,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmComplexSingle = {
const Database::DatabaseEntry XgemmComplexSingle = {
"Xgemm", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,7 +115,8 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -127,10 +138,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Intel accelerators
@ -147,18 +159,19 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
}
@ -166,7 +179,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
const Database::DatabaseEntry XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -175,7 +188,8 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -189,7 +203,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -206,18 +220,19 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -225,7 +240,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmComplexDouble = {
const Database::DatabaseEntry XgemmComplexDouble = {
"Xgemm", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -234,7 +249,8 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -265,21 +281,23 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -0,0 +1,154 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in half precision (fp16). Entries are grouped per device vendor; each device
// name maps to its tuned parameter set, and each vendor group carries a
// "default" entry used as fallback for unlisted devices of that vendor.
// NOTE(review): the semantics of the keys (WGD, MDIMAD/MDIMCD, NDIMBD/NDIMCD,
// KWID, PADA/PADB, VWMD/VWND) are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file — confirm against the tuner sources.
const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in single precision (fp32). Grouped per vendor, with per-vendor and global
// "default" fallback entries for devices/vendors not explicitly listed.
// NOTE(review): key semantics (WGD, MDIMAD/MDIMCD, NDIMBD/NDIMCD, KWID,
// PADA/PADB, VWMD/VWND) come from the XgemmDirect kernel/tuner, not visible
// here — confirm against the tuner sources before relying on them.
const Database::DatabaseEntry XgemmDirectSingle = {
"XgemmDirect", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in complex single precision. Grouped per vendor, with per-vendor and global
// "default" fallback entries for devices/vendors not explicitly listed.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectComplexSingle = {
"XgemmDirect", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in double precision (fp64). Grouped per vendor, with per-vendor and global
// "default" fallback entries. Note: no Intel GPU group here, unlike the other
// precisions — fp64 results fall through to the kDeviceTypeAll default on Intel.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectDouble = {
"XgemmDirect", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in complex double precision. Grouped per vendor, with per-vendor and global
// "default" fallback entries. As with the fp64 table, there is no Intel GPU
// group for this precision.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectComplexDouble = {
"XgemmDirect", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvHalf = {
const Database::DatabaseEntry XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvSingle = {
const Database::DatabaseEntry XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@ -57,10 +59,11 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel accelerators
@ -77,9 +80,10 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -88,7 +92,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
}
@ -96,7 +100,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
const Database::DatabaseEntry XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,6 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -120,6 +125,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1} } },
@ -140,8 +146,9 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -155,7 +162,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvDouble = {
const Database::DatabaseEntry XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -164,6 +171,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
@ -188,9 +196,10 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -207,7 +216,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexDouble = {
const Database::DatabaseEntry XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -216,6 +225,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -249,4 +259,5 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastHalf = {
const Database::DatabaseEntry XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastSingle = {
const Database::DatabaseEntry XgemvFastSingle = {
"XgemvFast", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -57,10 +59,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
{ "default", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
}
},
{ // Intel accelerators
@ -77,9 +80,10 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -96,7 +100,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
const Database::DatabaseEntry XgemvFastComplexSingle = {
"XgemvFast", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,6 +109,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -120,7 +125,8 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -139,7 +145,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -153,7 +158,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastDouble = {
const Database::DatabaseEntry XgemvFastDouble = {
"XgemvFast", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -162,6 +167,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -186,9 +192,10 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -205,7 +212,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
const Database::DatabaseEntry XgemvFastComplexDouble = {
"XgemvFast", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -214,6 +221,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -247,4 +255,5 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,13 +12,20 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotHalf = {
const Database::DatabaseEntry XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
}
@ -26,12 +33,13 @@ const Database::DatabaseEntry Database::XgemvFastRotHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotSingle = {
const Database::DatabaseEntry XgemvFastRotSingle = {
"XgemvFastRot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -44,20 +52,23 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
}
@ -65,12 +76,13 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
const Database::DatabaseEntry XgemvFastRotComplexSingle = {
"XgemvFastRot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -83,14 +95,15 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
}
@ -98,11 +111,12 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotDouble = {
const Database::DatabaseEntry XgemvFastRotDouble = {
"XgemvFastRot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
@ -114,8 +128,10 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Default
@ -128,12 +144,13 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
const Database::DatabaseEntry XgemvFastRotComplexDouble = {
"XgemvFastRot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
@ -151,4 +168,5 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgerHalf = {
const Database::DatabaseEntry XgerHalf = {
"Xger", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerSingle = {
const Database::DatabaseEntry XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -63,7 +65,8 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } },
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
}
@ -76,8 +79,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@ -90,7 +95,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerComplexSingle = {
const Database::DatabaseEntry XgerComplexSingle = {
"Xger", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -99,7 +104,8 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -120,9 +126,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
@ -133,13 +140,15 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
}
@ -147,7 +156,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerDouble = {
const Database::DatabaseEntry XgerDouble = {
"Xger", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -156,7 +165,8 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -180,8 +190,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
@ -194,7 +206,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerComplexDouble = {
const Database::DatabaseEntry XgerComplexDouble = {
"Xger", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -203,6 +215,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
@ -227,7 +240,9 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},
@ -240,4 +255,5 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -204,7 +204,7 @@ R"(
#if PRECISION == 3232 || PRECISION == 6464
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
#else
#define COMPLEX_CONJUGATE(value) value = value
#define COMPLEX_CONJUGATE(value)
#endif
// =================================================================================================

View File

@ -0,0 +1,273 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
// pre- and post-processing kernels.
//
// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
#ifndef WGD
#define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
#endif
#ifndef MDIMCD
#define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMCD
#define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMAD
#define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#endif
#ifndef NDIMBD
#define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#endif
#ifndef KWID
#define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD)
#endif
#ifndef VWMD
#define VWMD 1 // Vector width of matrices A and C
#endif
#ifndef VWND
#define VWND 1 // Vector width of matrix B
#endif
#ifndef PADA
#define PADA 1 // Local memory padding for matrix A
#endif
#ifndef PADB
#define PADB 1 // Local memory padding for matrix B
#endif
// Helper parameters based on the above tuning parameters
#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension)
#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension)
#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension)
// =================================================================================================
// Data-widths in dimension M
#if VWMD == 1
typedef real realMD;
#elif VWMD == 2
typedef real2 realMD;
#elif VWMD == 4
typedef real4 realMD;
#elif VWMD == 8
typedef real8 realMD;
#elif VWMD == 16
typedef real16 realMD;
#endif
// Data-widths in dimension N
#if VWND == 1
typedef real realND;
#elif VWND == 2
typedef real2 realND;
#elif VWND == 4
typedef real4 realND;
#elif VWND == 8
typedef real8 realND;
#elif VWND == 16
typedef real16 realND;
#endif
// =================================================================================================
// Zero-initializes the per-thread accumulation register tile (NWID x MWID values)
// before the K-loop of the direct GEMM kernel starts accumulating into it.
inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
  #pragma unroll
  for (int _ni = 0; _ni < NWID; _ni += 1) {
    #pragma unroll
    for (int _mi = 0; _mi < MWID; _mi += 1) {
      SetToZero(cpm[_ni][_mi]);
    }
  }
}
// =================================================================================================
// Core multiply-accumulate stage of the direct GEMM kernel: performs the rank-1
// update Cpm += Apm * Bpm over the full NWID x MWID register tile. Each element
// cpm[ni][mi] receives exactly one fused multiply-add per invocation.
inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
  #pragma unroll
  for (int _mi = 0; _mi < MWID; _mi += 1) {
    #pragma unroll
    for (int _ni = 0; _ni < NWID; _ni += 1) {
      MultiplyAdd(cpm[_ni][_mi], apm[_mi], bpm[_ni]);
    }
  }
}
// =================================================================================================
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix.
// - agms: matrix A in global memory; apm: destination registers (MWID values per work-item)
// - a_ld / a_offset: leading dimension and starting offset of A
// - idm / idk: base M-index and K-index of the values this work-item loads
// - a_transpose selects between the two index orders; a_conjugate conjugates complex data
inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
const int a_ld, const int a_offset, const int idm, const int idk,
const int a_transpose, const int a_conjugate) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
// Transposed: element (idm+mi, idk); non-transposed: element (idk, idm+mi)
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
apm[mi] = agms[a_index + a_offset];
// COMPLEX_CONJUGATE expands to a no-op for non-complex precisions
if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
}
}
// Same as above, but now for the B input matrix: loads NWID values into the 'bpm' registers,
// indexed by the base N-index 'idn' and K-index 'idk', with optional transpose/conjugation.
inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
const int b_ld, const int b_offset, const int idn, const int idk,
const int b_transpose, const int b_conjugate) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
// Transposed: element (idn+ni, idk); non-transposed: element (idk, idn+ni)
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
bpm[ni] = bgms[b_index + b_offset];
// COMPLEX_CONJUGATE expands to a no-op for non-complex precisions
if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
}
}
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix. This is the same as above but now includes a bounds check:
// elements whose M-index (idm + mi) is at or beyond kSizeM are not read from global memory;
// their registers are set to zero instead, so out-of-range positions contribute nothing.
inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
const int a_ld, const int a_offset, const int idm, const int idk,
const int a_transpose, const int a_conjugate,
const int kSizeM) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
if (idm + mi < kSizeM) {
// Transposed: element (idm+mi, idk); non-transposed: element (idk, idm+mi)
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
apm[mi] = agms[a_index + a_offset];
if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
}
else {
// Out of bounds in the M-dimension: pad with zero
SetToZero(apm[mi]);
}
}
}
// Same as above, but now for the B input matrix: bounds-checked against kSizeN in the
// N-dimension; out-of-range registers are zeroed instead of being read from global memory.
inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
const int b_ld, const int b_offset, const int idn, const int idk,
const int b_transpose, const int b_conjugate,
const int kSizeN) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
if (idn + ni < kSizeN) {
// Transposed: element (idn+ni, idk); non-transposed: element (idk, idn+ni)
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
bpm[ni] = bgms[b_index + b_offset];
if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
}
else {
// Out of bounds in the N-dimension: pad with zero
SetToZero(bpm[ni]);
}
}
}
// =================================================================================================
// Copies one k-slice of the cached A tile from local (shared) memory into this thread's
// registers. The local tile rows are padded by PADA to give a stride of (WGD + PADA).
inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
                                  const int a_transpose) {
  const int mg_base = get_local_id(0)*MWID;  // first M-element owned by this thread
  #pragma unroll
  for (int m_it = 0; m_it < MWID; m_it += 1) {
    const int mg = mg_base + m_it;
    if (a_transpose) { apm[m_it] = alm[mg*(WGD + PADA) + kg]; }
    else             { apm[m_it] = alm[kg*(WGD + PADA) + mg]; }
  }
}
// Copies one k-slice of the cached B tile from local (shared) memory into this thread's
// registers. The local tile rows are padded by PADB to give a stride of (WGD + PADB).
inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
                                  const int b_transpose) {
  const int ng_base = get_local_id(1)*NWID;  // first N-element owned by this thread
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    const int ng = ng_base + n_it;
    if (b_transpose) { bpm[n_it] = blm[ng*(WGD + PADB) + kg]; }
    else             { bpm[n_it] = blm[kg*(WGD + PADB) + ng]; }
  }
}
// =================================================================================================
// Writes the per-thread accumulators 'cpm' back to the global C matrix, applying the scaling
// constants: C = alpha*accumulator + beta*C. When beta is zero, C is never read, only written.
// No bounds checking: only safe for complete (non-border) output tiles.
inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
                               const int idm, const int idn,
                               const real alpha, const real beta,
                               const int c_ld, const int c_offset, const int c_transpose) {
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    #pragma unroll
    for (int m_it = 0; m_it < MWID; m_it += 1) {
      const int row = idm + m_it;
      const int col = idn + n_it;
      // Destination address, with the buffer offset folded in
      const int dst = c_offset + ((c_transpose) ? row*c_ld + col : col*c_ld + row);
      real value;
      if (IsZero(beta)) {
        Multiply(value, alpha, cpm[n_it][m_it]);  // beta == 0: skip reading C entirely
      }
      else {
        AXPBY(value, alpha, cpm[n_it][m_it], beta, cgm[dst]);
      }
      cgm[dst] = value;
    }
  }
}
// Bounds-checked variant of StoreResultsDirect: writes the per-thread accumulators back to C
// with alpha/beta scaling, skipping every element outside the kSizeM x kSizeN matrix. Used by
// the border workgroups that cover an incomplete output tile.
inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
                                const int idm, const int idn, const int kSizeM, const int kSizeN,
                                const real alpha, const real beta,
                                const int c_ld, const int c_offset, const int c_transpose) {
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    #pragma unroll
    for (int m_it = 0; m_it < MWID; m_it += 1) {
      const int row = idm + m_it;
      const int col = idn + n_it;
      if (row < kSizeM && col < kSizeN) {
        // Destination address, with the buffer offset folded in
        const int dst = c_offset + ((c_transpose) ? row*c_ld + col : col*c_ld + row);
        real value;
        if (IsZero(beta)) {
          Multiply(value, alpha, cpm[n_it][m_it]);  // beta == 0: skip reading C entirely
        }
        else {
          AXPBY(value, alpha, cpm[n_it][m_it], beta, cgm[dst]);
        }
        cgm[dst] = value;
      }
    }
  }
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,314 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. It loads one vector of VWMD elements (type realMD) at a time and
// therefore requires a_ld to be a multiple of VWMD; the caller falls back to GlobalToLocalScalarA
// otherwise. No bounds checking is performed.
inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
                                 const int a_ld, const int a_offset, const int kwg,
                                 const int a_transpose, const int a_conjugate) {
  // Determines this thread's position in the A-loading configuration (MDIMAD threads wide). If the
  // compute configuration's first dimension matches, the local IDs map directly; otherwise the
  // flattened thread ID is re-mapped onto the loading grid.
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int mia=0; mia<MWAD/VWMD; ++mia) {
    #pragma unroll
    for (int kia=0; kia<KWAD; ++kia) {
      // Computes the indices for the global memory (vector granularity in the M-dimension)
      int mg = mia + la0*(MWAD/VWMD);
      int kg = kia + la1*KWAD;
      int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // Loads the data from global memory into the local memory
      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
      // Scatters the vector's components into the padded local tile (row stride WGD + PADA);
      // the component names depend on the vector width VWMD
      #if VWMD == 1
         alm[kg*(WGD + PADA) + mg] = avec;
      #elif VWMD == 2
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
      #elif VWMD == 4
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
      #elif VWMD == 8
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
      #elif VWMD == 16
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
         alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
         alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
         alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
         alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
         alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
         alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
         alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
         alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
      #endif
      // Conjugates each stored component in-place when requested (complex cases)
      if (a_conjugate) {
        for (int vm=0; vm<VWMD; ++vm) {
          COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
        }
      }
    }
  }
}
// Same as above, but now for the B input matrix: loads one vector of VWND elements (type realND)
// at a time and therefore requires b_ld to be a multiple of VWND; the caller falls back to
// GlobalToLocalScalarB otherwise. No bounds checking is performed.
inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
                                 const int b_ld, const int b_offset, const int kwg,
                                 const int b_transpose, const int b_conjugate) {
  // Determines this thread's position in the B-loading configuration (NDIMBD threads wide); the
  // flattened thread ID is re-mapped when the compute configuration differs.
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int kib=0; kib<KWBD; ++kib) {
    #pragma unroll
    for (int nib=0; nib<NWBD/VWND; ++nib) {
      // Computes the indices for the global memory (vector granularity in the N-dimension)
      int ng = nib + lb0*(NWBD/VWND);
      int kg = kib + lb1*KWBD;
      int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // Loads the data from global memory into the local memory
      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
      // Scatters the vector's components into the padded local tile (row stride WGD + PADB);
      // the component names depend on the vector width VWND
      #if VWND == 1
         blm[kg*(WGD + PADB) + ng] = bvec;
      #elif VWND == 2
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
      #elif VWND == 4
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
      #elif VWND == 8
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
      #elif VWND == 16
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
         blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
         blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
         blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
         blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
         blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
         blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
         blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
         blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
      #endif
      // Conjugates each stored component in-place when requested (complex cases)
      if (b_conjugate) {
        for (int vn=0; vn<VWND; ++vn) {
          COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
        }
      }
    }
  }
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip for the A input matrix.
// In contrast to GlobalToLocalDirectA, this variant uses scalar loads only and is therefore
// usable when a_ld is not a multiple of the vector width. No bounds checking is performed.
inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
                                 const int a_ld, const int a_offset, const int kwg,
                                 const int a_transpose, const int a_conjugate) {
  // Maps this thread onto the A-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int m_it = 0; m_it < MWAD; m_it += 1) {
    #pragma unroll
    for (int k_it = 0; k_it < KWAD; k_it += 1) {
      // Global-memory coordinates of this element
      const int mg = la0*MWAD + m_it;
      const int kg = la1*KWAD + k_it;
      const int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
      const int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // Loads, optionally conjugates, and stores into the padded local tile
      real value = agms[idk*a_ld + idm + a_offset];
      if (a_conjugate) { COMPLEX_CONJUGATE(value); }
      alm[kg*(WGD + PADA) + mg] = value;
    }
  }
}
// Scalar-load variant of the local-memory caching for the B input matrix, usable when b_ld is
// not a multiple of the vector width. No bounds checking is performed.
inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
                                 const int b_ld, const int b_offset, const int kwg,
                                 const int b_transpose, const int b_conjugate) {
  // Maps this thread onto the B-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int k_it = 0; k_it < KWBD; k_it += 1) {
    #pragma unroll
    for (int n_it = 0; n_it < NWBD; n_it += 1) {
      // Global-memory coordinates of this element
      const int ng = lb0*NWBD + n_it;
      const int kg = lb1*KWBD + k_it;
      const int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
      const int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // Loads, optionally conjugates, and stores into the padded local tile
      real value = bgms[idk*b_ld + idn + b_offset];
      if (b_conjugate) { COMPLEX_CONJUGATE(value); }
      blm[kg*(WGD + PADB) + ng] = value;
    }
  }
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip for the A input matrix.
// This variant performs bounds checks (against kSizeM, or kSizeK when transposed) and uses
// scalar loads only; out-of-range elements are zeroed in the local tile.
inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
                                  const int a_ld, const int a_offset, const int kwg,
                                  const int a_transpose, const int a_conjugate,
                                  const int kSizeM, const int kSizeK) {
  // Maps this thread onto the A-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int m_it = 0; m_it < MWAD; m_it += 1) {
    #pragma unroll
    for (int k_it = 0; k_it < KWAD; k_it += 1) {
      // Global-memory coordinates of this element
      const int mg = la0*MWAD + m_it;
      const int kg = la1*KWAD + k_it;
      const int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
      const int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // The fast index runs over K when transposed, over M otherwise
      const int within_bounds = (a_transpose) ? (idm < kSizeK) : (idm < kSizeM);
      if (!within_bounds) {
        SetToZero(alm[kg*(WGD + PADA) + mg]);  // zero-pad out-of-range elements
      }
      else {
        real value = agms[idk*a_ld + idm + a_offset];
        if (a_conjugate) { COMPLEX_CONJUGATE(value); }
        alm[kg*(WGD + PADA) + mg] = value;
      }
    }
  }
}
// Bounds-checked, scalar-load caching of the B input matrix into local memory (against kSizeN,
// or kSizeK when transposed); out-of-range elements are zeroed in the local tile.
inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
                                  const int b_ld, const int b_offset, const int kwg,
                                  const int b_transpose, const int b_conjugate,
                                  const int kSizeN, const int kSizeK) {
  // Maps this thread onto the B-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int k_it = 0; k_it < KWBD; k_it += 1) {
    #pragma unroll
    for (int n_it = 0; n_it < NWBD; n_it += 1) {
      // Global-memory coordinates of this element
      const int ng = lb0*NWBD + n_it;
      const int kg = lb1*KWBD + k_it;
      const int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
      const int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // The fast index runs over K when transposed, over N otherwise
      const int within_bounds = (b_transpose) ? (idn < kSizeK) : (idn < kSizeN);
      if (!within_bounds) {
        SetToZero(blm[kg*(WGD + PADB) + ng]);  // zero-pad out-of-range elements
      }
      else {
        real value = bgms[idk*b_ld + idn + b_offset];
        if (b_conjugate) { COMPLEX_CONJUGATE(value); }
        blm[kg*(WGD + PADB) + ng] = value;
      }
    }
  }
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,214 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
// It computes C = alpha*op(A)*op(B) + beta*C in one kernel launch. Workgroups covering a complete
// WGD x WGD output tile take a fast path (vectorized loads when the leading dimension allows it);
// border workgroups take a slower, fully bounds-checked path.
inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
                        const real_arg arg_alpha,
                        const real_arg arg_beta,
                        const __global realMD* restrict agm, const int a_offset, const int a_ld,
                        const __global realND* restrict bgm, const int b_offset, const int b_ld,
                        __global real* cgm, const int c_offset, const int c_ld,
                        __local real* alm, __local real* blm,
                        const int a_transpose, const int b_transpose, const int c_transpose,
                        const int a_conjugate, const int b_conjugate) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);
  // Extra pointers to scalar versions of global memory (for the scalar/checked load paths)
  const __global real* restrict agms = (const __global real* restrict) agm;
  const __global real* restrict bgms = (const __global real* restrict) bgm;
  // Allocates workitem-private memory (registers)
  real apm[MWID];
  real bpm[NWID];
  real cpm[NWID][MWID];
  // Initializes the accumulation registers
  InitAccRegistersDirect(cpm);
  // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
  // processes only the main parts: output blocks of WGD by WGD.
  const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
  const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
  if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {

    // Loops over all complete workgroup tiles (K-dimension)
    int kwg = 0;
    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {

      // Loads data: off-chip --> local (matrix A and B). The vectorized loader is only valid
      // when the leading dimension is a multiple of the vector width.
      if (a_ld % VWMD == 0) {
        GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
      }
      else {
        GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
      }
      if (b_ld % VWND == 0) {
        GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
      }
      else {
        GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
      }
      // Waits until all threads have filled the local tiles before any thread reads them
      barrier(CLK_LOCAL_MEM_FENCE);

      // Loops over all workitem tiles, unrolled by a factor KWID
      for (int pwi=0; pwi<WGD; pwi+=KWID) {
        #pragma unroll
        for (int pit=0; pit<KWID; ++pit) {
          int kg = pwi + pit;

          // Loads data: local --> private (matrix A and B)
          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);

          // Performs the accumulation (Cpm += Apm * Bpm)
          MultiplyAccumulateDirect(cpm, apm, bpm);
        }
      }
      // Ensures all reads of the local tiles are done before the next iteration overwrites them
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Loop over the remaining part (incomplete tile in K-dimension): loads go straight from
    // global memory into registers, bypassing local memory
    for (; kwg < kSizeK; ++kwg) {

      // Loads data: off-chip --> private (matrix A and B)
      GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
      GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);

      // Performs the accumulation (Cpm += Apm * Bpm)
      MultiplyAccumulateDirect(cpm, apm, bpm);
    }

    // Stores a tile of results and performs the multiplication with alpha and beta
    StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
  }

  // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions):
  // every load and store is bounds-checked
  else {

    // Loops over all complete workgroup tiles (K-dimension)
    int kwg = 0;
    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {

      // Loads data: off-chip --> local (matrix A and B)
      GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
      GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
      // Waits until all threads have filled the local tiles before any thread reads them
      barrier(CLK_LOCAL_MEM_FENCE);

      // Loops over all workitem tiles, unrolled by a factor KWID
      for (int pwi=0; pwi<WGD; pwi+=KWID) {
        #pragma unroll
        for (int pit=0; pit<KWID; ++pit) {
          int kg = pwi + pit;

          // Loads data: local --> private (matrix A and B)
          LocalToPrivateDirectA(alm, apm, kg, a_transpose)
;
          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);

          // Performs the accumulation (Cpm += Apm * Bpm)
          MultiplyAccumulateDirect(cpm, apm, bpm);
        }
      }
      // Ensures all reads of the local tiles are done before the next iteration overwrites them
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Loop over the remaining part (incomplete tile in K-dimension)
    for (; kwg < kSizeK; ++kwg) {

      // Loads data: off-chip --> private (matrix A and B)
      GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
      GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);

      // Performs the accumulation (Cpm += Apm * Bpm)
      MultiplyAccumulateDirect(cpm, apm, bpm);
    }

    // Stores a tile of results and performs the multiplication with alpha and beta
    StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
  }
}
// =================================================================================================
// Direct GEMM entry point for [A, B] = [non-transposed, non-transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              0, 0, c_transpose, a_conjugate, b_conjugate);  // a_transpose=0, b_transpose=0
}
// Direct GEMM entry point for [A, B] = [non-transposed, transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              0, 1, c_transpose, a_conjugate, b_conjugate);  // a_transpose=0, b_transpose=1
}
// Direct GEMM entry point for [A, B] = [transposed, non-transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              1, 0, c_transpose, a_conjugate, b_conjugate);  // a_transpose=1, b_transpose=0
}
// Direct GEMM entry point for [A, B] = [transposed, transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              1, 1, c_transpose, a_conjugate, b_conjugate);  // a_transpose=1, b_transpose=1
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
return;
}
@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
return;
}

View File

@ -14,16 +14,18 @@
#include <string>
#include <vector>
#include <chrono>
#include <cstdlib>
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned
// The constructor does all heavy work, errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase):
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
queue_(queue),
@ -32,27 +34,24 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
if (environment_variable != nullptr) {
options.push_back(std::string(environment_variable));
}
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
auto options = std::vector<std::string>();
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -62,48 +61,50 @@ StatusCode Routine::SetUp() {
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = db_.GetDefines();
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
source_string += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_CL_MAD 1\n";
source_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_STAGGERED_INDICES 1\n";
source_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
defines += "#define GLOBAL_MEM_FENCE 1\n";
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
// Combines everything together into a single source string
const auto source_string = defines + common_header + source_string_;
// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"
;
// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
@ -113,24 +114,21 @@ StatusCode Routine::SetUp() {
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
try {
auto program = Program(context_, source_string);
auto options = std::vector<std::string>();
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
const auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
program.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
throw;
}
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
@ -138,9 +136,6 @@ StatusCode Routine::SetUp() {
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -19,9 +19,9 @@
#include <string>
#include <vector>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "cache.hpp"
#include "buffer_test.hpp"
#include "utilities/buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"
@ -34,21 +34,19 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
// All heavy preparation work is done inside this constructor.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name and its kernel-source in string form
// The routine's name
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;

View File

@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
if (local[i] > max_work_item_sizes[i]) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
}
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
if (local_size > device.MaxWorkGroupSize()) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
if (!device.IsLocalMemoryValid(local_mem_usage)) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
}
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -61,8 +61,8 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
kernel_name = "TransposeMatrixFast";
}
else {
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
} catch (...) { return StatusCode::kInvalidKernel; }
}
}
// =================================================================================================

View File

@ -22,74 +22,64 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorIndex(1, imax_buffer, imax_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, asum_buffer, asum_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,79 +22,68 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
void Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, dot_buffer, dot_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,11 +28,11 @@ class Xdot: public Routine {
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
StatusCode DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
void DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
};
// =================================================================================================

View File

@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
void Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
StatusCode DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
void Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
StatusCode DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
void DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
void DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xnrm2: public Routine {
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,26 +22,24 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,8 +28,8 @@ class Xscal: public Routine {
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
void DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

Some files were not shown because too many files have changed in this diff Show More