Merge branch 'development' into netlib_blas_api

Conflicts: scripts/generator/generator.py scripts/generator/generator/routine.py
2016-10-25 09:34:24 +02:00 · 2016-10-25 09:34:24 +02:00 · 3b65eace0a
parent 9331442a56 0f5bf35ebe
commit 3b65eace0a
153 changed files with 8633 additions and 7576 deletions
--- a/3
+++ b/3
@ -1,8 +1,11 @@

 Development version (next release)
 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Changed the enums in the C API to avoid potential name clashes with external code
+- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
 - Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
 - Fixed a bug in the tests and samples related to waiting for an invalid event
+- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
 - Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
 - Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
 - Added an option to run tuned kernels multiple times to average execution times
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -69,9 +69,7 @@ endif()
 if(MSVC)
  if(BUILD_SHARED_LIBS)
    add_definitions(" /DCLBLAST_DLL")
-  else(BUILD_SHARED_LIBS)
-    add_definitions(" /DCLBLAST_STATIC")
-  endif(BUILD_SHARED_LIBS)
+  endif()
 endif(MSVC)

 # C++ compiler settings
@ -167,11 +165,12 @@ set(PRECISIONS 32 64 3232 6464 16)
 set(SOURCES
  src/database/database.cpp
  src/routines/common.cpp
+  src/utilities/clblast_exceptions.cpp
+  src/utilities/utilities.cpp
  src/cache.cpp
  src/clblast.cpp
  src/clblast_c.cpp
  src/routine.cpp
-  src/utilities.cpp
 )
 foreach(ROUTINE ${LEVEL1_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
@ -191,7 +190,7 @@ if(BUILD_SHARED_LIBS)
  add_library(clblast SHARED ${SOURCES})
 else(BUILD_SHARED_LIBS)
  add_library(clblast STATIC ${SOURCES})
-endif(BUILD_SHARED_LIBS)
+endif()

 target_link_libraries(clblast ${OPENCL_LIBRARIES})

@ -206,7 +205,7 @@ target_include_directories(clblast PUBLIC
 if(MSVC)
  if(BUILD_SHARED_LIBS)
    target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
-  endif(BUILD_SHARED_LIBS)
+  endif()
 endif()

 # Installs the library
@ -218,9 +217,17 @@ install(FILES include/clblast_half.h DESTINATION include)
 # Installs the config for find_package in dependent projects
 install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)

+# Installs the pkg-config file on UNIX-like systems (Linux, macOS, BSD, ...)
+if(UNIX)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
+                   "${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
+            DESTINATION lib/pkgconfig)
+endif()
+
 # ==================================================================================================

-# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
+# Sets a default platform ($CLBLAST_PLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests
 set(DEVICEPLATFORM )
 if(DEFINED ENV{CLBLAST_DEVICE})
  set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
@ -229,6 +236,12 @@ if(DEFINED ENV{CLBLAST_PLATFORM})
  set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
 endif()

+# Optionally also provides other options to the tests such as -full_test ($CLBLAST_TEST_ARGUMENTS)
+set(TEST_ARGUMENTS )
+if(DEFINED ENV{CLBLAST_TEST_ARGUMENTS})
+  set(TEST_ARGUMENTS $ENV{CLBLAST_TEST_ARGUMENTS})
+endif()
+
 # ==================================================================================================

 # This section contains all the code related to the examples
@ -262,7 +275,7 @@ if(TUNERS)
  # Visual Studio requires the sources of non-exported objects/libraries
  set(TUNERS_COMMON )
  if(MSVC)
-    set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
+    set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
  endif()

  # Adds tuning executables
@ -298,7 +311,7 @@ if(CLIENTS OR TESTS)
    find_package(Threads)
    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
    set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    if(MSVC)
      add_definitions(" /DCLBLAST_REF_CLBLAS")
    else()
      add_definitions(" -DCLBLAST_REF_CLBLAS")
@ -307,7 +320,7 @@ if(CLIENTS OR TESTS)
  if(CBLAS_FOUND)
    set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
    set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    if(MSVC)
      add_definitions(" /DCLBLAST_REF_CBLAS")
    else()
      add_definitions(" -DCLBLAST_REF_CBLAS")
@ -325,7 +338,7 @@ if(CLIENTS)
  # Visual Studio requires the sources of non-exported objects/libraries
  set(CLIENTS_COMMON )
  if(MSVC)
-    set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
+    set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
  else()
    # Creates the common performance-tests objects (requires CMake 2.8.8)
    add_library(test_performance_common OBJECT test/performance/client.cpp)
@ -372,7 +385,7 @@ if(TESTS)
  # Visual Studio requires the sources of non-exported objects/libraries
  set(TESTS_COMMON )
  if(MSVC)
-    set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
+    set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
        test/correctness/tester.cpp test/correctness/testblas.cpp)
  else()
    # Creates the common correctness-tests objects (requires CMake 2.8.8)
@ -405,14 +418,14 @@ if(TESTS)
    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
    install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
    target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
-    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
  endforeach()

  # Adds 'alltests' target: runs all tests
  set(ALLTESTS )
  set(ALLTESTSDEPENDS )
  foreach(ROUTINE ${ROUTINES})
-    set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+    set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
    set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
  endforeach()
  add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,20 @@
+
+CLBlast: Contributing guidelines
+================
+
+For information about the CLBlast library, see the [README](README.md) file instead.
+
+Tuning results
+-------------
+
+A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available for posting new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one or more of the tuners on your device (ideally all of them), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue, archived in a single .zip file.
+
+
+Code improvements and additions
+-------------
+
+Pull requests are welcome as long as they:
+
+* Contain unit test additions or modifications
+* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use a tab-size of 2 spaces and a max-width of 100 characters.
+* Are made against the `development` branch.
--- a/214
+++ b/214
@ -1,21 +1,201 @@
-MIT License
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/

-Copyright (c) 2016 Cedric Nugteren
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+   1. Definitions.

-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2015 Cedric Nugteren
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@ -111,8 +111,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
  - GeForce GTX 750 Ti
  - GeForce GTX 980
  - GeForce GTX 1070
-  - GeForce GTX Titan
-  - GeForce GTX Titan X
+  - GeForce GTX TITAN
+  - GeForce GTX TITAN Black
+  - GeForce GTX TITAN X
  - Tesla K20m
  - Tesla K40m
 * AMD GPUs:
@ -121,6 +122,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
  - Oland
  - Pitcairn
  - Tahiti
+  - Tonga
 * Intel GPUs:
  - HD Graphics 530
  - HD Graphics 5500 BroadWell U-Processor GT2
@ -175,7 +177,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th

 Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.

-All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
+All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero values by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake. Further options (e.g. `-full_test`) can be supplied through the `CLBLAST_TEST_ARGUMENTS` environment variable, which must likewise be set before running CMake.


 Compiling the performance tests/clients (optional)
@ -284,7 +286,7 @@ The `samples/haxpy.c` example shows how to use these convencience functions when
 Contributing
 -------------

-Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
+Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.

 The contributing authors (code, pull requests, testing) so far are:

@ -296,6 +298,7 @@ The contributing authors (code, pull requests, testing) so far are:
 * [Gian-Carlo Pascutto](https://github.com/gcp)
 * [Ivan Shapovalov](https://github.com/intelfx)
 * [Dimitri Van Assche](https://github.com/dvasschemacq)
+* [Shehzan Mohammed](https://shehzan10.github.io)

 Tuning and testing on a variety of OpenCL devices was made possible by:

@ -303,6 +306,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
 * [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
 * [dividiti](http://www.dividiti.com)
 * [SURFsara HPC center](http://www.surfsara.com)
+* [ArrayFire](http://arrayfire.org)


 Support us
--- a/clblast.pc.in
+++ b/clblast.pc.in
@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=${prefix}/include
+libdir=${exec_prefix}/lib
+
+Name: CLBlast
+Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
+Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
+Libs: -L${libdir} -lclblast
+Cflags: -I${includedir}
--- a/doc/clblast.md
+++ b/doc/clblast.md
--- a/include/clblast.h
+++ b/include/clblast.h
@ -46,14 +46,34 @@ enum class StatusCode {

  // Status codes in common with the OpenCL standard
  kSuccess                   =   0, // CL_SUCCESS
+  kOpenCLCompilerNotAvailable=  -3, // CL_COMPILER_NOT_AVAILABLE
  kTempBufferAllocFailure    =  -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
-  kBuildProgramFailure       = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+  kOpenCLOutOfResources      =  -5, // CL_OUT_OF_RESOURCES
+  kOpenCLOutOfHostMemory     =  -6, // CL_OUT_OF_HOST_MEMORY
+  kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+  kInvalidValue              = -30, // CL_INVALID_VALUE
+  kInvalidCommandQueue       = -36, // CL_INVALID_COMMAND_QUEUE
+  kInvalidMemObject          = -38, // CL_INVALID_MEM_OBJECT
  kInvalidBinary             = -42, // CL_INVALID_BINARY
+  kInvalidBuildOptions       = -43, // CL_INVALID_BUILD_OPTIONS
+  kInvalidProgram            = -44, // CL_INVALID_PROGRAM
+  kInvalidProgramExecutable  = -45, // CL_INVALID_PROGRAM_EXECUTABLE
+  kInvalidKernelName         = -46, // CL_INVALID_KERNEL_NAME
+  kInvalidKernelDefinition   = -47, // CL_INVALID_KERNEL_DEFINITION
  kInvalidKernel             = -48, // CL_INVALID_KERNEL
+  kInvalidArgIndex           = -49, // CL_INVALID_ARG_INDEX
+  kInvalidArgValue           = -50, // CL_INVALID_ARG_VALUE
+  kInvalidArgSize            = -51, // CL_INVALID_ARG_SIZE
+  kInvalidKernelArgs         = -52, // CL_INVALID_KERNEL_ARGS
  kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
  kInvalidLocalThreadsTotal  = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
  kInvalidLocalThreadsDim    = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
-  kInvalidTempBufferSize     = -61, // CL_INVALID_BUFFER_SIZE
+  kInvalidGlobalOffset       = -56, // CL_INVALID_GLOBAL_OFFSET
+  kInvalidEventWaitList      = -57, // CL_INVALID_EVENT_WAIT_LIST
+  kInvalidEvent              = -58, // CL_INVALID_EVENT
+  kInvalidOperation          = -59, // CL_INVALID_OPERATION
+  kInvalidBufferSize         = -61, // CL_INVALID_BUFFER_SIZE
+  kInvalidGlobalWorkSize     = -63, // CL_INVALID_GLOBAL_WORK_SIZE

  // Status codes in common with the clBLAS library
  kNotImplemented            = -1024, // Routine or functionality not implemented yet
@ -75,13 +95,14 @@ enum class StatusCode {
  kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small

  // Custom additional status codes for CLBlast
-  kKernelLaunchError         = -2048, // Problem occurred when enqueuing the kernel
-  kKernelRunError            = -2047, // Problem occurred while running the kernel
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
+  kDatabaseError             = -2041, // Entry for the device was not found in the database
+  kUnknownError              = -2040, // A catch-all error code representing an unspecified error
+  kUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
 };

 // Matrix layout and transpose types
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
--- a/samples/cache.c
+++ b/samples/cache.c
@ -106,13 +106,13 @@ void run_example_routine(const cl_device_id device) {
  clock_t start = clock();

  // Calls an example routine
-  StatusCode status = CLBlastSasum(n,
-                                   device_output, 0,
-                                   device_input, 0, 1,
-                                   &queue, &event);
+  CLBlastStatusCode status = CLBlastSasum(n,
+                                          device_output, 0,
+                                          device_input, 0, 1,
+                                          &queue, &event);

  // Wait for completion
-  if (status == kSuccess) {
+  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@ -74,17 +74,17 @@ int main(void) {
  clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);

  // Call the DGEMV routine.
-  StatusCode status = CLBlastDgemv(kRowMajor, kNo,
-                                   m, n,
-                                   alpha,
-                                   device_a, 0, a_ld,
-                                   device_x, 0, 1,
-                                   beta,
-                                   device_y, 0, 1,
-                                   &queue, &event);
+  CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
+                                          m, n,
+                                          alpha,
+                                          device_a, 0, a_ld,
+                                          device_x, 0, 1,
+                                          beta,
+                                          device_y, 0, 1,
+                                          &queue, &event);

  // Wait for completion
-  if (status == kSuccess) {
+  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@ -71,13 +71,13 @@ int main(void) {
  clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

  // Call the HAXPY routine.
-  StatusCode status = CLBlastHaxpy(n, alpha,
-                                   device_a, 0, 1,
-                                   device_b, 0, 1,
-                                   &queue, &event);
+  CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
+                                          device_a, 0, 1,
+                                          device_b, 0, 1,
+                                          &queue, &event);

  // Wait for completion
-  if (status == kSuccess) {
+  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
--- a/samples/sasum.c
+++ b/samples/sasum.c
@ -67,13 +67,13 @@ int main(void) {
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Call the SASUM routine.
-  StatusCode status = CLBlastSasum(n,
-                                   device_output, 0,
-                                   device_input, 0, 1,
-                                   &queue, &event);
+  CLBlastStatusCode status = CLBlastSasum(n,
+                                          device_output, 0,
+                                          device_input, 0, 1,
+                                          &queue, &event);

  // Wait for completion
-  if (status == kSuccess) {
+  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@ -77,17 +77,18 @@ int main(void) {
  clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);

  // Call the SGEMM routine.
-  StatusCode status = CLBlastSgemm(kRowMajor, kNo, kNo,
-                                   m, n, k,
-                                   alpha,
-                                   device_a, 0, a_ld,
-                                   device_b, 0, b_ld,
-                                   beta,
-                                   device_c, 0, c_ld,
-                                   &queue, &event);
+  CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
+                                          CLBlastTransposeNo, CLBlastTransposeNo,
+                                          m, n, k,
+                                          alpha,
+                                          device_a, 0, a_ld,
+                                          device_b, 0, b_ld,
+                                          beta,
+                                          device_c, 0, c_ld,
+                                          &queue, &event);

  // Wait for completion
-  if (status == kSuccess) {
+  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -31,9 +31,18 @@ import generator.doc as doc
 from generator.routine import Routine
 from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU

-
-HEADER_LINES = [96, 73, 97, 22, 29, 41, 43, 1]
-FOOTER_LINES = [17, 75, 19, 14, 6, 6, 10, 1]
+FILES = [
+    "/include/clblast.h",
+    "/src/clblast.cpp",
+    "/include/clblast_c.h",
+    "/src/clblast_c.cpp",
+    "/test/wrapper_clblas.hpp",
+    "/test/wrapper_cblas.hpp",
+    "/include/clblast_blas.h",
+    "/src/clblast_blas.cpp",
+]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1]

 # Different possibilities for requirements
 ald_m = "The value of `a_ld` must be at least `m`."
@ -126,35 +135,23 @@ def main(argv):
    cl_args = parser.parse_args(argv)
    library_root = cl_args.clblast_root

-    # Sets all the files the output
-    files = [
-        library_root + "/include/clblast.h",
-        library_root + "/src/clblast.cpp",
-        library_root + "/include/clblast_c.h",
-        library_root + "/src/clblast_c.cpp",
-        library_root + "/test/wrapper_clblas.hpp",
-        library_root + "/test/wrapper_cblas.hpp",
-        library_root + "/include/clblast_blas.h",
-        library_root + "/src/clblast_blas.cpp",
-    ]
-
    # Checks whether the command-line arguments are valid; exits otherwise
-    for f in files:
-        if not os.path.isfile(f):
+    for f in FILES:
+        if not os.path.isfile(library_root + f):
            print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
            sys.exit()

    # Iterates over all regular files to output
-    for i in range(0, len(files)):
+    for i in range(0, len(FILES)):

        # Stores the header and the footer of the original file
-        with open(files[i]) as f:
+        with open(library_root + FILES[i]) as f:
            original = f.readlines()
        file_header = original[:HEADER_LINES[i]]
        file_footer = original[-FOOTER_LINES[i]:]

        # Re-writes the body of the file
-        with open(files[i], "w") as f:
+        with open(library_root + FILES[i], "w") as f:
            body = ""
            levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
            for level in levels:
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@ -45,17 +45,18 @@ def clblast_h(routine):

 def clblast_cc(routine):
    """The C++ API implementation (.cpp)"""
-    indent1 = " " * (20 + routine.length())
+    indent1 = " " * (15 + routine.length())
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    if routine.implemented:
        result += routine.routine_header_cpp(12, "") + " {" + NL
-        result += "  auto queue_cpp = Queue(*queue);" + NL
-        result += "  auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
-        result += "  auto status = routine.SetUp();" + NL
-        result += "  if (status != StatusCode::kSuccess) { return status; }" + NL
-        result += "  return routine.Do" + routine.name.capitalize() + "("
+        result += "  try {" + NL
+        result += "    auto queue_cpp = Queue(*queue);" + NL
+        result += "    auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
+        result += "    routine.Do" + routine.name.capitalize() + "("
        result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
        result += ");" + NL
+        result += "    return StatusCode::kSuccess;" + NL
+        result += "  } catch (...) { return DispatchException(); }" + NL
    else:
        result += routine.routine_header_type_cpp(12) + " {" + NL
        result += "  return StatusCode::kNotImplemented;" + NL
@ -72,7 +73,7 @@ def clblast_c_h(routine):
    """The C API header (.h)"""
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    for flavour in routine.flavours:
-        result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
+        result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
    return result


@ -81,12 +82,16 @@ def clblast_c_cc(routine):
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:
        template = "<" + flavour.template + ">" if routine.no_scalars() else ""
-        indent = " " * (26 + routine.length() + len(template))
-        result += routine.routine_header_c(flavour, 20, "") + " {" + NL
-        result += "  auto status = clblast::" + routine.name.capitalize() + template + "("
+        indent = " " * (16 + routine.length() + len(template))
+        result += routine.routine_header_c(flavour, 27, "") + " {" + NL
+        result += "  try {" + NL
+        result += "    return static_cast<CLBlastStatusCode>(" + NL
+        result += "      clblast::" + routine.name.capitalize() + template + "("
        result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
-        result += "," + NL + indent + "queue, event);"
-        result += NL + "  return static_cast<StatusCode>(status);" + NL + "}" + NL
+        result += "," + NL + indent + "queue, event)" + NL
+        result += "    );" + NL
+        result += "  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
+        result += "}" + NL
    return result


--- a/scripts/generator/generator/doc.py
+++ b/scripts/generator/generator/doc.py
@ -32,7 +32,7 @@ def generate(routine):
    result += "C API:" + NL
    result += "```" + NL
    for flavour in routine.flavours:
-        result += routine.routine_header_c(flavour, 20, "") + NL
+        result += routine.routine_header_c(flavour, 27, "") + NL
    result += "```" + NL + NL

    # Routine arguments
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@ -390,6 +390,13 @@ class Routine:
            return [", ".join(definitions)]
        return []

+    def options_def_c(self):
+        """As above, but now for the C API.
+
+        Returns a list holding a single string in which every option in `self.options` is declared
+        as `const CLBlast<type> <name>` (with `<type>` coming from `convert.option_to_clblast`),
+        joined by ", ". Returns an empty list when the routine has no options.
+        """
+        if self.options:
+            definitions = ["const CLBlast" + convert.option_to_clblast(o) + " " + o for o in self.options]
+            return [", ".join(definitions)]
+        return []
+
    def options_def_wrapper_clblas(self):
        """As above, but now using clBLAS data-types"""
        if self.options:
@ -505,6 +512,17 @@ class Routine:
                list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
                list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))

+    def arguments_def_c(self, flavour):
+        """As above, but for the C API.
+
+        Concatenates, in this order: the C-style option definitions, the size definitions, the
+        first scalar buffers, the "alpha" scalar, the first buffers, the "beta" scalar, the second
+        buffers, the second scalar buffers and finally the other scalars.
+        """
+        return (self.options_def_c() + self.sizes_def() +
+                list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) +
+                self.scalar_def("alpha", flavour) +
+                list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) +
+                self.scalar_def("beta", flavour) +
+                list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
+                list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
+                list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+
    def arguments_def_wrapper_clblas(self, flavour):
        """As above, but clBLAS wrapper plain data-types"""
        return (self.options_def_wrapper_clblas() + self.sizes_def() +
@ -575,8 +593,8 @@ class Routine:
    def routine_header_c(self, flavour, spaces, extra_qualifier):
        """As above, but now for C"""
        indent = " " * (spaces + self.length())
-        result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
-        result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
+        result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
+        result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
        result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
        return result

--- a/src/buffer_test.hpp
+++ b/src/buffer_test.hpp
@ -1,121 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
-// templated and thus header-only.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_BUFFER_TEST_H_
-#define CLBLAST_BUFFER_TEST_H_
-
-#include "clblast.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Tests matrix 'A' for validity
-template <typename T>
-StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
-  try {
-    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix 'B' for validity
-template <typename T>
-StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
-  try {
-    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
-  } catch (...) { return StatusCode::kInvalidMatrixB; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix 'C' for validity
-template <typename T>
-StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
-  try {
-    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
-  } catch (...) { return StatusCode::kInvalidMatrixC; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix 'AP' for validity
-template <typename T>
-StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector 'X' for validity
-template <typename T>
-StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
-  try {
-    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
-  } catch (...) { return StatusCode::kInvalidVectorX; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector 'Y' for validity
-template <typename T>
-StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
-  try {
-    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
-  } catch (...) { return StatusCode::kInvalidVectorY; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector 'scalar' for validity
-template <typename T>
-StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto required_size = (n + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
-  } catch (...) { return StatusCode::kInvalidVectorScalar; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector 'index' for validity
-template <typename T>
-StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto required_size = (n + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
-  } catch (...) { return StatusCode::kInvalidVectorScalar; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_BUFFER_TEST_H_
-#endif
--- a/src/cache.cpp
+++ b/src/cache.cpp
@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
    }
  }
  binary_cache_mutex_.unlock();
-  throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
+  throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
 }

 // Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
    }
  }
  program_cache_mutex_.unlock();
-  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
+  throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
 }

 // Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries and programs
-StatusCode CacheClearAll() {
+void CacheClearAll() {
  binary_cache_mutex_.lock();
  binary_cache_.clear();
  binary_cache_mutex_.unlock();
  program_cache_mutex_.lock();
  program_cache_.clear();
  program_cache_mutex_.unlock();
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/cache.hpp
+++ b/src/cache.hpp
@ -18,7 +18,7 @@
 #include <vector>
 #include <mutex>

-#include "utilities.hpp"
+#include "utilities/utilities.hpp"

 namespace clblast {
 // =================================================================================================
@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries and programs
-StatusCode CacheClearAll();
+void CacheClearAll();

 // =================================================================================================
 } // namespace clblast
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -41,8 +41,8 @@
 #include <string>    // std::string
 #include <vector>    // std::vector
 #include <memory>    // std::shared_ptr
-#include <stdexcept> // std::runtime_error
 #include <numeric>   // std::accumulate
+#include <cstring>   // std::strlen

 // OpenCL
 #if defined(__APPLE__) || defined(__MACOSX)
@ -51,20 +51,41 @@
  #include <CL/opencl.h>
 #endif

+// Exception classes
+#include "cxpp11_common.hpp"
+
 namespace clblast {
 // =================================================================================================

-// Error occurred in the C++11 OpenCL header (this file)
-inline void Error(const std::string &message) {
-  throw std::runtime_error("Internal OpenCL error: "+message);
-}
+// Represents a runtime error returned by an OpenCL API function
+class CLError : public ErrorCode<DeviceError, cl_int> {
+ public:
+  // Passes the OpenCL status, the location 'where' and a message of the form
+  // "OpenCL error: <where>: <status as integer>" on to the ErrorCode base class
+  explicit CLError(cl_int status, const std::string &where):
+      ErrorCode(status,
+                where,
+                "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
+  }
+
+  // Throws a CLError when 'status' is anything other than CL_SUCCESS; does nothing otherwise
+  static void Check(const cl_int status, const std::string &where) {
+    if (status != CL_SUCCESS) {
+      throw CLError(status, where);
+    }
+  }
+
+  // As above, but never throws: a failure is only reported on stderr. The CLError object is
+  // constructed solely to obtain its formatted what() message.
+  static void CheckDtor(const cl_int status, const std::string &where) {
+    if (status != CL_SUCCESS) {
+      fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
+    }
+  }
+};
+
+// =================================================================================================

 // Error occurred in OpenCL
-inline void CheckError(const cl_int status) {
-  if (status != CL_SUCCESS) {
-    throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
-  }
-}
+#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
+
+// Error occurred in OpenCL (no-exception version for destructors)
+#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))

 // =================================================================================================

@ -81,7 +102,7 @@ class Event {
  // Regular constructor with memory management
  explicit Event():
      event_(new cl_event, [](cl_event* e) {
-        if (*e) { CheckError(clReleaseEvent(*e)); }
+        if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
        delete e;
      }) {
    *event_ = nullptr;
@ -92,16 +113,17 @@ class Event {
    CheckError(clWaitForEvents(1, &(*event_)));
  }

-  // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
-  // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
-  // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
+  // Retrieves the elapsed time of the last recorded event.
+  // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
+  //  http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
+  // However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
  float GetElapsedTime() const {
    WaitForCompletion();
    const auto bytes = sizeof(cl_ulong);
    auto time_start = cl_ulong{0};
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+    CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
    auto time_end = cl_ulong{0};
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+    CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
    return static_cast<float>(time_end - time_start) * 1.0e-6f;
  }

@ -130,10 +152,14 @@ class Platform {
  explicit Platform(const size_t platform_id) {
    auto num_platforms = cl_uint{0};
    CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
-    if (num_platforms == 0) { Error("no platforms found"); }
+    if (num_platforms == 0) {
+      throw RuntimeError("Platform: no platforms found");
+    }
+    if (platform_id >= num_platforms) {
+      throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
+    }
    auto platforms = std::vector<cl_platform_id>(num_platforms);
    CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
-    if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
    platform_ = platforms[platform_id];
  }

@ -173,11 +199,16 @@ class Device {
  // Initialize the device. Note that this constructor can throw exceptions!
  explicit Device(const Platform &platform, const size_t device_id) {
    auto num_devices = platform.NumDevices();
-    if (num_devices == 0) { Error("no devices found"); }
+    if (num_devices == 0) {
+      throw RuntimeError("Device: no devices found");
+    }
+    if (device_id >= num_devices) {
+      throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
+    }
+
    auto devices = std::vector<cl_device_id>(num_devices);
    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
                              devices.data(), nullptr));
-    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
    device_ = devices[device_id];
  }

@ -282,7 +313,8 @@ class Device {
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
-    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
+    result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
+    return result;
  }
 };

@ -300,11 +332,11 @@ class Context {

  // Regular constructor with memory management
  explicit Context(const Device &device):
-      context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
+      context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
    auto status = CL_SUCCESS;
    const cl_device_id dev = device();
    *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateContext");
  }

  // Accessor to the private data-member
@ -329,18 +361,18 @@ class Program {

  // Source-based constructor with memory management
  explicit Program(const Context &context, std::string source):
-      program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+      program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
      length_(source.length()),
      source_(std::move(source)),
      source_ptr_(&source_[0]) {
    auto status = CL_SUCCESS;
    *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateProgramWithSource");
  }

  // Binary-based constructor with memory management
  explicit Program(const Device &device, const Context &context, const std::string& binary):
-      program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+      program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
      length_(binary.length()),
      source_(binary),
      source_ptr_(&source_[0]) {
@ -350,25 +382,15 @@ class Program {
    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
                                          reinterpret_cast<const unsigned char**>(&source_ptr_),
                                          &status1, &status2);
-    CheckError(status1);
-    CheckError(status2);
+    CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
+    CLError::Check(status2, "clCreateProgramWithBinary");
  }

  // Compiles the device program; a failing build is reported by throwing a CLError
-  BuildStatus Build(const Device &device, std::vector<std::string> &options) {
+  void Build(const Device &device, std::vector<std::string> &options) {
    auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
    const cl_device_id dev = device();
-    auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
-    if (status == CL_BUILD_PROGRAM_FAILURE) {
-      return BuildStatus::kError;
-    }
-    else if (status == CL_INVALID_BINARY) {
-      return BuildStatus::kInvalid;
-    }
-    else {
-      CheckError(status);
-      return BuildStatus::kSuccess;
-    }
+    CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
  }

  // Retrieves the warning/error message from the compiler (if any)
@ -416,7 +438,7 @@ class Queue {

  // Regular constructor with memory management
  explicit Queue(const Context &context, const Device &device):
-      queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
+      queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
                                                             delete s; }) {
    auto status = CL_SUCCESS;
    #ifdef CL_VERSION_2_0
@ -425,15 +447,17 @@ class Queue {
      {
        cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+        CLError::Check(status, "clCreateCommandQueueWithProperties");
      }
      else
      {
        *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+        CLError::Check(status, "clCreateCommandQueue");
      }
    #else
      *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+      CLError::Check(status, "clCreateCommandQueue");
    #endif
-    CheckError(status);
  }

  // Synchronizes the queue
@ -525,7 +549,7 @@ class Buffer {
    if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
    auto status = CL_SUCCESS;
    *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateBuffer");
  }

  // As above, but now with read/write access as a default
@ -546,18 +570,24 @@ class Buffer {

  // Copies from device to host: reading the device buffer a-synchronously
  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
-    if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
+    if (access_ == BufferAccess::kWriteOnly) {
+      throw LogicError("Buffer: reading from a write-only buffer");
+    }
    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                   host, 0, nullptr, nullptr));
  }
  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
                 const size_t offset = 0) const {
-    if (host.size() < size) { Error("target host buffer is too small"); }
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
    ReadAsync(queue, size, host.data(), offset);
  }
  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
                 const size_t offset = 0) const {
-    if (host.size() < size) { Error("target host buffer is too small"); }
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
    ReadAsync(queue, size, host.data(), offset);
  }

@ -577,8 +607,12 @@ class Buffer {

  // Copies from host to device: writing the device buffer a-synchronously
  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
-    if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
-    if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
+    if (access_ == BufferAccess::kReadOnly) {
+      throw LogicError("Buffer: writing to a read-only buffer");
+    }
+    if (GetSize() < (offset+size)*sizeof(T)) {
+      throw LogicError("Buffer: target device buffer is too small");
+    }
    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                    host, 0, nullptr, nullptr));
  }
@ -644,10 +678,10 @@ class Kernel {

  // Regular constructor with memory management
  explicit Kernel(const Program &program, const std::string &name):
-      kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
+      kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
    auto status = CL_SUCCESS;
    *kernel_ = clCreateKernel(program(), name.c_str(), &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateKernel");
  }

  // Sets a kernel argument at the indicated position
--- a/src/cxpp11_common.hpp
+++ b/src/cxpp11_common.hpp
@ -0,0 +1,109 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the
+// CLCudaAPI project. See 'clpp11.hpp' for more details.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CXPP11_COMMON_H_
+#define CLBLAST_CXPP11_COMMON_H_
+
+#include <cstring>   // std::strchr
+#include <stdexcept> // std::runtime_error
+#include <string>    // std::string
+#include <utility>   // std::forward
+
+namespace clblast {
+// =================================================================================================
+
+// Root of the library's exception hierarchy. Wrapping a standard exception type in this template
+// marks an error as raised by our own code rather than by the C++ runtime.
+template <typename Base>
+class Error : public Base {
+ public:
+  // Every constructor argument is handed through to Base unchanged. VS 2013 does not support
+  // inheriting constructors, so "using Base::Base" cannot be used here.
+  template <typename... Params>
+  Error(Params&&... params)
+      : Base(std::forward<Params>(params)...) {}
+};
+
+// =================================================================================================
+
+// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
+class DeviceError : public Error<std::runtime_error> {
+ public:
+  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
+  // supported by VS 2013
+  template <typename... Args>
+  DeviceError(Args&&... args):
+      Error<std::runtime_error>(std::forward<Args>(args)...) {
+  }
+
+  // Reduces a stringified API call such as "clFoo(a, b)" to the bare function name "clFoo" by
+  // cutting at the first '('. Text without a parenthesis is returned unchanged. A null pointer
+  // yields an empty string rather than being passed on to strchr/std::string.
+  static std::string TrimCallString(const char *where) {
+    if (where == nullptr) { return std::string(); }
+    const char *paren = std::strchr(where, '(');
+    return (paren != nullptr) ? std::string(where, paren) : std::string(where);
+  }
+};
+
+// =================================================================================================
+
+// Generic run-time failure, i.e. a problem with the environment the library runs in
+class RuntimeError : public Error<std::runtime_error> {
+ public:
+  // The supplied reason is prefixed so the kind of error is visible in the what() message
+  explicit RuntimeError(const std::string &reason)
+      : Error<std::runtime_error>("Run-time error: " + reason) {}
+};
+
+// =================================================================================================
+
+// Generic logic failure, i.e. an assertion inside the library that did not hold
+class LogicError : public Error<std::logic_error> {
+ public:
+  // The supplied reason is prefixed so the kind of error is visible in the what() message
+  explicit LogicError(const std::string &reason)
+      : Error<std::logic_error>("Internal logic error: " + reason) {}
+};
+
+// =================================================================================================
+
+// Internal exception base that stores a status value and a subclass-specific "details" string
+// next to the message passed to Base; the two stored fields can be used to recreate an exception
+template <typename Base, typename Status>
+class ErrorCode : public Base {
+ public:
+  // 'reason' becomes the message of Base; 'status' and 'details' are kept as given
+  ErrorCode(Status status, const std::string &details, const std::string &reason)
+      : Base(reason), status_(status), details_(details) {}
+
+  // Status value supplied at construction
+  Status status() const { return status_; }
+
+  // Details string supplied at construction
+  const std::string& details() const { return details_; }
+
+ private:
+  const Status status_;
+  const std::string details_;
+};
+
+// =================================================================================================
+
+} // namespace clblast
+
+// CLBLAST_CXPP11_COMMON_H_
+#endif
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "utilities.hpp"
+#include "utilities/utilities.hpp"

 #include "database/database.hpp"
 #include "database/kernels/xaxpy.hpp"
@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
      }
    }

-    if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
+    if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
  }
 }

--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@ -21,7 +21,7 @@
 #include <vector>
 #include <unordered_map>

-#include "utilities.hpp"
+#include "utilities/utilities.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@ -43,6 +43,7 @@ const Database::DatabaseEntry CopySingle = {
        { "Oland",                                           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
        { "Pitcairn",                                        { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
        { "Tahiti",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+        { "Tonga",                                           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
      }
    },
@ -89,6 +90,7 @@ const Database::DatabaseEntry CopySingle = {
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
+        { "GeForce GTX TITAN Black",                         { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
        { "Oland",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Pitcairn",                                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+        { "Tonga",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
@ -152,6 +155,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
        { "GeForce GTX 750",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -177,6 +181,7 @@ const Database::DatabaseEntry CopyDouble = {
        { "Oland",                                           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
        { "Pitcairn",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "Tonga",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
      }
    },
@ -211,15 +216,16 @@ const Database::DatabaseEntry CopyDouble = {
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
      }
    },
  }
@ -236,6 +242,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
        { "Oland",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Pitcairn",                                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "Tonga",                                           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
@ -270,6 +277,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@ -43,7 +43,8 @@ const Database::DatabaseEntry PadSingle = {
        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Pitcairn",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Tonga",                                           { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
      }
    },
    { // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry PadSingle = {
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry PadComplexSingle = {
        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Pitcairn",                                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tahiti",                                          { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tonga",                                           { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
@ -160,10 +163,11 @@ const Database::DatabaseEntry PadComplexSingle = {
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
      }
    },
    { // Default
@ -185,7 +189,8 @@ const Database::DatabaseEntry PadDouble = {
        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Pitcairn",                                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Tonga",                                           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // ARM GPUs
@ -219,6 +224,7 @@ const Database::DatabaseEntry PadDouble = {
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -244,7 +250,8 @@ const Database::DatabaseEntry PadComplexDouble = {
        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "Pitcairn",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tahiti",                                          { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tonga",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // ARM GPUs
@ -278,6 +285,7 @@ const Database::DatabaseEntry PadComplexDouble = {
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "GeForce GTX TITAN Black",                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@ -43,6 +43,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
        { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+        { "Tonga",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
      }
    },
@ -89,6 +90,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Tonga",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
      }
    },
@ -160,6 +163,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -185,6 +189,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
        { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Tonga",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
      }
    },
@ -219,6 +224,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -244,6 +250,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+        { "Tonga",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
      }
    },
@ -278,6 +285,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@ -43,7 +43,8 @@ const Database::DatabaseEntry TransposeSingle = {
        { "Oland",                                           { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
        { "Pitcairn",                                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+        { "Tonga",                                           { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
      }
    },
    { // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry TransposeSingle = {
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "GeForce GTX TITAN Black",                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Tesla K20m",                                      { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Tesla K40m",                                      { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
        { "Oland",                                           { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "Pitcairn",                                        { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "Tonga",                                           { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
      }
    },
@ -154,6 +157,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -162,7 +166,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
  }
@ -179,6 +183,7 @@ const Database::DatabaseEntry TransposeDouble = {
        { "Oland",                                           { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "Pitcairn",                                        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+        { "Tonga",                                           { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
      }
    },
@ -213,6 +218,7 @@ const Database::DatabaseEntry TransposeDouble = {
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -238,7 +244,8 @@ const Database::DatabaseEntry TransposeComplexDouble = {
        { "Oland",                                           { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "Pitcairn",                                        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "Tonga",                                           { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
      }
    },
    { // ARM GPUs
@ -266,6 +273,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@ -43,7 +43,8 @@ const Database::DatabaseEntry XaxpySingle = {
        { "Oland",                                           { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Pitcairn",                                        { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",2}, {"WGS",64}, {"WPT",1} } },
-        { "default",                                         { {"VW",2}, {"WGS",256}, {"WPT",1} } },
+        { "Tonga",                                           { {"VW",1}, {"WGS",256}, {"WPT",8} } },
+        { "default",                                         { {"VW",2}, {"WGS",64}, {"WPT",2} } },
      }
    },
    { // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry XaxpySingle = {
        { "GeForce GTX 750 Ti",                              { {"VW",2}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"VW",4}, {"WGS",128}, {"WPT",4} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
@ -97,7 +99,7 @@ const Database::DatabaseEntry XaxpySingle = {
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",4}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",4}, {"WGS",256}, {"WPT",1} } },
      }
    },
  }
@ -114,6 +116,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
        { "Oland",                                           { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Pitcairn",                                        { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Tonga",                                           { {"VW",1}, {"WGS",256}, {"WPT",8} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
@ -160,6 +163,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"VW",1}, {"WGS",128}, {"WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -185,6 +189,7 @@ const Database::DatabaseEntry XaxpyDouble = {
        { "Oland",                                           { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Pitcairn",                                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Tonga",                                           { {"VW",1}, {"WGS",128}, {"WPT",4} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
@ -219,15 +224,16 @@ const Database::DatabaseEntry XaxpyDouble = {
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
+        { "GeForce GTX TITAN Black",                         { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "default",                                         { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",2} } },
      }
    },
  }
@ -244,6 +250,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
        { "Oland",                                           { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "Pitcairn",                                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "Tonga",                                           { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
@ -278,6 +285,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",2} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
+        { "GeForce GTX TITAN Black",                         { {"VW",1}, {"WGS",128}, {"WPT",4} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS",64}, {"WPT",1} } },
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@ -42,6 +42,7 @@ const Database::DatabaseEntry XdotSingle = {
        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
        { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",32} } },
        { "Tahiti",                                          { {"WGS1",128}, {"WGS2",32} } },
+        { "Tonga",                                           { {"WGS1",64}, {"WGS2",32} } },
        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
      }
    },
@ -72,6 +73,7 @@ const Database::DatabaseEntry XdotSingle = {
        { "GeForce GTX 750",                                 { {"WGS1",128}, {"WGS2",32} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
        { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",512}, {"WGS2",64} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
        { "Tesla K20m",                                      { {"WGS1",1024}, {"WGS2",32} } },
        { "default",                                         { {"WGS1",256}, {"WGS2",256} } },
@ -95,7 +97,8 @@ const Database::DatabaseEntry XdotComplexSingle = {
        { "Oland",                                           { {"WGS1",128}, {"WGS2",32} } },
        { "Pitcairn",                                        { {"WGS1",256}, {"WGS2",32} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "Tonga",                                           { {"WGS1",256}, {"WGS2",64} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",64} } },
      }
    },
    { // Intel CPUs
@ -125,6 +128,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",32} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
        { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",128}, {"WGS2",64} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
        { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
        { "default",                                         { {"WGS1",512}, {"WGS2",64} } },
@ -148,7 +152,8 @@ const Database::DatabaseEntry XdotDouble = {
        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
        { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",32} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "Tonga",                                           { {"WGS1",128}, {"WGS2",64} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",64} } },
      }
    },
    { // Intel CPUs
@ -167,9 +172,10 @@ const Database::DatabaseEntry XdotDouble = {
        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",256} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",128}, {"WGS2",64} } },
        { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",128}, {"WGS2",64} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
        { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",64} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",64} } },
      }
    },
    { // Default
@ -190,6 +196,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
        { "Pitcairn",                                        { {"WGS1",256}, {"WGS2",32} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WGS2",32} } },
+        { "Tonga",                                           { {"WGS1",128}, {"WGS2",64} } },
        { "default",                                         { {"WGS1",256}, {"WGS2",32} } },
      }
    },
@ -209,6 +216,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
        { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",32} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
        { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",128}, {"WGS2",32} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",128}, {"WGS2",32} } },
        { "Tesla K20m",                                      { {"WGS1",128}, {"WGS2",32} } },
        { "default",                                         { {"WGS1",128}, {"WGS2",64} } },
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@ -36,6 +36,7 @@ const Database::DatabaseEntry XgemmSingle = {
        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
        { "Pitcairn",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+        { "Tonga",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemmSingle = {
        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
        { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX TITAN Black",                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
        { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
        { "Oland",                                           { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
        { "Pitcairn",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
        { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "Tonga",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
@ -153,6 +156,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "GeForce GTX TITAN Black",                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
        { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -178,6 +182,7 @@ const Database::DatabaseEntry XgemmDouble = {
        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
        { "Pitcairn",                                        { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+        { "Tonga",                                           { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
@ -212,6 +217,7 @@ const Database::DatabaseEntry XgemmDouble = {
        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
        { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX TITAN Black",                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K40m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@ -237,6 +243,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "Pitcairn",                                        { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "Tonga",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
@ -270,6 +277,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
        { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX TITAN Black",                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
--- a/src/database/kernels/xgemm_direct.hpp
+++ b/src/database/kernels/xgemm_direct.hpp
@ -19,7 +19,7 @@ const Database::DatabaseEntry XgemmDirectHalf = {
  "XgemmDirect", Precision::kHalf, {
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
      }
    },
  }
@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemmDirectSingle = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "Tonga",                                           { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
    { // Intel GPUs
@ -44,12 +45,13 @@ const Database::DatabaseEntry XgemmDirectSingle = {
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+        { "GeForce GTX TITAN Black",                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
      }
    },
  }
@ -62,7 +64,8 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "Tonga",                                           { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
      }
    },
    { // Intel GPUs
@ -74,12 +77,13 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
-        { "default",                                         { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+        { "GeForce GTX TITAN Black",                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
      }
    },
  }
@ -92,18 +96,20 @@ const Database::DatabaseEntry XgemmDirectDouble = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "Tonga",                                           { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "GeForce GTX TITAN Black",                         { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
  }
@ -116,18 +122,20 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "Tonga",                                           { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
-        { "default",                                         { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "GeForce GTX TITAN Black",                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
      }
    },
  }
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvSingle = {
        { "Oland",                                           { {"WGS1",128}, {"WPT1",1} } },
        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "Tonga",                                           { {"WGS1",128}, {"WPT1",2} } },
        { "default",                                         { {"WGS1",128}, {"WPT1",1} } },
      }
    },
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvSingle = {
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
        { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",256}, {"WPT1",1} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1} } },
        { "Tesla K20m",                                      { {"WGS1",128}, {"WPT1",1} } },
        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
        { "Oland",                                           { {"WGS1",64}, {"WPT1",1} } },
        { "Pitcairn",                                        { {"WGS1",64}, {"WPT1",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WPT1",1} } },
+        { "Tonga",                                           { {"WGS1",32}, {"WPT1",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
      }
    },
@ -145,6 +148,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
        { "GeForce GTX 750",                                 { {"WGS1",128}, {"WPT1",1} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",32}, {"WPT1",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
      }
    },
@ -167,6 +171,7 @@ const Database::DatabaseEntry XgemvDouble = {
        { "Oland",                                           { {"WGS1",256}, {"WPT1",1} } },
        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "Tonga",                                           { {"WGS1",32}, {"WPT1",1} } },
        { "default",                                         { {"WGS1",256}, {"WPT1",1} } },
      }
    },
@ -194,6 +199,7 @@ const Database::DatabaseEntry XgemvDouble = {
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
        { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",32}, {"WPT1",1} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1} } },
        { "Tesla K20m",                                      { {"WGS1",256}, {"WPT1",1} } },
        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1} } },
@ -219,6 +225,7 @@ const Database::DatabaseEntry XgemvComplexDouble = {
        { "Oland",                                           { {"WGS1",256}, {"WPT1",1} } },
        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "Tonga",                                           { {"WGS1",64}, {"WPT1",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
      }
    },
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tonga",                                           { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
      }
    },
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
        { "GeForce GTX 750 Ti",                              { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
        { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
        { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX TITAN Black",                         { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
        { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Tesla K20m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
        { "Tesla K40m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Tahiti",                                          { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "Tonga",                                           { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
      }
    },
@ -164,6 +167,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tonga",                                           { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
      }
    },
@ -191,6 +195,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
        { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX TITAN Black",                         { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
        { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
        { "Tesla K20m",                                      { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
        { "Tesla K40m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -216,6 +221,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
        { "Oland",                                           { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tonga",                                           { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
      }
    },
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
-        { "default",                                         { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
+        { "Tonga",                                           { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
      }
    },
    { // Intel CPUs
@ -55,6 +56,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
        { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "GeForce GTX TITAN Black",                         { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
      }
    },
@ -73,7 +75,8 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+        { "Tonga",                                           { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
      }
    },
    { // Intel CPUs
@ -107,6 +110,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+        { "Tonga",                                           { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
        { "default",                                         { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
      }
    },
@ -120,6 +124,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 750 Ti",                              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
        { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "GeForce GTX TITAN Black",                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
      }
    },
@ -138,7 +143,8 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
+        { "Tonga",                                           { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
      }
    },
    { // Intel CPUs
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@ -43,7 +43,8 @@ const Database::DatabaseEntry XgerSingle = {
        { "Oland",                                           { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+        { "Tonga",                                           { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
@ -80,6 +81,7 @@ const Database::DatabaseEntry XgerSingle = {
        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
        { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
      }
    },
@ -102,7 +104,8 @@ const Database::DatabaseEntry XgerComplexSingle = {
        { "Oland",                                           { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
        { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
+        { "Tonga",                                           { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
@ -139,12 +142,13 @@ const Database::DatabaseEntry XgerComplexSingle = {
        { "GeForce GTX 750",                                 { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
        { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
      }
    },
  }
@ -161,7 +165,8 @@ const Database::DatabaseEntry XgerDouble = {
        { "Oland",                                           { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
        { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+        { "Tonga",                                           { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
@ -187,6 +192,7 @@ const Database::DatabaseEntry XgerDouble = {
        { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
      }
    },
@ -209,6 +215,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
        { "Oland",                                           { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
        { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+        { "Tonga",                                           { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
      }
    },
@ -235,6 +242,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
        { "GeForce GTX 750",                                 { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
        { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX TITAN Black",                         { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
        { "default",                                         { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
      }
    },
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
  const real beta = GetRealArg(arg_beta);

  // Skip these threads if they do not contain threads contributing to the upper-triangle
-  if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+  if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
    return;
  }

@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
  const real beta = GetRealArg(arg_beta);

  // Skip these threads if they do not contain threads contributing to the lower-triangle
-  if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+  if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
    return;
  }

--- a/src/routine.cpp
+++ b/src/routine.cpp
@ -21,10 +21,11 @@
 namespace clblast {
 // =================================================================================================

-// Constructor: not much here, because no status codes can be returned
+// The constructor does all heavy work, errors are returned as exceptions
 Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
                 const std::vector<std::string> &routines, const Precision precision,
-                 const std::vector<const Database::DatabaseEntry*> &userDatabase):
+                 const std::vector<const Database::DatabaseEntry*> &userDatabase,
+                 std::initializer_list<const char *> source):
    precision_(precision),
    routine_name_(name),
    queue_(queue),
@ -33,15 +34,9 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
    device_(queue_.GetDevice()),
    device_name_(device_.Name()),
    db_(queue_, routines, precision_, userDatabase) {
-}
-
-// =================================================================================================
-
-// Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp() {

  // Queries the cache to see whether or not the program (context-specific) is already there
-  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+  if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }

  // Sets the build options from an environmental variable (if set)
  auto options = std::vector<std::string>();
@ -53,13 +48,11 @@ StatusCode Routine::SetUp() {
  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created and stored in the cache
  if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
-    try {
-      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
-      auto program = Program(device_, context_, binary);
-      program.Build(device_, options);
-      StoreProgramToCache(program, context_, precision_, routine_name_);
-    } catch (...) { return StatusCode::kBuildProgramFailure; }
-    return StatusCode::kSuccess;
+    auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
+    auto program = Program(device_, context_, binary);
+    program.Build(device_, options);
+    StoreProgramToCache(program, context_, precision_, routine_name_);
+    return;  // The program is now cached: do not fall through to a full compilation from source
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -69,48 +61,50 @@ StatusCode Routine::SetUp() {
  const auto extensions = device_.Capabilities();
  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
-      return StatusCode::kNoDoublePrecision;
+      throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
    }
  }

  // As above, but for cl_khr_fp16 (half precision)
  if (precision_ == Precision::kHalf) {
    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
-      return StatusCode::kNoHalfPrecision;
+      throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
    }
  }

-  // Loads the common header (typedefs and defines and such)
-  std::string common_header =
-    #include "kernels/common.opencl"
-  ;
-
  // Collects the parameters for this device in the form of defines, and adds the precision
-  auto defines = db_.GetDefines();
-  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+  auto source_string = db_.GetDefines();
+  source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

  // Adds the name of the routine as a define
-  defines += "#define ROUTINE_"+routine_name_+"\n";
+  source_string += "#define ROUTINE_"+routine_name_+"\n";

  // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device_.IsAMD() && device_.IsGPU()) {
-    defines += "#define USE_CL_MAD 1\n";
+    source_string += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device_.IsAMD() && device_.IsGPU()) {
-    defines += "#define USE_STAGGERED_INDICES 1\n";
+    source_string += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device_.IsARM() && device_.IsGPU()) {
-    defines += "#define GLOBAL_MEM_FENCE 1\n";
+    source_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

-  // Combines everything together into a single source string
-  const auto source_string = defines + common_header + source_string_;
+  // Loads the common header (typedefs and defines and such)
+  source_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Adds routine-specific code to the constructed source string
+  for (const char *s: source) {
+    source_string += s;
+  }

  // Prints details of the routine to compile in case of debugging in verbose mode
  #ifdef VERBOSE
@ -120,23 +114,21 @@ StatusCode Routine::SetUp() {
  #endif

  // Compiles the kernel
+  auto program = Program(context_, source_string);
  try {
-    auto program = Program(context_, source_string);
-    const auto build_status = program.Build(device_, options);
-
-    // Checks for compiler crashes/errors/warnings
-    if (build_status == BuildStatus::kError) {
-      const auto message = program.GetBuildInfo(device_);
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-      return StatusCode::kBuildProgramFailure;
+    program.Build(device_, options);
+  } catch (const CLError &e) {
+    if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device_).c_str());
    }
-    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+    throw;
+  }

-    // Store the compiled binary and program in the cache
-    const auto binary = program.GetIR();
-    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
-    StoreProgramToCache(program, context_, precision_, routine_name_);
-  } catch (...) { return StatusCode::kBuildProgramFailure; }
+  // Store the compiled binary and program in the cache
+  const auto binary = program.GetIR();
+  StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
+  StoreProgramToCache(program, context_, precision_, routine_name_);

  // Prints the elapsed compilation time in case of debugging in verbose mode
  #ifdef VERBOSE
@ -144,9 +136,6 @@ StatusCode Routine::SetUp() {
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
  #endif
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/routine.hpp
+++ b/src/routine.hpp
@ -19,9 +19,9 @@
 #include <string>
 #include <vector>

-#include "utilities.hpp"
+#include "utilities/utilities.hpp"
 #include "cache.hpp"
-#include "buffer_test.hpp"
+#include "utilities/buffer_test.hpp"
 #include "database/database.hpp"
 #include "routines/common.hpp"

@ -34,21 +34,19 @@ class Routine {

  // Base class constructor. The user database is an optional extra database to override the
  // built-in database.
+  // All heavy preparation work is done inside this constructor.
  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
                   const std::vector<std::string> &routines, const Precision precision,
-                   const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
-
-  // Set-up phase of the kernel
-  StatusCode SetUp();
+                   const std::vector<const Database::DatabaseEntry*> &userDatabase,
+                   std::initializer_list<const char *> source);

 protected:

  // Non-static variable for the precision
  const Precision precision_;

-  // The routine's name and its kernel-source in string form
+  // The routine's name
  const std::string routine_name_;
-  std::string source_string_;

  // The OpenCL objects, accessible only from derived classes
  Queue queue_;
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@ -20,22 +20,26 @@ namespace clblast {
 // =================================================================================================

 // Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, const std::vector<Event> &waitForEvents) {
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+               std::vector<size_t> global, const std::vector<size_t> &local,
+               EventPointer event, const std::vector<Event> &waitForEvents) {

  if (!local.empty()) {
    // Tests for validity of the local thread sizes
    if (local.size() > device.MaxWorkItemDimensions()) {
-      return StatusCode::kInvalidLocalNumDimensions;
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
    }
    const auto max_work_item_sizes = device.MaxWorkItemSizes();
    for (auto i=size_t{0}; i<local.size(); ++i) {
-      if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
    }
    auto local_size = size_t{1};
    for (auto &item: local) { local_size *= item; }
-    if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }

    // Make sure the global thread sizes are at least equal to the local sizes
    for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,

  // Tests for local memory usage
  const auto local_mem_usage = kernel.LocalMemUsage(device);
-  if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }

  // Prints the name of the kernel to launch in case of debugging in verbose mode
  #ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
  #endif

  // Launches the kernel (and checks for launch errors)
-  try {
-    kernel.Launch(queue, global, local, event, waitForEvents);
-  } catch (...) { return StatusCode::kKernelLaunchError; }
+  kernel.Launch(queue, global, local, event, waitForEvents);

  // Prints the elapsed execution time in case of debugging in verbose mode
  #ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
  #endif
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@ -27,29 +27,29 @@ namespace clblast {
 // =================================================================================================

 // Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, const std::vector<Event> &waitForEvents = {});
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+               std::vector<size_t> global, const std::vector<size_t> &local,
+               EventPointer event, const std::vector<Event> &waitForEvents = {});

 // =================================================================================================

 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
 // to write to symmetric and triangular matrices through optional arguments.
 template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
-                                  const Database &db,
-                                  EventPointer event, const std::vector<Event> &waitForEvents,
-                                  const size_t src_one, const size_t src_two,
-                                  const size_t src_ld, const size_t src_offset,
-                                  const Buffer<T> &src,
-                                  const size_t dest_one, const size_t dest_two,
-                                  const size_t dest_ld, const size_t dest_offset,
-                                  const Buffer<T> &dest,
-                                  const T alpha,
-                                  const Program &program, const bool do_pad,
-                                  const bool do_transpose, const bool do_conjugate,
-                                  const bool upper = false, const bool lower = false,
-                                  const bool diagonal_imag_zero = false) {
+void PadCopyTransposeMatrix(Queue &queue, const Device &device,
+                            const Database &db,
+                            EventPointer event, const std::vector<Event> &waitForEvents,
+                            const size_t src_one, const size_t src_two,
+                            const size_t src_ld, const size_t src_offset,
+                            const Buffer<T> &src,
+                            const size_t dest_one, const size_t dest_two,
+                            const size_t dest_ld, const size_t dest_offset,
+                            const Buffer<T> &dest,
+                            const T alpha,
+                            const Program &program, const bool do_pad,
+                            const bool do_transpose, const bool do_conjugate,
+                            const bool upper = false, const bool lower = false,
+                            const bool diagonal_imag_zero = false) {

  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -61,8 +61,8 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
  if (do_transpose) {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db["TRA_WPT"]) &&
-        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
-        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
      kernel_name = "TransposeMatrixFast";
    }
    else {
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
  }

  // Retrieves the kernel from the compiled binary
-  try {
-    auto kernel = Kernel(program, kernel_name);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(src_ld));
+    kernel.SetArgument(1, src());
+    kernel.SetArgument(2, dest());
+    kernel.SetArgument(3, GetRealArg(alpha));
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(src_one));
+    kernel.SetArgument(1, static_cast<int>(src_two));
+    kernel.SetArgument(2, static_cast<int>(src_ld));
+    kernel.SetArgument(3, static_cast<int>(src_offset));
+    kernel.SetArgument(4, src());
+    kernel.SetArgument(5, static_cast<int>(dest_one));
+    kernel.SetArgument(6, static_cast<int>(dest_two));
+    kernel.SetArgument(7, static_cast<int>(dest_ld));
+    kernel.SetArgument(8, static_cast<int>(dest_offset));
+    kernel.SetArgument(9, dest());
+    kernel.SetArgument(10, GetRealArg(alpha));
+    if (do_pad) {
+      kernel.SetArgument(11, static_cast<int>(do_conjugate));
+    }
+    else {
+      kernel.SetArgument(11, static_cast<int>(upper));
+      kernel.SetArgument(12, static_cast<int>(lower));
+      kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+    }
+  }
+
+  // Launches the kernel; RunKernel throws on failure instead of returning an error code. Uses
+  // global and local thread sizes based on parameters in the database.
+  if (do_transpose) {
    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(src_ld));
-      kernel.SetArgument(1, src());
-      kernel.SetArgument(2, dest());
-      kernel.SetArgument(3, GetRealArg(alpha));
+      const auto global = std::vector<size_t>{
+        dest_one / db["TRA_WPT"],
+        dest_two / db["TRA_WPT"]
+      };
+      const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
-      kernel.SetArgument(0, static_cast<int>(src_one));
-      kernel.SetArgument(1, static_cast<int>(src_two));
-      kernel.SetArgument(2, static_cast<int>(src_ld));
-      kernel.SetArgument(3, static_cast<int>(src_offset));
-      kernel.SetArgument(4, src());
-      kernel.SetArgument(5, static_cast<int>(dest_one));
-      kernel.SetArgument(6, static_cast<int>(dest_two));
-      kernel.SetArgument(7, static_cast<int>(dest_ld));
-      kernel.SetArgument(8, static_cast<int>(dest_offset));
-      kernel.SetArgument(9, dest());
-      kernel.SetArgument(10, GetRealArg(alpha));
-      if (do_pad) {
-        kernel.SetArgument(11, static_cast<int>(do_conjugate));
-      }
-      else {
-        kernel.SetArgument(11, static_cast<int>(upper));
-        kernel.SetArgument(12, static_cast<int>(lower));
-        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
-      }
+      const auto global = std::vector<size_t>{
+        Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+        Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+      };
+      const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
-
-    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
-    // parameters in the database.
-    if (do_transpose) {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db["TRA_WPT"],
-          dest_two / db["TRA_WPT"]
-        };
-        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
-          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
-        };
-        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
+  }
+  else {
+    if (use_fast_kernel) {
+      const auto global = std::vector<size_t>{
+        dest_one / db["COPY_VW"],
+        dest_two / db["COPY_WPT"]
+      };
+      const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db["COPY_VW"],
-          dest_two / db["COPY_WPT"]
-        };
-        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
-          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
-        };
-        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
+      const auto global = std::vector<size_t>{
+        Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+        Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+      };
+      const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@ -22,74 +22,64 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xamax.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xamax<T>::DoAmax(const size_t n,
-                            const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xamax<T>::DoAmax(const size_t n,
+                      const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorIndex(1, imax_buffer, imax_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorIndex(1, imax_buffer, imax_offset);

  // Retrieves the Xamax kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xamax");
-    auto kernel2 = Kernel(program, "XamaxEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xamax");
+  auto kernel2 = Kernel(program, "XamaxEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer1 = Buffer<T>(context_, temp_size);
-    auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer1 = Buffer<T>(context_, temp_size);
+  auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer1());
-    kernel1.SetArgument(5, temp_buffer2());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer1());
+  kernel1.SetArgument(5, temp_buffer2());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer1());
-    kernel2.SetArgument(1, temp_buffer2());
-    kernel2.SetArgument(2, imax_buffer());
-    kernel2.SetArgument(3, static_cast<int>(imax_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer1());
+  kernel2.SetArgument(1, temp_buffer2());
+  kernel2.SetArgument(2, imax_buffer());
+  kernel2.SetArgument(3, static_cast<int>(imax_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xamax.hpp
+++ b/src/routines/level1/xamax.hpp
@ -28,9 +28,9 @@ class Xamax: public Routine {
  Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");

  // Templated-precision implementation of the routine
-  StatusCode DoAmax(const size_t n,
-                    const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoAmax(const size_t n,
+              const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@ -22,71 +22,61 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xasum.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xasum<T>::DoAsum(const size_t n,
-                            const Buffer<T> &asum_buffer, const size_t asum_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xasum<T>::DoAsum(const size_t n,
+                      const Buffer<T> &asum_buffer, const size_t asum_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, asum_buffer, asum_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorScalar(1, asum_buffer, asum_offset);

  // Retrieves the Xasum kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xasum");
-    auto kernel2 = Kernel(program, "XasumEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xasum");
+  auto kernel2 = Kernel(program, "XasumEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, asum_buffer());
-    kernel2.SetArgument(2, static_cast<int>(asum_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, asum_buffer());
+  kernel2.SetArgument(2, static_cast<int>(asum_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xasum.hpp
+++ b/src/routines/level1/xasum.hpp
@ -28,9 +28,9 @@ class Xasum: public Routine {
  Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");

  // Templated-precision implementation of the routine
-  StatusCode DoAsum(const size_t n,
-                    const Buffer<T> &asum_buffer, const size_t asum_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoAsum(const size_t n,
+              const Buffer<T> &asum_buffer, const size_t asum_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@ -22,29 +22,26 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xaxpy.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";

  // Retrieves the Xaxpy kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, GetRealArg(alpha));
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, GetRealArg(alpha));
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-      kernel.SetArgument(5, y_buffer());
-      kernel.SetArgument(6, static_cast<int>(y_offset));
-      kernel.SetArgument(7, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, static_cast<int>(x_offset));
+    kernel.SetArgument(4, static_cast<int>(x_inc));
+    kernel.SetArgument(5, y_buffer());
+    kernel.SetArgument(6, static_cast<int>(y_offset));
+    kernel.SetArgument(7, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xaxpy.hpp
+++ b/src/routines/level1/xaxpy.hpp
@ -28,9 +28,9 @@ class Xaxpy: public Routine {
  Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");

  // Templated-precision implementation of the routine
-  StatusCode DoAxpy(const size_t n, const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoAxpy(const size_t n, const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@ -22,29 +22,26 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xcopy.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xcopy<T>::DoCopy(const size_t n,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xcopy<T>::DoCopy(const size_t n,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
  auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";

  // Retrieves the Xcopy kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, static_cast<int>(x_offset));
-      kernel.SetArgument(3, static_cast<int>(x_inc));
-      kernel.SetArgument(4, y_buffer());
-      kernel.SetArgument(5, static_cast<int>(y_offset));
-      kernel.SetArgument(6, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, static_cast<int>(x_offset));
+    kernel.SetArgument(3, static_cast<int>(x_inc));
+    kernel.SetArgument(4, y_buffer());
+    kernel.SetArgument(5, static_cast<int>(y_offset));
+    kernel.SetArgument(6, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xcopy.hpp
+++ b/src/routines/level1/xcopy.hpp
@ -28,9 +28,9 @@ class Xcopy: public Routine {
  Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");

  // Templated-precision implementation of the routine
-  StatusCode DoCopy(const size_t n,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoCopy(const size_t n,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@ -22,79 +22,68 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xdot.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xdot<T>::DoDot(const size_t n,
-                          const Buffer<T> &dot_buffer, const size_t dot_offset,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                          const bool do_conjugate) {
+void Xdot<T>::DoDot(const size_t n,
+                    const Buffer<T> &dot_buffer, const size_t dot_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const bool do_conjugate) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, dot_buffer, dot_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);
+  TestVectorScalar(1, dot_buffer, dot_offset);

  // Retrieves the Xdot kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xdot");
-    auto kernel2 = Kernel(program, "XdotEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xdot");
+  auto kernel2 = Kernel(program, "XdotEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, y_buffer());
-    kernel1.SetArgument(5, static_cast<int>(y_offset));
-    kernel1.SetArgument(6, static_cast<int>(y_inc));
-    kernel1.SetArgument(7, temp_buffer());
-    kernel1.SetArgument(8, static_cast<int>(do_conjugate));
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, y_buffer());
+  kernel1.SetArgument(5, static_cast<int>(y_offset));
+  kernel1.SetArgument(6, static_cast<int>(y_inc));
+  kernel1.SetArgument(7, temp_buffer());
+  kernel1.SetArgument(8, static_cast<int>(do_conjugate));

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, dot_buffer());
-    kernel2.SetArgument(2, static_cast<int>(dot_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, dot_buffer());
+  kernel2.SetArgument(2, static_cast<int>(dot_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xdot.hpp
+++ b/src/routines/level1/xdot.hpp
@ -28,11 +28,11 @@ class Xdot: public Routine {
  Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");

  // Templated-precision implementation of the routine
-  StatusCode DoDot(const size_t n,
-                   const Buffer<T> &dot_buffer, const size_t dot_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                   const bool do_conjugate = false);
+  void DoDot(const size_t n,
+             const Buffer<T> &dot_buffer, const size_t dot_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+             const bool do_conjugate = false);
 };

 // =================================================================================================
--- a/src/routines/level1/xdotc.cpp
+++ b/src/routines/level1/xdotc.cpp
@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xdotc<T>::DoDotc(const size_t n,
-                            const Buffer<T> &dot_buffer, const size_t dot_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
-  return DoDot(n, dot_buffer, dot_offset,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               true);
+void Xdotc<T>::DoDotc(const size_t n,
+                      const Buffer<T> &dot_buffer, const size_t dot_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+  DoDot(n, dot_buffer, dot_offset,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        true);
 }

 // =================================================================================================
--- a/src/routines/level1/xdotc.hpp
+++ b/src/routines/level1/xdotc.hpp
@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
  Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");

  // Templated-precision implementation of the routine
-  StatusCode DoDotc(const size_t n,
-                    const Buffer<T> &dot_buffer, const size_t dot_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoDotc(const size_t n,
+              const Buffer<T> &dot_buffer, const size_t dot_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xdotu.cpp
+++ b/src/routines/level1/xdotu.cpp
@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xdotu<T>::DoDotu(const size_t n,
-                            const Buffer<T> &dot_buffer, const size_t dot_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
-  return DoDot(n, dot_buffer, dot_offset,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               false);
+void Xdotu<T>::DoDotu(const size_t n,
+                      const Buffer<T> &dot_buffer, const size_t dot_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+  DoDot(n, dot_buffer, dot_offset,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        false);
 }

 // =================================================================================================
--- a/src/routines/level1/xdotu.hpp
+++ b/src/routines/level1/xdotu.hpp
@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
  Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");

  // Templated-precision implementation of the routine
-  StatusCode DoDotu(const size_t n,
-                    const Buffer<T> &dot_buffer, const size_t dot_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoDotu(const size_t n,
+              const Buffer<T> &dot_buffer, const size_t dot_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xmax.hpp
+++ b/src/routines/level1/xmax.hpp
@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {

  // Forwards to the regular absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoMax(const size_t n,
-                   const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+  void DoMax(const size_t n,
+             const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xmin.hpp
+++ b/src/routines/level1/xmin.hpp
@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {

  // Forwards to the regular max-absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoMin(const size_t n,
-                   const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+  void DoMin(const size_t n,
+             const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@ -22,71 +22,61 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xnrm2.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xnrm2<T>::DoNrm2(const size_t n,
-                            const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xnrm2<T>::DoNrm2(const size_t n,
+                      const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorScalar(1, nrm2_buffer, nrm2_offset);

  // Retrieves the Xnrm2 kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xnrm2");
-    auto kernel2 = Kernel(program, "Xnrm2Epilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xnrm2");
+  auto kernel2 = Kernel(program, "Xnrm2Epilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, nrm2_buffer());
-    kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, nrm2_buffer());
+  kernel2.SetArgument(2, static_cast<int>(nrm2_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xnrm2.hpp
+++ b/src/routines/level1/xnrm2.hpp
@ -28,9 +28,9 @@ class Xnrm2: public Routine {
  Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");

  // Templated-precision implementation of the routine
-  StatusCode DoNrm2(const size_t n,
-                    const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoNrm2(const size_t n,
+              const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@ -22,26 +22,24 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xscal.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xscal<T>::DoScal(const size_t n, const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vector for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
  auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";

  // Retrieves the Xscal kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(2, x_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, static_cast<int>(x_offset));
+    kernel.SetArgument(4, static_cast<int>(x_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xscal.hpp
+++ b/src/routines/level1/xscal.hpp
@ -28,8 +28,8 @@ class Xscal: public Routine {
  Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");

  // Templated-precision implementation of the routine
-  StatusCode DoScal(const size_t n, const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoScal(const size_t n, const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xsum.hpp
+++ b/src/routines/level1/xsum.hpp
@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {

  // Forwards to the regular absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoSum(const size_t n,
-                   const Buffer<T> &sum_buffer, const size_t sum_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+  void DoSum(const size_t n,
+             const Buffer<T> &sum_buffer, const size_t sum_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@ -22,29 +22,26 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xswap.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xswap<T>::DoSwap(const size_t n,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xswap<T>::DoSwap(const size_t n,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
  auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";

  // Retrieves the Xswap kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, static_cast<int>(x_offset));
-      kernel.SetArgument(3, static_cast<int>(x_inc));
-      kernel.SetArgument(4, y_buffer());
-      kernel.SetArgument(5, static_cast<int>(y_offset));
-      kernel.SetArgument(6, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, static_cast<int>(x_offset));
+    kernel.SetArgument(3, static_cast<int>(x_inc));
+    kernel.SetArgument(4, y_buffer());
+    kernel.SetArgument(5, static_cast<int>(y_offset));
+    kernel.SetArgument(6, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xswap.hpp
+++ b/src/routines/level1/xswap.hpp
@ -28,9 +28,9 @@ class Xswap: public Routine {
  Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");

  // Templated-precision implementation of the routine
-  StatusCode DoSwap(const size_t n,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSwap(const size_t n,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xgbmv.cpp
+++ b/src/routines/level2/xgbmv.cpp
@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n, const size_t kl, const size_t ku,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n, const size_t kl, const size_t ku,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Reverses the upper and lower band count
  auto rotated = (layout == Layout::kRowMajor);
@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_GBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, a_transpose,
-                m, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                0, false, kl_real, ku_real);
+  MatVec(layout, a_transpose,
+         m, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         0, false, kl_real, ku_real);
 }

 // =================================================================================================
--- a/src/routines/level2/xgbmv.hpp
+++ b/src/routines/level2/xgbmv.hpp
@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
  Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n, const size_t kl, const size_t ku,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoGbmv(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n, const size_t kl, const size_t ku,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@ -22,52 +22,51 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/xgemv.opencl"
    #include "../../kernels/level2/xgemv_fast.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Performs the matrix-vector multiplication
-  return MatVec(layout, a_transpose,
-                m, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                true, true,
-                0, false, 0, 0); // N/A for this routine
+  MatVec(layout, a_transpose,
+         m, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         true, true,
+         0, false, 0, 0); // N/A for this routine
 }

 // =================================================================================================

 // The generic implementation, also suited for other (non general) matrix-vector multiplications
 template <typename T>
-StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            bool fast_kernel, bool fast_kernel_rot,
-                            const size_t parameter, const bool packed,
-                            const size_t kl, const size_t ku) {
+void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      bool fast_kernel, bool fast_kernel_rot,
+                      const size_t parameter, const bool packed,
+                      const size_t kl, const size_t ku) {

  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  auto a_altlayout = (layout == Layout::kRowMajor);
@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
  auto a_conjugate = (a_transpose == Transpose::kConjugate);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
+  TestVectorX(n_real, x_buffer, x_offset, x_inc);
+  TestVectorY(m_real, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
  }

  // Retrieves the Xgemv kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(m_real));
-    kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, GetRealArg(alpha));
-    kernel.SetArgument(3, GetRealArg(beta));
-    kernel.SetArgument(4, static_cast<int>(a_rotated));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, x_buffer());
-    kernel.SetArgument(9, static_cast<int>(x_offset));
-    kernel.SetArgument(10, static_cast<int>(x_inc));
-    kernel.SetArgument(11, y_buffer());
-    kernel.SetArgument(12, static_cast<int>(y_offset));
-    kernel.SetArgument(13, static_cast<int>(y_inc));
-    kernel.SetArgument(14, static_cast<int>(a_conjugate));
-    kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
-    kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
-    kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(m_real));
+  kernel.SetArgument(1, static_cast<int>(n_real));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, GetRealArg(beta));
+  kernel.SetArgument(4, static_cast<int>(a_rotated));
+  kernel.SetArgument(5, a_buffer());
+  kernel.SetArgument(6, static_cast<int>(a_offset));
+  kernel.SetArgument(7, static_cast<int>(a_ld));
+  kernel.SetArgument(8, x_buffer());
+  kernel.SetArgument(9, static_cast<int>(x_offset));
+  kernel.SetArgument(10, static_cast<int>(x_inc));
+  kernel.SetArgument(11, y_buffer());
+  kernel.SetArgument(12, static_cast<int>(y_offset));
+  kernel.SetArgument(13, static_cast<int>(y_inc));
+  kernel.SetArgument(14, static_cast<int>(a_conjugate));
+  kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+  kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+  kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices

-    // Launches the kernel
-    auto global = std::vector<size_t>{global_size};
-    auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global = std::vector<size_t>{global_size};
+  auto local = std::vector<size_t>{local_size};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xgemv.hpp
+++ b/src/routines/level2/xgemv.hpp
@ -28,25 +28,25 @@ class Xgemv: public Routine {
  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");

  // Templated-precision implementation of the routine
-  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoGemv(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);

  // Generic version used also for other matrix-vector multiplications
-  StatusCode MatVec(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    bool fast_kernel, bool fast_kernel_rot,
-                    const size_t parameter, const bool packed,
-                    const size_t kl, const size_t ku);
+  void MatVec(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              bool fast_kernel, bool fast_kernel_rot,
+              const size_t parameter, const bool packed,
+              const size_t kl, const size_t ku);
 };

 // =================================================================================================
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@ -22,26 +22,25 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xger.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xger<T>::DoGer(const Layout layout,
-                          const size_t m, const size_t n,
-                          const T alpha,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xger<T>::DoGer(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
  const auto a_two = (a_is_rowmajor) ? m : n;

  // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(m, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+  TestVectorX(m, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xger");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xger");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(a_one));
-    kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, GetRealArg(alpha));
-    kernel.SetArgument(3, x_buffer());
-    kernel.SetArgument(4, static_cast<int>(x_offset));
-    kernel.SetArgument(5, static_cast<int>(x_inc));
-    kernel.SetArgument(6, y_buffer());
-    kernel.SetArgument(7, static_cast<int>(y_offset));
-    kernel.SetArgument(8, static_cast<int>(y_inc));
-    kernel.SetArgument(9, a_buffer());
-    kernel.SetArgument(10, static_cast<int>(a_offset));
-    kernel.SetArgument(11, static_cast<int>(a_ld));
-    kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(a_one));
+  kernel.SetArgument(1, static_cast<int>(a_two));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, x_buffer());
+  kernel.SetArgument(4, static_cast<int>(x_offset));
+  kernel.SetArgument(5, static_cast<int>(x_inc));
+  kernel.SetArgument(6, y_buffer());
+  kernel.SetArgument(7, static_cast<int>(y_offset));
+  kernel.SetArgument(8, static_cast<int>(y_inc));
+  kernel.SetArgument(9, a_buffer());
+  kernel.SetArgument(10, static_cast<int>(a_offset));
+  kernel.SetArgument(11, static_cast<int>(a_ld));
+  kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));

-    // Launches the kernel
-    auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
-    auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
+  auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xger.hpp
+++ b/src/routines/level2/xger.hpp
@ -28,12 +28,12 @@ class Xger: public Routine {
  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");

  // Templated-precision implementation of the routine
-  StatusCode DoGer(const Layout layout,
-                   const size_t m, const size_t n,
-                   const T alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGer(const Layout layout,
+             const size_t m, const size_t n,
+             const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xgerc.cpp
+++ b/src/routines/level2/xgerc.cpp
@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgerc<T>::DoGerc(const Layout layout,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgerc<T>::DoGerc(const Layout layout,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
  // ROUTINE_GERC guard.
-  return DoGer(layout, m, n, alpha,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               a_buffer, a_offset, a_ld);
+  DoGer(layout, m, n, alpha,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xgerc.hpp
+++ b/src/routines/level2/xgerc.hpp
@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
  Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");

  // Templated-precision implementation of the routine
-  StatusCode DoGerc(const Layout layout,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGerc(const Layout layout,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xgeru.cpp
+++ b/src/routines/level2/xgeru.cpp
@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgeru<T>::DoGeru(const Layout layout,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgeru<T>::DoGeru(const Layout layout,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data
-  return DoGer(layout, m, n, alpha,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               a_buffer, a_offset, a_ld);
+  DoGer(layout, m, n, alpha,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xgeru.hpp
+++ b/src/routines/level2/xgeru.hpp
@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
  Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");

  // Templated-precision implementation of the routine
-  StatusCode DoGeru(const Layout layout,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGeru(const Layout layout,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xhbmv.cpp
+++ b/src/routines/level2/xhbmv.cpp
@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+                      const size_t n, const size_t k,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
  // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, k, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, k, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhbmv.hpp
+++ b/src/routines/level2/xhbmv.hpp
@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
  Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHbmv(const Layout layout, const Triangle triangle,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHbmv(const Layout layout, const Triangle triangle,
+              const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xhemv.cpp
+++ b/src/routines/level2/xhemv.cpp
@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HEMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhemv.hpp
+++ b/src/routines/level2/xhemv.hpp
@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
  Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHemv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHemv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@ -21,11 +21,10 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher.opencl"
-  ;
+    }) {
 }

 // =================================================================================================
@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }

 // The main routine
 template <typename T, typename U>
-StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const U alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const bool packed) {
+void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const U alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const bool packed) {

  // Makes sure the dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+  TestVectorX(n, x_buffer, x_offset, x_inc);

  // If alpha is zero an update is not required
-  if (alpha == U{0}) { return StatusCode::kSuccess; }
+  if (alpha == U{0}) { return; }

  // Creates a matching version of alpha
  const auto matching_alpha = GetAlpha(alpha);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xher");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xher");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, GetRealArg(matching_alpha));
-    kernel.SetArgument(2, x_buffer());
-    kernel.SetArgument(3, static_cast<int>(x_offset));
-    kernel.SetArgument(4, static_cast<int>(x_inc));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, static_cast<int>(is_upper));
-    kernel.SetArgument(9, static_cast<int>(is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, GetRealArg(matching_alpha));
+  kernel.SetArgument(2, x_buffer());
+  kernel.SetArgument(3, static_cast<int>(x_offset));
+  kernel.SetArgument(4, static_cast<int>(x_inc));
+  kernel.SetArgument(5, a_buffer());
+  kernel.SetArgument(6, static_cast<int>(a_offset));
+  kernel.SetArgument(7, static_cast<int>(a_ld));
+  kernel.SetArgument(8, static_cast<int>(is_upper));
+  kernel.SetArgument(9, static_cast<int>(is_rowmajor));

-    // Launches the kernel
-    auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
-    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{global_one, global_two};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{global_one, global_two};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xher.hpp
+++ b/src/routines/level2/xher.hpp
@ -31,12 +31,12 @@ class Xher: public Routine {
  T GetAlpha(const U alpha);

  // Templated-precision implementation of the routine
-  StatusCode DoHer(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const U alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                   const bool packed = false);
+  void DoHer(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const U alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+             const bool packed = false);
 };

 // =================================================================================================
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@ -21,27 +21,26 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
-  source_string_ =
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher2.opencl"
-  ;
+    }) {
 }

 // =================================================================================================

 // The main routine
 template <typename T>
-StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const bool packed) {
+void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const bool packed) {

  // Makes sure the dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xher2");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xher2");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, GetRealArg(alpha));
-    kernel.SetArgument(2, x_buffer());
-    kernel.SetArgument(3, static_cast<int>(x_offset));
-    kernel.SetArgument(4, static_cast<int>(x_inc));
-    kernel.SetArgument(5, y_buffer());
-    kernel.SetArgument(6, static_cast<int>(y_offset));
-    kernel.SetArgument(7, static_cast<int>(y_inc));
-    kernel.SetArgument(8, a_buffer());
-    kernel.SetArgument(9, static_cast<int>(a_offset));
-    kernel.SetArgument(10, static_cast<int>(a_ld));
-    kernel.SetArgument(11, static_cast<int>(is_upper));
-    kernel.SetArgument(12, static_cast<int>(is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, GetRealArg(alpha));
+  kernel.SetArgument(2, x_buffer());
+  kernel.SetArgument(3, static_cast<int>(x_offset));
+  kernel.SetArgument(4, static_cast<int>(x_inc));
+  kernel.SetArgument(5, y_buffer());
+  kernel.SetArgument(6, static_cast<int>(y_offset));
+  kernel.SetArgument(7, static_cast<int>(y_inc));
+  kernel.SetArgument(8, a_buffer());
+  kernel.SetArgument(9, static_cast<int>(a_offset));
+  kernel.SetArgument(10, static_cast<int>(a_ld));
+  kernel.SetArgument(11, static_cast<int>(is_upper));
+  kernel.SetArgument(12, static_cast<int>(is_rowmajor));

-    // Launches the kernel
-    auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
-    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{global_one, global_two};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{global_one, global_two};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xher2.hpp
+++ b/src/routines/level2/xher2.hpp
@ -28,13 +28,13 @@ class Xher2: public Routine {
  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");

  // Templated-precision implementation of the routine
-  StatusCode DoHer2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const bool packed = false);
+  void DoHer2(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const bool packed = false);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpmv.cpp
+++ b/src/routines/level2/xhpmv.cpp
@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
  // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HPMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                ap_buffer, ap_offset, n,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, true, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         ap_buffer, ap_offset, n,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, true, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhpmv.hpp
+++ b/src/routines/level2/xhpmv.hpp
@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
  Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHpmv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHpmv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &ap_buffer, const size_t ap_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpr.cpp
+++ b/src/routines/level2/xhpr.cpp
@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T, typename U>
-StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const U alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const U alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr functionality is implemented in the kernel using defines
-  return DoHer(layout, triangle, n, alpha,
-               x_buffer, x_offset, x_inc,
-               ap_buffer, ap_offset, n,
-               true); // packed matrix
+  DoHer(layout, triangle, n, alpha,
+        x_buffer, x_offset, x_inc,
+        ap_buffer, ap_offset, n,
+        true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xhpr.hpp
+++ b/src/routines/level2/xhpr.hpp
@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
  Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");

  // Templated-precision implementation of the routine
-  StatusCode DoHpr(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const U alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoHpr(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const U alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpr2.cpp
+++ b/src/routines/level2/xhpr2.cpp
@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr2 functionality is implemented in the kernel using defines
-  return DoHer2(layout, triangle, n, alpha,
-                x_buffer, x_offset, x_inc,
-                y_buffer, y_offset, y_inc,
-                ap_buffer, ap_offset, n,
-                true); // packed matrix
+  DoHer2(layout, triangle, n, alpha,
+         x_buffer, x_offset, x_inc,
+         y_buffer, y_offset, y_inc,
+         ap_buffer, ap_offset, n,
+         true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xhpr2.hpp
+++ b/src/routines/level2/xhpr2.hpp
@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
  Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");

  // Templated-precision implementation of the routine
-  StatusCode DoHpr2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoHpr2(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xsbmv.cpp
+++ b/src/routines/level2/xsbmv.cpp
@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+                      const size_t n, const size_t k,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
  // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, k, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, k, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xsbmv.hpp
+++ b/src/routines/level2/xsbmv.hpp
@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
  Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoSbmv(const Layout layout, const Triangle triangle,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSbmv(const Layout layout, const Triangle triangle,
+              const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xspmv.cpp
+++ b/src/routines/level2/xspmv.cpp
@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
  // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SPMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                ap_buffer, ap_offset, n,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, true, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         ap_buffer, ap_offset, n,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, true, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xspmv.hpp
+++ b/src/routines/level2/xspmv.hpp
@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
  Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");

  // Templated-precision implementation of the routine
-  StatusCode DoSpmv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSpmv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &ap_buffer, const size_t ap_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xspr.cpp
+++ b/src/routines/level2/xspr.cpp
@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
-                          const size_t n,
-                          const T alpha,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr functionality is implemented in the kernel using defines
-  return DoHer(layout, triangle, n, alpha,
-               x_buffer, x_offset, x_inc,
-               ap_buffer, ap_offset, n,
-               true); // packed matrix
+  DoHer(layout, triangle, n, alpha,
+        x_buffer, x_offset, x_inc,
+        ap_buffer, ap_offset, n,
+        true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xspr.hpp
+++ b/src/routines/level2/xspr.hpp
@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
  Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");

  // Templated-precision implementation of the routine
-  StatusCode DoSpr(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const T alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoSpr(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xspr2.cpp
+++ b/src/routines/level2/xspr2.cpp
@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr2 functionality is implemented in the kernel using defines
-  return DoHer2(layout, triangle, n, alpha,
-                x_buffer, x_offset, x_inc,
-                y_buffer, y_offset, y_inc,
-                ap_buffer, ap_offset, n,
-                true); // packed matrix
+  DoHer2(layout, triangle, n, alpha,
+         x_buffer, x_offset, x_inc,
+         y_buffer, y_offset, y_inc,
+         ap_buffer, ap_offset, n,
+         true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xspr2.hpp
+++ b/src/routines/level2/xspr2.hpp
@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> {
  Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");

  // Templated-precision implementation of the routine
-  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoSpr2(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xsymv.cpp
+++ b/src/routines/level2/xsymv.cpp
@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
  // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SYMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, 0, 0);
 }

 // =================================================================================================
--- a/Show More
+++ b/Show More