Merge branch 'master' into android_support
commit
12b08ae491
|
@ -58,8 +58,8 @@ build_script:
|
|||
|
||||
after_build:
|
||||
- ps: pushd $env:CLBLAST_BUILD
|
||||
- 7z a CLBlast-1.0.0-Windows-x64.zip .\install_dir\*
|
||||
- ps: mv CLBlast-1.0.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
|
||||
- 7z a CLBlast-1.1.0-Windows-x64.zip .\install_dir\*
|
||||
- ps: mv CLBlast-1.1.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
|
||||
|
||||
artifacts:
|
||||
- path: '*.zip'
|
||||
|
|
|
@ -21,7 +21,7 @@ matrix:
|
|||
|
||||
env:
|
||||
global:
|
||||
- CLBLAST_VERSION=1.0.0
|
||||
- CLBLAST_VERSION=1.1.0
|
||||
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
|
||||
- CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
|
||||
- CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
|
||||
|
|
12
CHANGELOG
12
CHANGELOG
|
@ -1,5 +1,17 @@
|
|||
|
||||
Development (next version)
|
||||
- Fixed a bug in the TRSM/TRSV routines due to missing synchronisations after GEMM/GEMV calls
|
||||
- Fixed a bug in TRSM when using the a-offset argument
|
||||
- Added a CUDA API to CLBlast:
|
||||
* The library and kernels can be compiled with the CUDA driver API and NVRTC (requires CUDA 7.5)
|
||||
* Two CUDA API sample programs are added: SGEMM and DAXPY
|
||||
* All correctness tests and performance clients work on CUDA like they did for OpenCL
|
||||
- Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
|
||||
- Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
|
||||
- Various minor fixes and enhancements
|
||||
- Added tuned parameters for various devices (see README)
|
||||
|
||||
Version 1.1.0
|
||||
- The tuning database now has defaults per architecture (e.g. NVIDIA Kepler SM3.5, AMD Fiji)
|
||||
- The tuning database now has a dictionary to translate vendor/device names to a common set
|
||||
- The tuners can now distinguish between different AMD GPU board names of the same architecture
|
||||
|
|
128
CMakeLists.txt
128
CMakeLists.txt
|
@ -12,14 +12,17 @@
|
|||
cmake_minimum_required(VERSION 2.8.11)
|
||||
|
||||
# Overrides for MSVC static runtime
|
||||
set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
|
||||
set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake)
|
||||
option(OVERRIDE_MSVC_FLAGS_TO_MT "Override compiler flags for MSVC to build with a static runtime (/MT instead of /MD)" ON)
|
||||
if(OVERRIDE_MSVC_FLAGS_TO_MT)
|
||||
set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
|
||||
set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake)
|
||||
endif()
|
||||
|
||||
# CMake project details
|
||||
project("clblast" C CXX)
|
||||
set(clblast_VERSION_MAJOR 1)
|
||||
set(clblast_VERSION_MINOR 0)
|
||||
set(clblast_VERSION_PATCH 1)
|
||||
set(clblast_VERSION_MINOR 1)
|
||||
set(clblast_VERSION_PATCH 0)
|
||||
|
||||
# Options and their default values
|
||||
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
|
||||
|
@ -30,6 +33,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF)
|
|||
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
|
||||
option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
|
||||
|
||||
# Select between an OpenCL API (default) or a CUDA API (beta)
|
||||
option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
|
||||
option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
|
||||
if(NOT OPENCL AND NOT CUDA)
|
||||
message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)")
|
||||
endif()
|
||||
if(OPENCL AND CUDA)
|
||||
message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)")
|
||||
endif()
|
||||
if(OPENCL)
|
||||
message("-- Building CLBlast with OpenCL API (default)")
|
||||
add_definitions(-DOPENCL_API)
|
||||
elseif(CUDA)
|
||||
message("-- Building CLBlast with CUDA API (beta)")
|
||||
add_definitions(-DCUDA_API)
|
||||
endif()
|
||||
|
||||
# Compile in verbose mode with additional diagnostic messages
|
||||
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
|
||||
if(VERBOSE)
|
||||
|
@ -123,8 +143,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
|
|||
# Package scripts location
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/")
|
||||
|
||||
# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
|
||||
find_package(OpenCL REQUIRED)
|
||||
if(OPENCL)
|
||||
# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
|
||||
find_package(OpenCL REQUIRED)
|
||||
set(API_LIBRARIES ${OPENCL_LIBRARIES})
|
||||
set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS})
|
||||
elseif(CUDA)
|
||||
# For CUDA, the "FindCUDA.cmake" is part of CMake
|
||||
find_package(CUDA REQUIRED)
|
||||
set(API_LIBRARIES cuda nvrtc)
|
||||
set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
|
||||
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
|
||||
endif()
|
||||
|
||||
# Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
|
||||
if(TUNERS)
|
||||
|
@ -138,8 +168,10 @@ endif()
|
|||
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
|
||||
# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
|
||||
if(CLIENTS OR TESTS)
|
||||
find_package(clBLAS)
|
||||
find_package(CBLAS)
|
||||
if(OPENCL)
|
||||
find_package(clBLAS)
|
||||
endif()
|
||||
if(CUBLAS)
|
||||
find_package(cuBLAS)
|
||||
endif()
|
||||
|
@ -161,11 +193,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
|
|||
xgemm xgemm_direct xgemv)
|
||||
set(DATABASES copy pad padtranspose transpose xaxpy xdot
|
||||
xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
|
||||
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
|
||||
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
|
||||
if(NETLIB)
|
||||
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
|
||||
endif()
|
||||
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
|
||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
|
||||
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
||||
|
@ -174,6 +201,18 @@ set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched)
|
|||
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
|
||||
set(PRECISIONS 32 64 3232 6464 16)
|
||||
|
||||
# Sample programs
|
||||
if(OPENCL)
|
||||
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm)
|
||||
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
|
||||
if(NETLIB)
|
||||
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
|
||||
endif()
|
||||
elseif(CUDA)
|
||||
set(SAMPLE_PROGRAMS_CPP daxpy_cuda sgemm_cuda)
|
||||
set(SAMPLE_PROGRAMS_C )
|
||||
endif()
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Gathers all source-files (required for the compiler) and header-files (for IDEs only)
|
||||
|
@ -182,15 +221,12 @@ set(SOURCES
|
|||
src/routines/common.cpp
|
||||
src/utilities/clblast_exceptions.cpp
|
||||
src/utilities/utilities.cpp
|
||||
src/api_common.cpp
|
||||
src/cache.cpp
|
||||
src/clblast.cpp
|
||||
src/clblast_c.cpp
|
||||
src/routine.cpp
|
||||
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
|
||||
)
|
||||
set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
|
||||
include/clblast.h
|
||||
include/clblast_c.h
|
||||
include/clblast_half.h
|
||||
src/database/apple_cpu_fallback.hpp
|
||||
src/database/database.hpp
|
||||
|
@ -201,19 +237,26 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
|
|||
src/routines/level1/xmin.hpp
|
||||
src/routines/level1/xsum.hpp
|
||||
src/routines/common.hpp
|
||||
src/routines/routines.hpp
|
||||
src/utilities/buffer_test.hpp
|
||||
src/utilities/clblast_exceptions.hpp
|
||||
src/utilities/device_mapping.hpp
|
||||
src/utilities/msvc.hpp
|
||||
src/utilities/utilities.hpp
|
||||
src/cache.hpp
|
||||
src/clpp11.hpp
|
||||
src/cxpp11_common.hpp
|
||||
src/routine.hpp
|
||||
)
|
||||
if(NETLIB)
|
||||
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
|
||||
set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
|
||||
if(OPENCL)
|
||||
set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp)
|
||||
set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp)
|
||||
if(NETLIB)
|
||||
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
|
||||
set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
|
||||
endif()
|
||||
elseif(CUDA)
|
||||
set(SOURCES ${SOURCES} src/clblast_cuda.cpp)
|
||||
set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp)
|
||||
endif()
|
||||
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
|
||||
|
@ -247,14 +290,14 @@ else(BUILD_SHARED_LIBS)
|
|||
add_library(clblast STATIC ${SOURCES} ${HEADERS})
|
||||
endif()
|
||||
|
||||
target_link_libraries(clblast ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast ${API_LIBRARIES})
|
||||
|
||||
# Includes directories: CLBlast and OpenCL
|
||||
target_include_directories(clblast PUBLIC
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
|
||||
$<INSTALL_INTERFACE:include>
|
||||
${OPENCL_INCLUDE_DIRS})
|
||||
${API_INCLUDE_DIRS})
|
||||
|
||||
# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
|
||||
if(MSVC)
|
||||
|
@ -264,23 +307,28 @@ if(MSVC)
|
|||
endif()
|
||||
|
||||
# Installs the library
|
||||
install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
|
||||
install(FILES include/clblast.h DESTINATION include)
|
||||
install(FILES include/clblast_c.h DESTINATION include)
|
||||
install(FILES include/clblast_half.h DESTINATION include)
|
||||
if(NETLIB)
|
||||
install(FILES include/clblast_netlib_c.h DESTINATION include)
|
||||
include(GNUInstallDirs)
|
||||
|
||||
install(TARGETS clblast EXPORT CLBlast DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
install(FILES include/clblast_half.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
if(OPENCL)
|
||||
install(FILES include/clblast.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
install(FILES include/clblast_c.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
if(NETLIB)
|
||||
install(FILES include/clblast_netlib_c.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
elseif(CUDA)
|
||||
install(FILES include/clblast_cuda.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
# Installs the config for find_package in dependent projects
|
||||
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
|
||||
install(EXPORT CLBlast DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CLBLast FILE CLBlastConfig.cmake)
|
||||
|
||||
# Install pkg-config file on Linux
|
||||
if(UNIX)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
|
||||
DESTINATION lib/pkgconfig)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
endif()
|
||||
|
||||
# ==================================================================================================
|
||||
|
@ -289,19 +337,21 @@ endif()
|
|||
if(SAMPLES)
|
||||
|
||||
# Downloads the cl.hpp file from Khronos
|
||||
file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
|
||||
if(OPENCL)
|
||||
file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
|
||||
endif()
|
||||
|
||||
# Adds sample programs (C++)
|
||||
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
|
||||
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
|
||||
target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES})
|
||||
install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
|
||||
endforeach()
|
||||
|
||||
# Adds sample programs (C)
|
||||
foreach(SAMPLE ${SAMPLE_PROGRAMS_C})
|
||||
add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c)
|
||||
target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES})
|
||||
install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin)
|
||||
endforeach()
|
||||
|
||||
|
@ -322,7 +372,7 @@ if(TUNERS)
|
|||
# Adds tuning executables
|
||||
foreach(KERNEL ${KERNELS})
|
||||
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
|
||||
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
|
||||
target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
|
||||
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
|
||||
endforeach()
|
||||
|
@ -427,7 +477,7 @@ if(CLIENTS)
|
|||
test/routines/levelx/${ROUTINE}.hpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES})
|
||||
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
|
||||
target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
|
||||
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
|
||||
endforeach()
|
||||
|
@ -479,7 +529,7 @@ if(TESTS)
|
|||
test/routines/levelx/${ROUTINE}.hpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES})
|
||||
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
|
||||
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
|
||||
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
|
||||
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
|
||||
|
@ -490,7 +540,7 @@ if(TESTS)
|
|||
foreach(MISC_TEST ${MISC_TESTS})
|
||||
add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
|
||||
test/correctness/misc/${MISC_TEST}.cpp)
|
||||
target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
|
||||
target_include_directories(clblast_test_${MISC_TEST} PUBLIC
|
||||
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
|
||||
${clblast_SOURCE_DIR} ${REF_INCLUDES})
|
||||
|
@ -499,7 +549,7 @@ if(TESTS)
|
|||
|
||||
# CLBlast diagnostics
|
||||
add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp)
|
||||
target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES})
|
||||
target_include_directories(clblast_test_diagnostics PUBLIC
|
||||
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
|
||||
${clblast_SOURCE_DIR} ${REF_INCLUDES})
|
||||
|
|
30
README.md
30
README.md
|
@ -10,7 +10,7 @@ CLBlast: The tuned OpenCL BLAS library
|
|||
|
||||
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
|
||||
|
||||
The library is not tuned for all possible OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
|
||||
The library is not tuned for all possible OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library. See also the [CLBlast feature roadmap](ROADMAP.md) to get an indication of the future of CLBlast.
|
||||
|
||||
|
||||
Why CLBlast and not clBLAS or cuBLAS?
|
||||
|
@ -99,11 +99,23 @@ To get started quickly, a couple of stand-alone example programs are included in
|
|||
|
||||
cmake -DSAMPLES=ON ..
|
||||
|
||||
For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
|
||||
|
||||
|
||||
Using the library (Netlib API)
|
||||
-------------
|
||||
|
||||
There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
|
||||
|
||||
#include <clblast_netlib_c.h>
|
||||
|
||||
For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
|
||||
|
||||
Using the library (CUDA API)
|
||||
-------------
|
||||
|
||||
There is also a CUDA API of CLBlast available. Enabling this compiles the whole library for CUDA and thus replaces the OpenCL API. It is based upon the CUDA runtime and NVRTC APIs, requiring NVIDIA CUDA 7.5 or higher. The CUDA version of the library can be used as follows after providing the `-DCUDA=ON -DOPENCL=OFF` flags to CMake:
|
||||
|
||||
#include <clblast_cuda.h>
|
||||
|
||||
|
||||
Using the tuners (optional)
|
||||
|
@ -115,6 +127,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
|
|||
- GRID K520
|
||||
- GeForce GT 650M
|
||||
- GeForce GTX 480
|
||||
- GeForce GTX 580
|
||||
- GeForce GTX 670
|
||||
- GeForce GTX 680
|
||||
- GeForce GTX 750
|
||||
|
@ -122,6 +135,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
|
|||
- GeForce GTX 980
|
||||
- GeForce GTX 1070
|
||||
- GeForce GTX 1080
|
||||
- GeForce GTX 1080 Ti
|
||||
- GeForce GTX TITAN
|
||||
- GeForce GTX TITAN Black
|
||||
- GeForce GTX TITAN X
|
||||
|
@ -147,6 +161,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
|
|||
- Iris
|
||||
- Iris Pro
|
||||
* Intel CPUs:
|
||||
- Core i5-4570
|
||||
- Core i5-6200U
|
||||
- Core i7-920
|
||||
- Core i7-2670QM
|
||||
|
@ -341,6 +356,17 @@ Finally, a third option is to use the [Collective Knowledge framework](https://g
|
|||
ck pull repo:ck-math
|
||||
ck install package:lib-clblast-master-universal --target_os=android21-arm64
|
||||
|
||||
|
||||
Known issues
|
||||
-------------
|
||||
|
||||
Known performance related issues:
|
||||
|
||||
* Severe performance issues with Beignet v1.3.0 due to missing support for local memory. Please downgrade to v1.2.1 or upgrade to v1.3.1 or newer.
|
||||
|
||||
* Performance issues on ARM Mali GPUs due to missing compiler for support for loop unrolling and array-to-register promotion.
|
||||
|
||||
|
||||
Contributing
|
||||
-------------
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
CLBlast feature road-map
|
||||
================
|
||||
|
||||
This file gives an overview of the main features planned for addition to CLBlast. A first-order indication time-frame for development time is provided:
|
||||
|
||||
| Issue# | When | Who | Status | What |
|
||||
| -----------|-------------|-----------|--------|---------------|
|
||||
| - | Oct '17 | CNugteren | ✔ | CUDA API for CLBlast |
|
||||
| [#169](https://github.com/CNugteren/CLBlast/issues/169), [#195](https://github.com/CNugteren/CLBlast/issues/195) | Oct-Nov '17 | CNugteren | | Auto-tuning the kernel selection parameter |
|
||||
| [#181](https://github.com/CNugteren/CLBlast/issues/181), [#201](https://github.com/CNugteren/CLBlast/issues/201) | Nov '17 | CNugteren | | Compilation for Android and testing on Qualcomm Adreno |
|
||||
| [#128](https://github.com/CNugteren/CLBlast/issues/128), [#205](https://github.com/CNugteren/CLBlast/issues/205) | Nov-Dec '17 | CNugteren | | Pre-processor for loop unrolling and array-to-register-promotion for e.g. ARM Mali |
|
||||
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | '17 | dividiti | | Problem-specific tuning parameter selection |
|
|
@ -1,7 +1,7 @@
|
|||
prefix=@CMAKE_INSTALL_PREFIX@
|
||||
exec_prefix=${prefix}
|
||||
includedir=${prefix}/include
|
||||
libdir=${exec_prefix}/lib
|
||||
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
|
||||
|
||||
Name: CLBlast
|
||||
Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
|
||||
|
|
|
@ -0,0 +1,643 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the
|
||||
// definitions of the returned status codes and the layout and transpose types. This is the header
|
||||
// users of the CUDA API of CLBlast should include and use.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_CLBLAST_CUDA_H_
|
||||
#define CLBLAST_CLBLAST_CUDA_H_
|
||||
|
||||
#include <cstdlib> // For size_t
|
||||
#include <string> // For OverrideParameters function
|
||||
#include <unordered_map> // For OverrideParameters function
|
||||
|
||||
// CUDA
|
||||
#include <cuda.h> // CUDA driver API
|
||||
#include <nvrtc.h> // NVIDIA runtime compilation API
|
||||
|
||||
// Exports library functions under Windows when building a DLL. See also:
|
||||
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
|
||||
#if defined(_WIN32) && defined(CLBLAST_DLL)
|
||||
#if defined(COMPILING_DLL)
|
||||
#define PUBLIC_API __declspec(dllexport)
|
||||
#else
|
||||
#define PUBLIC_API __declspec(dllimport)
|
||||
#endif
|
||||
#else
|
||||
#define PUBLIC_API
|
||||
#endif
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Status codes. These codes can be returned by functions declared in this header file. The error
|
||||
// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes.
|
||||
enum class StatusCode {
|
||||
|
||||
// Status codes in common with the OpenCL standard
|
||||
kSuccess = 0, // CUDA_SUCCESS
|
||||
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
|
||||
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
|
||||
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
|
||||
|
||||
// Status codes in common with the clBLAS library
|
||||
kNotImplemented = -1024, // Routine or functionality not implemented yet
|
||||
kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
|
||||
kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
|
||||
kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
|
||||
kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
|
||||
kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
|
||||
kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
|
||||
kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
|
||||
kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
|
||||
kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
|
||||
kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
|
||||
kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
|
||||
kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
|
||||
kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
|
||||
kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
|
||||
kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
|
||||
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
|
||||
|
||||
// Custom additional status codes for CLBlast
|
||||
kInvalidBatchCount = -2049, // The batch count needs to be positive
|
||||
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
|
||||
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
|
||||
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
|
||||
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
|
||||
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
|
||||
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
|
||||
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
|
||||
kDatabaseError = -2041, // Entry for the device was not found in the database
|
||||
kUnknownError = -2040, // A catch-all error code representing an unspecified error
|
||||
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
|
||||
};
|
||||
|
||||
// Matrix layout and transpose types
|
||||
enum class Layout { kRowMajor = 101, kColMajor = 102 };
|
||||
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
|
||||
enum class Triangle { kUpper = 121, kLower = 122 };
|
||||
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
|
||||
enum class Side { kLeft = 141, kRight = 142 };
|
||||
|
||||
// Precision scoped enum (values in bits)
|
||||
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
||||
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-1 (vector-vector) routines
|
||||
// =================================================================================================
|
||||
|
||||
// Generate givens plane rotation: SROTG/DROTG
|
||||
template <typename T>
|
||||
StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset,
|
||||
CUdeviceptr sb_buffer, const size_t sb_offset,
|
||||
CUdeviceptr sc_buffer, const size_t sc_offset,
|
||||
CUdeviceptr ss_buffer, const size_t ss_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Generate modified givens plane rotation: SROTMG/DROTMG
|
||||
template <typename T>
|
||||
StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset,
|
||||
CUdeviceptr sd2_buffer, const size_t sd2_offset,
|
||||
CUdeviceptr sx1_buffer, const size_t sx1_offset,
|
||||
const CUdeviceptr sy1_buffer, const size_t sy1_offset,
|
||||
CUdeviceptr sparam_buffer, const size_t sparam_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Apply givens plane rotation: SROT/DROT
|
||||
template <typename T>
|
||||
StatusCode Rot(const size_t n,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const T cos,
|
||||
const T sin,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Apply modified givens plane rotation: SROTM/DROTM
|
||||
template <typename T>
|
||||
StatusCode Rotm(const size_t n,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr sparam_buffer, const size_t sparam_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
|
||||
template <typename T>
|
||||
StatusCode Swap(const size_t n,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
|
||||
template <typename T>
|
||||
StatusCode Scal(const size_t n,
|
||||
const T alpha,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
|
||||
template <typename T>
|
||||
StatusCode Copy(const size_t n,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
|
||||
template <typename T>
|
||||
StatusCode Axpy(const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Dot product of two vectors: SDOT/DDOT/HDOT
|
||||
template <typename T>
|
||||
StatusCode Dot(const size_t n,
|
||||
CUdeviceptr dot_buffer, const size_t dot_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Dot product of two complex vectors: CDOTU/ZDOTU
|
||||
template <typename T>
|
||||
StatusCode Dotu(const size_t n,
|
||||
CUdeviceptr dot_buffer, const size_t dot_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
|
||||
template <typename T>
|
||||
StatusCode Dotc(const size_t n,
|
||||
CUdeviceptr dot_buffer, const size_t dot_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
|
||||
template <typename T>
|
||||
StatusCode Nrm2(const size_t n,
|
||||
CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
|
||||
template <typename T>
|
||||
StatusCode Asum(const size_t n,
|
||||
CUdeviceptr asum_buffer, const size_t asum_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
|
||||
template <typename T>
|
||||
StatusCode Sum(const size_t n,
|
||||
CUdeviceptr sum_buffer, const size_t sum_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
|
||||
template <typename T>
|
||||
StatusCode Amax(const size_t n,
|
||||
CUdeviceptr imax_buffer, const size_t imax_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
|
||||
template <typename T>
|
||||
StatusCode Amin(const size_t n,
|
||||
CUdeviceptr imin_buffer, const size_t imin_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
|
||||
template <typename T>
|
||||
StatusCode Max(const size_t n,
|
||||
CUdeviceptr imax_buffer, const size_t imax_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
|
||||
template <typename T>
|
||||
StatusCode Min(const size_t n,
|
||||
CUdeviceptr imin_buffer, const size_t imin_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-2 (matrix-vector) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
|
||||
template <typename T>
|
||||
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
|
||||
template <typename T>
|
||||
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
||||
template <typename T>
|
||||
StatusCode Hemv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
|
||||
template <typename T>
|
||||
StatusCode Hbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
|
||||
template <typename T>
|
||||
StatusCode Hpmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
|
||||
template <typename T>
|
||||
StatusCode Symv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
|
||||
template <typename T>
|
||||
StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
|
||||
template <typename T>
|
||||
StatusCode Spmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const T beta,
|
||||
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
|
||||
template <typename T>
|
||||
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
|
||||
template <typename T>
|
||||
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
|
||||
template <typename T>
|
||||
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
||||
template <typename T>
|
||||
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
|
||||
template <typename T>
|
||||
StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
|
||||
template <typename T>
|
||||
StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// General rank-1 matrix update: SGER/DGER/HGER
|
||||
template <typename T>
|
||||
StatusCode Ger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// General rank-1 complex matrix update: CGERU/ZGERU
|
||||
template <typename T>
|
||||
StatusCode Geru(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
||||
template <typename T>
|
||||
StatusCode Gerc(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian rank-1 matrix update: CHER/ZHER
|
||||
template <typename T>
|
||||
StatusCode Her(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
||||
template <typename T>
|
||||
StatusCode Hpr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
||||
template <typename T>
|
||||
StatusCode Her2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
||||
template <typename T>
|
||||
StatusCode Hpr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
|
||||
template <typename T>
|
||||
StatusCode Syr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
|
||||
template <typename T>
|
||||
StatusCode Spr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
|
||||
template <typename T>
|
||||
StatusCode Syr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
|
||||
template <typename T>
|
||||
StatusCode Spr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
CUdeviceptr ap_buffer, const size_t ap_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-3 (matrix-matrix) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
|
||||
template <typename T>
|
||||
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
|
||||
template <typename T>
|
||||
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
||||
template <typename T>
|
||||
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
|
||||
template <typename T>
|
||||
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
||||
template <typename T>
|
||||
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
|
||||
template <typename T>
|
||||
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
||||
template <typename T, typename U>
|
||||
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const U beta,
|
||||
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
|
||||
template <typename T>
|
||||
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// =================================================================================================
|
||||
// Extra non-BLAS routines (level-X)
|
||||
// =================================================================================================
|
||||
|
||||
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
|
||||
template <typename T>
|
||||
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
|
||||
template <typename T>
|
||||
StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
|
||||
const CUdeviceptr im_buffer, const size_t im_offset,
|
||||
CUdeviceptr col_buffer, const size_t col_offset,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
|
||||
template <typename T>
|
||||
StatusCode AxpyBatched(const size_t n,
|
||||
const T *alphas,
|
||||
const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
|
||||
template <typename T>
|
||||
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const T *alphas,
|
||||
const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
|
||||
const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
|
||||
const T *betas,
|
||||
CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
|
||||
const size_t batch_count,
|
||||
const CUcontext context, const CUdevice device);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
|
||||
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
|
||||
StatusCode PUBLIC_API ClearCache();
|
||||
|
||||
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
|
||||
// Further CLBlast routine calls will then run at maximum speed.
|
||||
StatusCode PUBLIC_API FillCache(const CUdevice device);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
|
||||
// the target routine is called it will re-compile and use the new parameters from then on.
|
||||
StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name,
|
||||
const Precision precision,
|
||||
const std::unordered_map<std::string,size_t> ¶meters);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_CLBLAST_CUDA_H_
|
||||
#endif
|
|
@ -18,13 +18,6 @@
|
|||
#ifndef CLBLAST_HALF_H_
|
||||
#define CLBLAST_HALF_H_
|
||||
|
||||
// Includes the normal OpenCL C header
|
||||
#if defined(__APPLE__) || defined(__MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// MSVC 2013 doesn't fully support C99
|
||||
#ifdef _MSC_VER
|
||||
#define inline __inline
|
||||
|
@ -34,6 +27,7 @@
|
|||
|
||||
// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL
|
||||
// type, which is a typedef for unsigned short.
|
||||
typedef unsigned short half;
|
||||
|
||||
// 32-bit union for conversions
|
||||
typedef union ConversionBits_ {
|
||||
|
@ -46,7 +40,7 @@ typedef union ConversionBits_ {
|
|||
// Converts a IEEE-compliant single-precision value to half-precision floating-point. This function
|
||||
// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding
|
||||
// mode.
|
||||
inline cl_half FloatToHalf(const float value) {
|
||||
inline half FloatToHalf(const float value) {
|
||||
static const unsigned short base_table[512] = {
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
|
@ -107,7 +101,7 @@ inline cl_half FloatToHalf(const float value) {
|
|||
}
|
||||
|
||||
// Converts a half-precision value to IEEE-compliant single-precision floating-point
|
||||
inline float HalfToFloat(const cl_half value) {
|
||||
inline float HalfToFloat(const half value) {
|
||||
static const unsigned int mantissa_table[2048] = {
|
||||
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
|
||||
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast.
|
||||
//
|
||||
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
// Includes the CUDA driver API
|
||||
#include <cuda.h>
|
||||
|
||||
// Includes the CLBlast library
|
||||
#include <clblast_cuda.h>
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Example use of the double-precision Xaxpy routine DAXPY
|
||||
int main() {
|
||||
|
||||
// CUDA device selection
|
||||
const auto device_id = 0;
|
||||
|
||||
// Example DAXPY arguments
|
||||
const size_t n = 8192;
|
||||
const double alpha = 0.7;
|
||||
|
||||
// Initializes the OpenCL device
|
||||
cuInit(0);
|
||||
CUdevice device;
|
||||
cuDeviceGet(&device, device_id);
|
||||
|
||||
// Creates the OpenCL context and stream
|
||||
CUcontext context;
|
||||
cuCtxCreate(&context, 0, device);
|
||||
CUstream stream;
|
||||
cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
|
||||
|
||||
// Populate host matrices with some example data
|
||||
auto host_a = std::vector<double>(n);
|
||||
auto host_b = std::vector<double>(n);
|
||||
for (auto &item: host_a) { item = 12.193; }
|
||||
for (auto &item: host_b) { item = -8.199; }
|
||||
|
||||
// Copy the matrices to the device
|
||||
CUdeviceptr device_a;
|
||||
CUdeviceptr device_b;
|
||||
cuMemAlloc(&device_a, host_a.size()*sizeof(double));
|
||||
cuMemAlloc(&device_b, host_b.size()*sizeof(double));
|
||||
cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream);
|
||||
cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream);
|
||||
|
||||
// Start the timer
|
||||
auto start_time = std::chrono::steady_clock::now();
|
||||
|
||||
// Call the DAXPY routine. Note that the type of alpha (double) determines the precision.
|
||||
const auto status = clblast::Axpy(n, alpha,
|
||||
device_a, 0, 1,
|
||||
device_b, 0, 1,
|
||||
context, device);
|
||||
cuStreamSynchronize(stream);
|
||||
|
||||
// Record the execution time
|
||||
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
||||
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
||||
|
||||
// Example completed. See "clblast_cuda.h" for status codes (0 -> success).
|
||||
printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
|
||||
|
||||
// Clean-up
|
||||
cuMemFree(device_a);
|
||||
cuMemFree(device_b);
|
||||
cuStreamDestroy(stream);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -0,0 +1,117 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file demonstrates the use of the DTRSM routine. It is a stand-alone example, but it does
|
||||
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
|
||||
// features, but CLBlast can also be used using the regular C-style OpenCL API.
|
||||
//
|
||||
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
|
||||
|
||||
// Includes the C++ OpenCL API. If not yet available, it can be found here:
|
||||
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
|
||||
#include "cl.hpp"
|
||||
|
||||
// Includes the CLBlast library
|
||||
#include <clblast.h>
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the
|
||||
// result in the memory of matrix B. Uses row-major storage (C-style).
|
||||
int main() {
|
||||
|
||||
// OpenCL platform/device settings
|
||||
const auto platform_id = 0;
|
||||
const auto device_id = 0;
|
||||
|
||||
// Example TRSM arguments
|
||||
const size_t m = 4;
|
||||
const size_t n = 3;
|
||||
const double alpha = 1.0;
|
||||
const auto a_ld = m;
|
||||
const auto b_ld = n;
|
||||
|
||||
// Initializes the OpenCL platform
|
||||
auto platforms = std::vector<cl::Platform>();
|
||||
cl::Platform::get(&platforms);
|
||||
if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
|
||||
auto platform = platforms[platform_id];
|
||||
|
||||
// Initializes the OpenCL device
|
||||
auto devices = std::vector<cl::Device>();
|
||||
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||
if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
|
||||
auto device = devices[device_id];
|
||||
|
||||
// Creates the OpenCL context, queue, and an event
|
||||
auto device_as_vector = std::vector<cl::Device>{device};
|
||||
auto context = cl::Context(device_as_vector);
|
||||
auto queue = cl::CommandQueue(context, device);
|
||||
auto event = cl_event{nullptr};
|
||||
|
||||
// Populate host matrices with some example data
|
||||
auto host_a = std::vector<double>({1.0, 2.0, 1.0, -2.0,
|
||||
0.0, -1.0, -2.0, 0.0,
|
||||
0.0, 0.0, 1.0, 1.0,
|
||||
0.0, 0.0, 0.0, -1.0});
|
||||
auto host_b = std::vector<double>({-1.0, -1.0, 3.0,
|
||||
1.0, -3.0, 2.0,
|
||||
1.0, 1.0, -1.0,
|
||||
4.0, -1.0, -2.0});
|
||||
// Expected result:
|
||||
// 8 -5 2
|
||||
// -11 3 4
|
||||
// 5 0 -3
|
||||
// -4 1 2
|
||||
|
||||
// Copy the matrices to the device
|
||||
auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double));
|
||||
auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double));
|
||||
queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data());
|
||||
queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());
|
||||
|
||||
// Call the DTRSM routine. Note that the type of alpha and beta (double) determine the precision.
|
||||
auto queue_plain = queue();
|
||||
auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
|
||||
clblast::Triangle::kUpper, clblast::Transpose::kNo,
|
||||
clblast::Diagonal::kNonUnit,
|
||||
m, n,
|
||||
alpha,
|
||||
device_a(), 0, a_ld,
|
||||
device_b(), 0, b_ld,
|
||||
&queue_plain, &event);
|
||||
|
||||
// Retrieves the results
|
||||
if (status == clblast::StatusCode::kSuccess) {
|
||||
clWaitForEvents(1, &event);
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());
|
||||
|
||||
// Example completed. See "clblast.h" for status codes (0 -> success).
|
||||
printf("Completed TRSM with status %d and results:\n", static_cast<int>(status));
|
||||
for (auto i = size_t{0}; i < m; ++i) {
|
||||
for (auto j = size_t{0}; j < n; ++j) {
|
||||
printf("%3.0f ", host_b[i * b_ld + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -0,0 +1,105 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file demonstrates the use of the SGEMM routine with the C++ CUDA API of CLBlast.
|
||||
//
|
||||
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
// Includes the CUDA driver API
|
||||
#include <cuda.h>
|
||||
|
||||
// Includes the CLBlast library
|
||||
#include <clblast_cuda.h>
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Example use of the single-precision Xgemm routine SGEMM
|
||||
int main() {
|
||||
|
||||
// CUDA device selection
|
||||
const auto device_id = 0;
|
||||
|
||||
// Example SGEMM arguments
|
||||
const size_t m = 128;
|
||||
const size_t n = 64;
|
||||
const size_t k = 512;
|
||||
const float alpha = 0.7f;
|
||||
const float beta = 1.0f;
|
||||
const auto a_ld = k;
|
||||
const auto b_ld = n;
|
||||
const auto c_ld = n;
|
||||
|
||||
// Initializes the OpenCL device
|
||||
cuInit(0);
|
||||
CUdevice device;
|
||||
cuDeviceGet(&device, device_id);
|
||||
|
||||
// Creates the OpenCL context and stream
|
||||
CUcontext context;
|
||||
cuCtxCreate(&context, 0, device);
|
||||
CUstream stream;
|
||||
cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
|
||||
|
||||
// Populate host matrices with some example data
|
||||
auto host_a = std::vector<float>(m*k);
|
||||
auto host_b = std::vector<float>(n*k);
|
||||
auto host_c = std::vector<float>(m*n);
|
||||
for (auto &item: host_a) { item = 12.193f; }
|
||||
for (auto &item: host_b) { item = -8.199f; }
|
||||
for (auto &item: host_c) { item = 0.0f; }
|
||||
|
||||
// Copy the matrices to the device
|
||||
CUdeviceptr device_a;
|
||||
CUdeviceptr device_b;
|
||||
CUdeviceptr device_c;
|
||||
cuMemAlloc(&device_a, host_a.size()*sizeof(float));
|
||||
cuMemAlloc(&device_b, host_b.size()*sizeof(float));
|
||||
cuMemAlloc(&device_c, host_c.size()*sizeof(float));
|
||||
cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
|
||||
cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream);
|
||||
cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream);
|
||||
|
||||
// Start the timer
|
||||
auto start_time = std::chrono::steady_clock::now();
|
||||
|
||||
// Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
|
||||
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
|
||||
clblast::Transpose::kNo, clblast::Transpose::kNo,
|
||||
m, n, k,
|
||||
alpha,
|
||||
device_a, 0, a_ld,
|
||||
device_b, 0, b_ld,
|
||||
beta,
|
||||
device_c, 0, c_ld,
|
||||
context, device);
|
||||
cuStreamSynchronize(stream);
|
||||
|
||||
// Record the execution time
|
||||
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
||||
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
||||
|
||||
// Example completed. See "clblast_cuda.h" for status codes (0 -> success).
|
||||
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
|
||||
|
||||
// Clean-up
|
||||
cuMemFree(device_a);
|
||||
cuMemFree(device_b);
|
||||
cuMemFree(device_c);
|
||||
cuStreamDestroy(stream);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -12,6 +12,8 @@
|
|||
# clblast.cpp
|
||||
# clblast_c.h
|
||||
# clblast_c.cpp
|
||||
# clblast_cuda.h
|
||||
# clblast_cuda.cpp
|
||||
# clblast_netlib_c.h
|
||||
# clblast_netlib_c.cpp
|
||||
# wrapper_clblas.h
|
||||
|
@ -41,9 +43,11 @@ FILES = [
|
|||
"/test/wrapper_cublas.hpp",
|
||||
"/include/clblast_netlib_c.h",
|
||||
"/src/clblast_netlib_c.cpp",
|
||||
"/include/clblast_cuda.h",
|
||||
"/src/clblast_cuda.cpp",
|
||||
]
|
||||
HEADER_LINES = [122, 79, 126, 24, 29, 41, 29, 65, 32]
|
||||
FOOTER_LINES = [25, 147, 27, 38, 6, 6, 6, 9, 2]
|
||||
HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21]
|
||||
FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3]
|
||||
HEADER_LINES_DOC = 0
|
||||
FOOTER_LINES_DOC = 63
|
||||
|
||||
|
@ -224,6 +228,10 @@ def main(argv):
|
|||
if i == 8:
|
||||
if not routine.batched:
|
||||
body += cpp.clblast_netlib_c_cc(routine)
|
||||
if i == 9:
|
||||
body += cpp.clblast_h(routine, cuda=True)
|
||||
if i == 10:
|
||||
body += cpp.clblast_cc(routine, cuda=True)
|
||||
f.write("".join(file_header))
|
||||
f.write(body)
|
||||
f.write("".join(file_footer))
|
||||
|
|
|
@ -36,22 +36,28 @@ HEADER = NL + SEPARATOR + """
|
|||
""" + SEPARATOR + NL
|
||||
|
||||
|
||||
def clblast_h(routine):
|
||||
def clblast_h(routine, cuda=False):
|
||||
"""The C++ API header (.h)"""
|
||||
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
|
||||
result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL
|
||||
result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL
|
||||
return result
|
||||
|
||||
|
||||
def clblast_cc(routine):
|
||||
def clblast_cc(routine, cuda=False):
|
||||
"""The C++ API implementation (.cpp)"""
|
||||
indent1 = " " * (15 + routine.length())
|
||||
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
|
||||
if routine.implemented:
|
||||
result += routine.routine_header_cpp(12, "") + " {" + NL
|
||||
result += routine.routine_header_cpp(12, "", cuda) + " {" + NL
|
||||
result += " try {" + NL
|
||||
result += " auto queue_cpp = Queue(*queue);" + NL
|
||||
result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL
|
||||
if cuda:
|
||||
result += " const auto context_cpp = Context(context);" + NL
|
||||
result += " const auto device_cpp = Device(device);" + NL
|
||||
result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL
|
||||
else:
|
||||
result += " auto queue_cpp = Queue(*queue);" + NL
|
||||
event = "nullptr" if cuda else "event"
|
||||
result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, " + event + ");" + NL
|
||||
if routine.batched:
|
||||
result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL
|
||||
result += " routine.Do" + routine.capitalized_name() + "("
|
||||
|
@ -60,14 +66,22 @@ def clblast_cc(routine):
|
|||
result += " return StatusCode::kSuccess;" + NL
|
||||
result += " } catch (...) { return DispatchException(); }" + NL
|
||||
else:
|
||||
result += routine.routine_header_type_cpp(12) + " {" + NL
|
||||
result += routine.routine_header_type_cpp(12, cuda) + " {" + NL
|
||||
result += " return StatusCode::kNotImplemented;" + NL
|
||||
result += "}" + NL
|
||||
for flavour in routine.flavours:
|
||||
indent2 = " " * (34 + routine.length() + len(flavour.template))
|
||||
result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
|
||||
result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
|
||||
result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
|
||||
arguments = routine.arguments_type(flavour)
|
||||
if cuda:
|
||||
arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
|
||||
result += ("," + NL + indent2).join([a for a in arguments])
|
||||
result += "," + NL + indent2
|
||||
if cuda:
|
||||
result += "const CUcontext, const CUdevice"
|
||||
else:
|
||||
result += "cl_command_queue*, cl_event*"
|
||||
result += ");" + NL
|
||||
return result
|
||||
|
||||
|
||||
|
@ -364,7 +378,9 @@ def performance_test(routine, level_string):
|
|||
found = False
|
||||
for flavour in routine.flavours:
|
||||
if flavour.precision_name == precision:
|
||||
result += NL + " clblast::RunClient<clblast::TestX" + routine.plain_name() + flavour.test_template()
|
||||
extra_template_argument = "0, " if routine.name == "gemm" and not routine.batched else ""
|
||||
result += NL + " clblast::RunClient<clblast::TestX" + routine.plain_name()
|
||||
result += flavour.test_template(extra_template_argument)
|
||||
result += ">(argc, argv); break;" + NL
|
||||
found = True
|
||||
if not found:
|
||||
|
@ -384,10 +400,13 @@ def correctness_test(routine, level_string):
|
|||
result += "int main(int argc, char *argv[]) {" + NL
|
||||
result += " auto errors = size_t{0};" + NL
|
||||
not_first = "false"
|
||||
for flavour in routine.flavours:
|
||||
result += " errors += clblast::RunTests<clblast::TestX" + routine.plain_name() + flavour.test_template()
|
||||
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.upper_name() + "\");" + NL
|
||||
not_first = "true"
|
||||
extra_template_arguments = ["1, ", "2, "] if routine.name == "gemm" and not routine.batched else [""]
|
||||
for extra_template_argument in extra_template_arguments:
|
||||
for flavour in routine.flavours:
|
||||
result += " errors += clblast::RunTests<clblast::TestX" + routine.plain_name()
|
||||
result += flavour.test_template(extra_template_argument)
|
||||
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.upper_name() + "\");" + NL
|
||||
not_first = "true"
|
||||
result += " if (errors > 0) { return 1; } else { return 0; }" + NL
|
||||
result += "}" + NL
|
||||
return result
|
||||
|
|
|
@ -70,13 +70,13 @@ class DataType:
|
|||
return self.beta_cpp + "{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}"
|
||||
return "beta"
|
||||
|
||||
def test_template(self):
|
||||
def test_template(self, extra_template_argument):
|
||||
"""Returns the template as used in the correctness/performance tests"""
|
||||
buffer_type = "clblast::" + self.buffer_type if self.is_non_standard() else self.buffer_type
|
||||
beta_cpp = "clblast::" + self.beta_cpp if self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2] else self.beta_cpp
|
||||
if self.buffer_type != self.beta_cpp:
|
||||
return "<" + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp
|
||||
return "<" + buffer_type + ">, " + buffer_type + ", " + beta_cpp
|
||||
return "<" + extra_template_argument + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp
|
||||
return "<" + extra_template_argument + buffer_type + ">, " + buffer_type + ", " + beta_cpp
|
||||
|
||||
def is_complex(self, scalar):
|
||||
"""Current scalar is complex"""
|
||||
|
|
|
@ -802,22 +802,38 @@ class Routine:
|
|||
"""Retrieves a list of routine requirements for documentation"""
|
||||
return self.requirements
|
||||
|
||||
def routine_header_cpp(self, spaces, default_event):
|
||||
def routine_header_cpp(self, spaces, default_event, cuda=False):
|
||||
"""Retrieves the C++ templated definition for a routine"""
|
||||
indent = " " * (spaces + self.length())
|
||||
arguments = self.arguments_def(self.template)
|
||||
if cuda:
|
||||
arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
|
||||
result = "template <" + self.template.name + ">\n"
|
||||
result += "StatusCode " + self.capitalized_name() + "("
|
||||
result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
|
||||
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
|
||||
result += (",\n" + indent).join([a for a in arguments])
|
||||
result += ",\n" + indent
|
||||
if cuda:
|
||||
result += "const CUcontext context, const CUdevice device"
|
||||
else:
|
||||
result += "cl_command_queue* queue, cl_event* event" + default_event
|
||||
result += ")"
|
||||
return result
|
||||
|
||||
def routine_header_type_cpp(self, spaces):
|
||||
def routine_header_type_cpp(self, spaces, cuda=False):
|
||||
"""As above, but now without variable names"""
|
||||
indent = " " * (spaces + self.length())
|
||||
arguments = self.arguments_type(self.template)
|
||||
if cuda:
|
||||
arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
|
||||
result = "template <" + self.template.name + ">\n"
|
||||
result += "StatusCode " + self.capitalized_name() + "("
|
||||
result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
|
||||
result += ",\n" + indent + "cl_command_queue*, cl_event*)"
|
||||
result += (",\n" + indent).join([a for a in arguments])
|
||||
result += ",\n" + indent
|
||||
if cuda:
|
||||
result += "const CUcontext, const CUdevice"
|
||||
else:
|
||||
result += "cl_command_queue*, cl_event*"
|
||||
result += ")"
|
||||
return result
|
||||
|
||||
def routine_header_c(self, flavour, spaces, extra_qualifier):
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the common (non-OpenCL-specific) functions of the CLBlast API.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "cache.hpp"
|
||||
#include "routines/routines.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries
|
||||
StatusCode ClearCache() {
|
||||
try {
|
||||
ProgramCache::Instance().Invalidate();
|
||||
BinaryCache::Instance().Invalidate();
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
template <typename Real, typename Complex>
|
||||
void FillCacheForPrecision(Queue &queue) {
|
||||
try {
|
||||
|
||||
// Runs all the level 1 set-up functions
|
||||
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
|
||||
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
|
||||
Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr);
|
||||
Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr);
|
||||
Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr);
|
||||
Xdot<Real>(queue, nullptr);
|
||||
Xdotu<Complex>(queue, nullptr);
|
||||
Xdotc<Complex>(queue, nullptr);
|
||||
Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr);
|
||||
Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr);
|
||||
Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr);
|
||||
Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr);
|
||||
Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr);
|
||||
Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr);
|
||||
|
||||
// Runs all the level 2 set-up functions
|
||||
Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr);
|
||||
Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr);
|
||||
Xhemv<Complex>(queue, nullptr);
|
||||
Xhbmv<Complex>(queue, nullptr);
|
||||
Xhpmv<Complex>(queue, nullptr);
|
||||
Xsymv<Real>(queue, nullptr);
|
||||
Xsbmv<Real>(queue, nullptr);
|
||||
Xspmv<Real>(queue, nullptr);
|
||||
Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr);
|
||||
Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr);
|
||||
Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr);
|
||||
Xger<Real>(queue, nullptr);
|
||||
Xgeru<Complex>(queue, nullptr);
|
||||
Xgerc<Complex>(queue, nullptr);
|
||||
Xher<Complex,Real>(queue, nullptr);
|
||||
Xhpr<Complex,Real>(queue, nullptr);
|
||||
Xher2<Complex>(queue, nullptr);
|
||||
Xhpr2<Complex>(queue, nullptr);
|
||||
Xsyr<Real>(queue, nullptr);
|
||||
Xspr<Real>(queue, nullptr);
|
||||
Xsyr2<Real>(queue, nullptr);
|
||||
Xspr2<Real>(queue, nullptr);
|
||||
|
||||
// Runs all the level 3 set-up functions
|
||||
Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr);
|
||||
Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr);
|
||||
Xhemm<Complex>(queue, nullptr);
|
||||
Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr);
|
||||
Xherk<Complex,Real>(queue, nullptr);
|
||||
Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr);
|
||||
Xher2k<Complex,Real>(queue, nullptr);
|
||||
Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr);
|
||||
|
||||
// Runs all the non-BLAS set-up functions
|
||||
Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr);
|
||||
|
||||
} catch(const RuntimeErrorCode &e) {
|
||||
if (e.status() != StatusCode::kNoDoublePrecision &&
|
||||
e.status() != StatusCode::kNoHalfPrecision) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fills the cache with all binaries for a specific device
|
||||
// TODO: Add half-precision FP16 set-up calls
|
||||
StatusCode FillCache(const RawDeviceID device) {
|
||||
try {
|
||||
|
||||
// Creates a sample context and queue to match the normal routine calling conventions
|
||||
auto device_cpp = Device(device);
|
||||
auto context = Context(device_cpp);
|
||||
auto queue = Queue(context, device_cpp);
|
||||
|
||||
FillCacheForPrecision<float, float2>(queue);
|
||||
FillCacheForPrecision<double, double2>(queue);
|
||||
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Overrides the tuning parameters for this device-precision-kernel combination
|
||||
StatusCode OverrideParameters(const RawDeviceID device, const std::string &kernel_name,
|
||||
const Precision precision,
|
||||
const std::unordered_map<std::string,size_t> ¶meters) {
|
||||
try {
|
||||
|
||||
// Retrieves the device name
|
||||
const auto device_cpp = Device(device);
|
||||
const auto platform_id = device_cpp.PlatformID();
|
||||
const auto device_name = GetDeviceName(device_cpp);
|
||||
|
||||
// Retrieves the current database values to verify whether the new ones are complete
|
||||
auto in_cache = false;
|
||||
auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache);
|
||||
if (!in_cache) {
|
||||
log_debug("Searching database for kernel '" + kernel_name + "'");
|
||||
current_database = Database(device_cpp, kernel_name, precision, {});
|
||||
}
|
||||
|
||||
// Verifies the parameters size
|
||||
const auto current_parameter_names = current_database.GetParameterNames();
|
||||
if (current_parameter_names.size() != parameters.size()) {
|
||||
return StatusCode::kMissingOverrideParameter;
|
||||
}
|
||||
|
||||
// Retrieves the names and values separately and in the same order as the existing database
|
||||
auto parameter_values = database::Params{0};
|
||||
auto i = size_t{0};
|
||||
for (const auto ¤t_param : current_parameter_names) {
|
||||
if (parameters.find(current_param) == parameters.end()) {
|
||||
return StatusCode::kMissingOverrideParameter;
|
||||
}
|
||||
const auto parameter_value = parameters.at(current_param);
|
||||
parameter_values[i] = parameter_value;
|
||||
++i;
|
||||
}
|
||||
|
||||
// Creates a small custom database based on the provided parameters
|
||||
const auto database_device = database::DatabaseDevice{database::kDeviceNameDefault, parameter_values};
|
||||
const auto database_architecture = database::DatabaseArchitecture{"default", {database_device}};
|
||||
const auto database_vendor = database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_architecture}};
|
||||
const auto database_entry = database::DatabaseEntry{kernel_name, precision, current_parameter_names, {database_vendor}};
|
||||
const auto database_entries = std::vector<database::DatabaseEntry>{database_entry};
|
||||
const auto database = Database(device_cpp, kernel_name, precision, database_entries);
|
||||
|
||||
// Removes the old database entry and stores the new one in the cache
|
||||
DatabaseCache::Instance().Remove(DatabaseKey{platform_id, device, precision, kernel_name});
|
||||
DatabaseCache::Instance().Store(DatabaseKey{platform_id, device, precision, kernel_name}, Database(database));
|
||||
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
|
@ -80,8 +80,8 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
|
|||
|
||||
// The key struct for the cache of compiled OpenCL programs (context-dependent)
|
||||
// Order of fields: context, device_id, precision, routine_name (smaller fields first)
|
||||
typedef std::tuple<cl_context, cl_device_id, Precision, std::string> ProgramKey;
|
||||
typedef std::tuple<const cl_context &, const cl_device_id &, const Precision &, const std::string &> ProgramKeyRef;
|
||||
typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
|
||||
typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;
|
||||
|
||||
typedef Cache<ProgramKey, Program> ProgramCache;
|
||||
|
||||
|
@ -94,8 +94,8 @@ class Database;
|
|||
|
||||
// The key struct for the cache of database maps.
|
||||
// Order of fields: platform_id, device_id, precision, kernel_name (smaller fields first)
|
||||
typedef std::tuple<cl_platform_id, cl_device_id, Precision, std::string> DatabaseKey;
|
||||
typedef std::tuple<const cl_platform_id &, const cl_device_id &, const Precision &, const std::string &> DatabaseKeyRef;
|
||||
typedef std::tuple<RawPlatformID, RawDeviceID, Precision, std::string> DatabaseKey;
|
||||
typedef std::tuple<const RawPlatformID &, const RawDeviceID &, const Precision &, const std::string &> DatabaseKeyRef;
|
||||
|
||||
typedef Cache<DatabaseKey, Database> DatabaseCache;
|
||||
|
||||
|
|
208
src/clblast.cpp
208
src/clblast.cpp
|
@ -15,67 +15,9 @@
|
|||
|
||||
#include <string>
|
||||
|
||||
#include "cache.hpp"
|
||||
#include "routines/routines.hpp"
|
||||
#include "clblast.h"
|
||||
|
||||
// BLAS level-1 includes
|
||||
#include "routines/level1/xswap.hpp"
|
||||
#include "routines/level1/xscal.hpp"
|
||||
#include "routines/level1/xcopy.hpp"
|
||||
#include "routines/level1/xaxpy.hpp"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
#include "routines/level1/xdotu.hpp"
|
||||
#include "routines/level1/xdotc.hpp"
|
||||
#include "routines/level1/xnrm2.hpp"
|
||||
#include "routines/level1/xasum.hpp"
|
||||
#include "routines/level1/xsum.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xamax.hpp"
|
||||
#include "routines/level1/xamin.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xmax.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xmin.hpp" // non-BLAS routine
|
||||
|
||||
// BLAS level-2 includes
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
#include "routines/level2/xgbmv.hpp"
|
||||
#include "routines/level2/xhemv.hpp"
|
||||
#include "routines/level2/xhbmv.hpp"
|
||||
#include "routines/level2/xhpmv.hpp"
|
||||
#include "routines/level2/xsymv.hpp"
|
||||
#include "routines/level2/xsbmv.hpp"
|
||||
#include "routines/level2/xspmv.hpp"
|
||||
#include "routines/level2/xtrmv.hpp"
|
||||
#include "routines/level2/xtbmv.hpp"
|
||||
#include "routines/level2/xtpmv.hpp"
|
||||
#include "routines/level2/xtrsv.hpp"
|
||||
#include "routines/level2/xger.hpp"
|
||||
#include "routines/level2/xgeru.hpp"
|
||||
#include "routines/level2/xgerc.hpp"
|
||||
#include "routines/level2/xher.hpp"
|
||||
#include "routines/level2/xhpr.hpp"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
#include "routines/level2/xhpr2.hpp"
|
||||
#include "routines/level2/xsyr.hpp"
|
||||
#include "routines/level2/xspr.hpp"
|
||||
#include "routines/level2/xsyr2.hpp"
|
||||
#include "routines/level2/xspr2.hpp"
|
||||
|
||||
// BLAS level-3 includes
|
||||
#include "routines/level3/xgemm.hpp"
|
||||
#include "routines/level3/xsymm.hpp"
|
||||
#include "routines/level3/xhemm.hpp"
|
||||
#include "routines/level3/xsyrk.hpp"
|
||||
#include "routines/level3/xherk.hpp"
|
||||
#include "routines/level3/xsyr2k.hpp"
|
||||
#include "routines/level3/xher2k.hpp"
|
||||
#include "routines/level3/xtrmm.hpp"
|
||||
#include "routines/level3/xtrsm.hpp"
|
||||
|
||||
// Level-x includes (non-BLAS)
|
||||
#include "routines/levelx/xomatcopy.hpp"
|
||||
#include "routines/levelx/xim2col.hpp"
|
||||
#include "routines/levelx/xaxpybatched.hpp"
|
||||
#include "routines/levelx/xgemmbatched.hpp"
|
||||
|
||||
namespace clblast {
|
||||
|
||||
// =================================================================================================
|
||||
|
@ -2389,154 +2331,6 @@ template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose,
|
|||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries
|
||||
StatusCode ClearCache() {
|
||||
try {
|
||||
ProgramCache::Instance().Invalidate();
|
||||
BinaryCache::Instance().Invalidate();
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
template <typename Real, typename Complex>
|
||||
void FillCacheForPrecision(Queue &queue) {
|
||||
try {
|
||||
|
||||
// Runs all the level 1 set-up functions
|
||||
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
|
||||
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
|
||||
Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr);
|
||||
Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr);
|
||||
Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr);
|
||||
Xdot<Real>(queue, nullptr);
|
||||
Xdotu<Complex>(queue, nullptr);
|
||||
Xdotc<Complex>(queue, nullptr);
|
||||
Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr);
|
||||
Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr);
|
||||
Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr);
|
||||
Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr);
|
||||
Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr);
|
||||
Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr);
|
||||
|
||||
// Runs all the level 2 set-up functions
|
||||
Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr);
|
||||
Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr);
|
||||
Xhemv<Complex>(queue, nullptr);
|
||||
Xhbmv<Complex>(queue, nullptr);
|
||||
Xhpmv<Complex>(queue, nullptr);
|
||||
Xsymv<Real>(queue, nullptr);
|
||||
Xsbmv<Real>(queue, nullptr);
|
||||
Xspmv<Real>(queue, nullptr);
|
||||
Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr);
|
||||
Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr);
|
||||
Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr);
|
||||
Xger<Real>(queue, nullptr);
|
||||
Xgeru<Complex>(queue, nullptr);
|
||||
Xgerc<Complex>(queue, nullptr);
|
||||
Xher<Complex,Real>(queue, nullptr);
|
||||
Xhpr<Complex,Real>(queue, nullptr);
|
||||
Xher2<Complex>(queue, nullptr);
|
||||
Xhpr2<Complex>(queue, nullptr);
|
||||
Xsyr<Real>(queue, nullptr);
|
||||
Xspr<Real>(queue, nullptr);
|
||||
Xsyr2<Real>(queue, nullptr);
|
||||
Xspr2<Real>(queue, nullptr);
|
||||
|
||||
// Runs all the level 3 set-up functions
|
||||
Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr);
|
||||
Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr);
|
||||
Xhemm<Complex>(queue, nullptr);
|
||||
Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr);
|
||||
Xherk<Complex,Real>(queue, nullptr);
|
||||
Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr);
|
||||
Xher2k<Complex,Real>(queue, nullptr);
|
||||
Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr);
|
||||
|
||||
// Runs all the non-BLAS set-up functions
|
||||
Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr);
|
||||
|
||||
} catch(const RuntimeErrorCode &e) {
|
||||
if (e.status() != StatusCode::kNoDoublePrecision &&
|
||||
e.status() != StatusCode::kNoHalfPrecision) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fills the cache with all binaries for a specific device
|
||||
// TODO: Add half-precision FP16 set-up calls
|
||||
StatusCode FillCache(const cl_device_id device) {
|
||||
try {
|
||||
|
||||
// Creates a sample context and queue to match the normal routine calling conventions
|
||||
auto device_cpp = Device(device);
|
||||
auto context = Context(device_cpp);
|
||||
auto queue = Queue(context, device_cpp);
|
||||
|
||||
FillCacheForPrecision<float, float2>(queue);
|
||||
FillCacheForPrecision<double, double2>(queue);
|
||||
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Overrides the tuning parameters for this device-precision-kernel combination
|
||||
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
|
||||
const Precision precision,
|
||||
const std::unordered_map<std::string,size_t> ¶meters) {
|
||||
try {
|
||||
|
||||
// Retrieves the device name
|
||||
const auto device_cpp = Device(device);
|
||||
const auto platform_id = device_cpp.Platform();
|
||||
const auto device_name = GetDeviceName(device_cpp);
|
||||
|
||||
// Retrieves the current database values to verify whether the new ones are complete
|
||||
auto in_cache = false;
|
||||
const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache);
|
||||
if (!in_cache) { return StatusCode::kInvalidOverrideKernel; }
|
||||
for (const auto ¤t_param : current_database.GetParameterNames()) {
|
||||
if (parameters.find(current_param) == parameters.end()) {
|
||||
return StatusCode::kMissingOverrideParameter;
|
||||
}
|
||||
}
|
||||
|
||||
// Clears the existing program & binary cache for routines with the target kernel
|
||||
const auto routine_names = Routine::routines_by_kernel.at(kernel_name);
|
||||
for (const auto &routine_name : routine_names) {
|
||||
ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, device, precision, routine_name});
|
||||
BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name});
|
||||
}
|
||||
|
||||
// Retrieves the names and values separately
|
||||
auto parameter_values = database::Params{0};
|
||||
auto parameter_names = std::vector<std::string>();
|
||||
auto i = size_t{0};
|
||||
for (const auto ¶meter : parameters) {
|
||||
parameter_values[i] = parameter.second;
|
||||
parameter_names.push_back(parameter.first);
|
||||
++i;
|
||||
}
|
||||
|
||||
// Creates a small custom database based on the provided parameters
|
||||
const auto database_device = database::DatabaseDevice{database::kDeviceNameDefault, parameter_values};
|
||||
const auto database_architecture = database::DatabaseArchitecture{"default", {database_device}};
|
||||
const auto database_vendor = database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_architecture}};
|
||||
const auto database_entry = database::DatabaseEntry{kernel_name, precision, parameter_names, {database_vendor}};
|
||||
const auto database_entries = std::vector<database::DatabaseEntry>{database_entry};
|
||||
const auto database = Database(device_cpp, kernel_name, precision, database_entries);
|
||||
|
||||
// Removes the old database entry and stores the new one in the cache
|
||||
DatabaseCache::Instance().Remove(DatabaseKey{platform_id, device, precision, kernel_name});
|
||||
DatabaseCache::Instance().Store(DatabaseKey{platform_id, device, precision, kernel_name}, Database(database));
|
||||
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -60,34 +60,36 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Represents a runtime error returned by an OpenCL API function
|
||||
class CLError : public ErrorCode<DeviceError, cl_int> {
|
||||
class CLCudaAPIError : public ErrorCode<DeviceError, cl_int> {
|
||||
public:
|
||||
explicit CLError(cl_int status, const std::string &where):
|
||||
ErrorCode(status,
|
||||
where,
|
||||
"OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
|
||||
explicit CLCudaAPIError(cl_int status, const std::string &where):
|
||||
ErrorCode(status, where, "OpenCL error: " + where + ": " +
|
||||
std::to_string(static_cast<int>(status))) {
|
||||
}
|
||||
|
||||
static void Check(const cl_int status, const std::string &where) {
|
||||
if (status != CL_SUCCESS) {
|
||||
throw CLError(status, where);
|
||||
throw CLCudaAPIError(status, where);
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckDtor(const cl_int status, const std::string &where) {
|
||||
if (status != CL_SUCCESS) {
|
||||
fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
|
||||
fprintf(stderr, "CLBlast: %s (ignoring)\n", CLCudaAPIError(status, where).what());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Exception returned when building a program
|
||||
using CLCudaAPIBuildError = CLCudaAPIError;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Error occurred in OpenCL
|
||||
#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
|
||||
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
|
||||
|
||||
// Error occured in OpenCL (no-exception version for destructors)
|
||||
#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
|
||||
// Error occurred in OpenCL (no-exception version for destructors)
|
||||
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
@ -143,6 +145,9 @@ using EventPointer = cl_event*;
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw platform ID type
|
||||
using RawPlatformID = cl_platform_id;
|
||||
|
||||
// C++11 version of 'cl_platform_id'
|
||||
class Platform {
|
||||
public:
|
||||
|
@ -178,7 +183,7 @@ class Platform {
|
|||
}
|
||||
|
||||
// Accessor to the private data-member
|
||||
const cl_platform_id& operator()() const { return platform_; }
|
||||
const RawPlatformID& operator()() const { return platform_; }
|
||||
private:
|
||||
cl_platform_id platform_;
|
||||
|
||||
|
@ -207,6 +212,9 @@ inline std::vector<Platform> GetAllPlatforms() {
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw device ID type
|
||||
using RawDeviceID = cl_device_id;
|
||||
|
||||
// C++11 version of 'cl_device_id'
|
||||
class Device {
|
||||
public:
|
||||
|
@ -231,7 +239,7 @@ class Device {
|
|||
}
|
||||
|
||||
// Methods to retrieve device information
|
||||
cl_platform_id Platform() const { return GetInfo<cl_platform_id>(CL_DEVICE_PLATFORM); }
|
||||
RawPlatformID PlatformID() const { return GetInfo<cl_platform_id>(CL_DEVICE_PLATFORM); }
|
||||
std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
|
||||
size_t VersionNumber() const
|
||||
{
|
||||
|
@ -263,11 +271,19 @@ class Device {
|
|||
unsigned long LocalMemSize() const {
|
||||
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
|
||||
}
|
||||
|
||||
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
|
||||
bool HasExtension(const std::string &extension) const {
|
||||
const auto extensions = Capabilities();
|
||||
return extensions.find(extension) != std::string::npos;
|
||||
}
|
||||
bool SupportsFP64() const {
|
||||
return HasExtension("cl_khr_fp64");
|
||||
}
|
||||
bool SupportsFP16() const {
|
||||
if (Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially
|
||||
return HasExtension("cl_khr_fp16");
|
||||
}
|
||||
|
||||
size_t CoreClock() const {
|
||||
return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
|
||||
|
@ -331,9 +347,8 @@ class Device {
|
|||
std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
|
||||
}
|
||||
|
||||
|
||||
// Accessor to the private data-member
|
||||
const cl_device_id& operator()() const { return device_; }
|
||||
const RawDeviceID& operator()() const { return device_; }
|
||||
private:
|
||||
cl_device_id device_;
|
||||
|
||||
|
@ -367,6 +382,9 @@ class Device {
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw context type
|
||||
using RawContext = cl_context;
|
||||
|
||||
// C++11 version of 'cl_context'
|
||||
class Context {
|
||||
public:
|
||||
|
@ -386,12 +404,12 @@ class Context {
|
|||
auto status = CL_SUCCESS;
|
||||
const cl_device_id dev = device();
|
||||
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
|
||||
CLError::Check(status, "clCreateContext");
|
||||
CLCudaAPIError::Check(status, "clCreateContext");
|
||||
}
|
||||
|
||||
// Accessor to the private data-member
|
||||
const cl_context& operator()() const { return *context_; }
|
||||
cl_context* pointer() const { return &(*context_); }
|
||||
const RawContext& operator()() const { return *context_; }
|
||||
RawContext* pointer() const { return &(*context_); }
|
||||
private:
|
||||
std::shared_ptr<cl_context> context_;
|
||||
};
|
||||
|
@ -401,9 +419,6 @@ using ContextPointer = cl_context*;
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Enumeration of build statuses of the run-time compilation process
|
||||
enum class BuildStatus { kSuccess, kError, kInvalid };
|
||||
|
||||
// C++11 version of 'cl_program'.
|
||||
class Program {
|
||||
public:
|
||||
|
@ -416,10 +431,10 @@ class Program {
|
|||
delete p;
|
||||
}) {
|
||||
const char *source_ptr = &source[0];
|
||||
size_t length = source.length();
|
||||
const auto length = source.length();
|
||||
auto status = CL_SUCCESS;
|
||||
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
|
||||
CLError::Check(status, "clCreateProgramWithSource");
|
||||
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
|
||||
}
|
||||
|
||||
// Binary-based constructor with memory management
|
||||
|
@ -429,18 +444,18 @@ class Program {
|
|||
delete p;
|
||||
}) {
|
||||
const char *binary_ptr = &binary[0];
|
||||
size_t length = binary.length();
|
||||
const auto length = binary.length();
|
||||
auto status1 = CL_SUCCESS;
|
||||
auto status2 = CL_SUCCESS;
|
||||
const cl_device_id dev = device();
|
||||
const auto dev = device();
|
||||
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
|
||||
reinterpret_cast<const unsigned char**>(&binary_ptr),
|
||||
&status1, &status2);
|
||||
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
|
||||
CLError::Check(status2, "clCreateProgramWithBinary");
|
||||
CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
|
||||
CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
|
||||
}
|
||||
|
||||
// Compiles the device program and returns whether or not there where any warnings/errors
|
||||
// Compiles the device program and checks whether or not there are any warnings/errors
|
||||
void Build(const Device &device, std::vector<std::string> &options) {
|
||||
options.push_back("-cl-std=CL1.1");
|
||||
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
|
||||
|
@ -448,6 +463,11 @@ class Program {
|
|||
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
|
||||
}
|
||||
|
||||
// Confirms whether a certain status code is an actual compilation error or warning
|
||||
bool StatusIsCompilationWarningOrError(const cl_int status) const {
|
||||
return (status == CL_BUILD_PROGRAM_FAILURE);
|
||||
}
|
||||
|
||||
// Retrieves the warning/error message from the compiler (if any)
|
||||
std::string GetBuildInfo(const Device &device) const {
|
||||
auto bytes = size_t{0};
|
||||
|
@ -478,6 +498,9 @@ class Program {
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw command-queue type
|
||||
using RawCommandQueue = cl_command_queue;
|
||||
|
||||
// C++11 version of 'cl_command_queue'
|
||||
class Queue {
|
||||
public:
|
||||
|
@ -496,7 +519,7 @@ class Queue {
|
|||
}) {
|
||||
auto status = CL_SUCCESS;
|
||||
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
|
||||
CLError::Check(status, "clCreateCommandQueue");
|
||||
CLCudaAPIError::Check(status, "clCreateCommandQueue");
|
||||
}
|
||||
|
||||
// Synchronizes the queue
|
||||
|
@ -524,7 +547,7 @@ class Queue {
|
|||
}
|
||||
|
||||
// Accessor to the private data-member
|
||||
const cl_command_queue& operator()() const { return *queue_; }
|
||||
const RawCommandQueue& operator()() const { return *queue_; }
|
||||
private:
|
||||
std::shared_ptr<cl_command_queue> queue_;
|
||||
};
|
||||
|
@ -588,7 +611,7 @@ class Buffer {
|
|||
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
|
||||
auto status = CL_SUCCESS;
|
||||
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
|
||||
CLError::Check(status, "clCreateBuffer");
|
||||
CLCudaAPIError::Check(status, "clCreateBuffer");
|
||||
}
|
||||
|
||||
// As above, but now with read/write access as a default
|
||||
|
@ -646,6 +669,9 @@ class Buffer {
|
|||
|
||||
// Copies from host to device: writing the device buffer a-synchronously
|
||||
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||
if (access_ == BufferAccess::kReadOnly) {
|
||||
throw LogicError("Buffer: writing to a read-only buffer");
|
||||
}
|
||||
if (GetSize() < (offset+size)*sizeof(T)) {
|
||||
throw LogicError("Buffer: target device buffer is too small");
|
||||
}
|
||||
|
@ -720,7 +746,7 @@ class Kernel {
|
|||
}) {
|
||||
auto status = CL_SUCCESS;
|
||||
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
|
||||
CLError::Check(status, "clCreateKernel");
|
||||
CLCudaAPIError::Check(status, "clCreateKernel");
|
||||
}
|
||||
|
||||
// Sets a kernel argument at the indicated position
|
||||
|
|
|
@ -0,0 +1,782 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
|
||||
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
|
||||
// Portability here means that a similar header exists for CUDA with the same classes and
|
||||
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
|
||||
//
|
||||
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
|
||||
// therefore contains the following header copyright notice:
|
||||
//
|
||||
// =================================================================================================
|
||||
//
|
||||
// Copyright 2015 SURFsara
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_CUPP11_H_
|
||||
#define CLBLAST_CUPP11_H_
|
||||
|
||||
// C++
|
||||
#include <algorithm> // std::copy
|
||||
#include <string> // std::string
|
||||
#include <vector> // std::vector
|
||||
#include <memory> // std::shared_ptr
|
||||
#include <cstring> // std::strlen
|
||||
|
||||
// CUDA
|
||||
#define CUDA_NO_HALF // Incompatible with CLBlast's definition; TODO: resolve this
|
||||
#include <cuda.h> // CUDA driver API
|
||||
#include <nvrtc.h> // NVIDIA runtime compilation API
|
||||
|
||||
// Exception classes
|
||||
#include "cxpp11_common.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Max-length of strings
|
||||
constexpr auto kStringLength = 256;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Represents a runtime error returned by a CUDA driver API function
|
||||
class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
|
||||
public:
|
||||
explicit CLCudaAPIError(CUresult status, const std::string &where):
|
||||
ErrorCode(status, where, "CUDA error: " + where + ": " +
|
||||
GetErrorName(status) + " --> " + GetErrorString(status)) {
|
||||
}
|
||||
|
||||
static void Check(const CUresult status, const std::string &where) {
|
||||
if (status != CUDA_SUCCESS) {
|
||||
throw CLCudaAPIError(status, where);
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckDtor(const CUresult status, const std::string &where) {
|
||||
if (status != CUDA_SUCCESS) {
|
||||
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::string GetErrorName(CUresult status) const {
|
||||
const char* status_code;
|
||||
cuGetErrorName(status, &status_code);
|
||||
return std::string(status_code);
|
||||
}
|
||||
std::string GetErrorString(CUresult status) const {
|
||||
const char* status_string;
|
||||
cuGetErrorString(status, &status_string);
|
||||
return std::string(status_string);
|
||||
}
|
||||
};
|
||||
|
||||
// Represents a runtime error returned by a CUDA runtime compilation API function
|
||||
class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
|
||||
public:
|
||||
explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
|
||||
ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
|
||||
}
|
||||
|
||||
static void Check(const nvrtcResult status, const std::string &where) {
|
||||
if (status != NVRTC_SUCCESS) {
|
||||
throw CLCudaAPINVRTCError(status, where);
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckDtor(const nvrtcResult status, const std::string &where) {
|
||||
if (status != NVRTC_SUCCESS) {
|
||||
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::string GetErrorString(nvrtcResult status) const {
|
||||
const char* status_string = nvrtcGetErrorString(status);
|
||||
return std::string(status_string);
|
||||
}
|
||||
};
|
||||
|
||||
// Exception returned when building a program
|
||||
using CLCudaAPIBuildError = CLCudaAPINVRTCError;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Error occurred in CUDA driver or runtime compilation API
|
||||
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
|
||||
#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))
|
||||
|
||||
// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
|
||||
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
|
||||
#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// C++11 version of two 'CUevent' pointers
|
||||
class Event {
|
||||
public:
|
||||
// Note that there is no constructor based on the regular CUDA data-type because of extra state
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit Event():
|
||||
start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }),
|
||||
end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) {
|
||||
CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
|
||||
CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
|
||||
}
|
||||
|
||||
// Waits for completion of this event (not implemented for CUDA)
|
||||
void WaitForCompletion() const { } // not needed due to cuStreamSynchronize call after each kernel launch
|
||||
|
||||
// Retrieves the elapsed time of the last recorded event
|
||||
float GetElapsedTime() const {
|
||||
auto result = 0.0f;
|
||||
cuEventElapsedTime(&result, *start_, *end_);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Accessors to the private data-members
|
||||
const CUevent& start() const { return *start_; }
|
||||
const CUevent& end() const { return *end_; }
|
||||
Event* pointer() { return this; }
|
||||
private:
|
||||
std::shared_ptr<CUevent> start_;
|
||||
std::shared_ptr<CUevent> end_;
|
||||
};
|
||||
|
||||
// Pointer to a CUDA event
|
||||
using EventPointer = Event*;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw platform ID type
|
||||
using RawPlatformID = size_t;
|
||||
|
||||
// The CUDA platform: initializes the CUDA driver API
|
||||
class Platform {
|
||||
public:
|
||||
|
||||
// Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
|
||||
explicit Platform(const size_t platform_id) : platform_id_(0) {
|
||||
if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
|
||||
CheckError(cuInit(0));
|
||||
}
|
||||
|
||||
// Methods to retrieve platform information
|
||||
std::string Name() const { return "CUDA"; }
|
||||
std::string Vendor() const { return "NVIDIA Corporation"; }
|
||||
std::string Version() const {
|
||||
auto result = 0;
|
||||
CheckError(cuDriverGetVersion(&result));
|
||||
return "CUDA driver "+std::to_string(result);
|
||||
}
|
||||
|
||||
// Returns the number of devices on this platform
|
||||
size_t NumDevices() const {
|
||||
auto result = 0;
|
||||
CheckError(cuDeviceGetCount(&result));
|
||||
return static_cast<size_t>(result);
|
||||
}
|
||||
|
||||
// Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
|
||||
const RawPlatformID& operator()() const { return platform_id_; }
|
||||
private:
|
||||
const size_t platform_id_;
|
||||
};
|
||||
|
||||
// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
|
||||
inline std::vector<Platform> GetAllPlatforms() {
|
||||
auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
|
||||
return all_platforms;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw device ID type
|
||||
using RawDeviceID = CUdevice;
|
||||
|
||||
// C++11 version of 'CUdevice'
|
||||
class Device {
|
||||
public:
|
||||
|
||||
// Constructor based on the regular CUDA data-type
|
||||
explicit Device(const CUdevice device): device_(device) { }
|
||||
|
||||
// Initialization
|
||||
explicit Device(const Platform &platform, const size_t device_id) {
|
||||
auto num_devices = platform.NumDevices();
|
||||
if (num_devices == 0) {
|
||||
throw RuntimeError("Device: no devices found");
|
||||
}
|
||||
if (device_id >= num_devices) {
|
||||
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
|
||||
}
|
||||
|
||||
CheckError(cuDeviceGet(&device_, device_id));
|
||||
}
|
||||
|
||||
// Methods to retrieve device information
|
||||
RawPlatformID PlatformID() const { return 0; }
|
||||
std::string Version() const {
|
||||
auto result = 0;
|
||||
CheckError(cuDriverGetVersion(&result));
|
||||
return "CUDA driver "+std::to_string(result);
|
||||
}
|
||||
size_t VersionNumber() const {
|
||||
auto result = 0;
|
||||
CheckError(cuDriverGetVersion(&result));
|
||||
return static_cast<size_t>(result);
|
||||
}
|
||||
std::string Vendor() const { return "NVIDIA Corporation"; }
|
||||
std::string Name() const {
|
||||
auto result = std::string{};
|
||||
result.resize(kStringLength);
|
||||
CheckError(cuDeviceGetName(&result[0], result.size(), device_));
|
||||
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
|
||||
return result;
|
||||
}
|
||||
std::string Type() const { return "GPU"; }
|
||||
size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); }
|
||||
size_t MaxWorkItemDimensions() const { return size_t{3}; }
|
||||
std::vector<size_t> MaxWorkItemSizes() const {
|
||||
return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
|
||||
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
|
||||
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
|
||||
}
|
||||
unsigned long LocalMemSize() const {
|
||||
return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
|
||||
}
|
||||
|
||||
std::string Capabilities() const {
|
||||
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
|
||||
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
|
||||
return "SM"+std::to_string(major)+"."+std::to_string(minor);
|
||||
}
|
||||
std::string ComputeArch() const {
|
||||
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
|
||||
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
|
||||
return "compute_"+std::to_string(major)+std::to_string(minor);
|
||||
}
|
||||
bool HasExtension(const std::string &extension) const { return false; }
|
||||
bool SupportsFP64() const { return true; }
|
||||
bool SupportsFP16() const {
|
||||
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
|
||||
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
|
||||
if (major > 5) { return true; } // SM 6.x, 7.x and higher
|
||||
if (major == 5 && minor == 3) { return true; } // SM 5.3
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); }
|
||||
size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
|
||||
unsigned long MemorySize() const {
|
||||
auto result = size_t{0};
|
||||
CheckError(cuDeviceTotalMem(&result, device_));
|
||||
return static_cast<unsigned long>(result);
|
||||
}
|
||||
unsigned long MaxAllocSize() const { return MemorySize(); }
|
||||
size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); }
|
||||
size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
|
||||
|
||||
// Configuration-validity checks
|
||||
bool IsLocalMemoryValid(const size_t local_mem_usage) const {
|
||||
return (local_mem_usage <= LocalMemSize());
|
||||
}
|
||||
bool IsThreadConfigValid(const std::vector<size_t> &local) const {
|
||||
auto local_size = size_t{1};
|
||||
for (const auto &item: local) { local_size *= item; }
|
||||
for (auto i=size_t{0}; i<local.size(); ++i) {
|
||||
if (local[i] > MaxWorkItemSizes()[i]) { return false; }
|
||||
}
|
||||
if (local_size > MaxWorkGroupSize()) { return false; }
|
||||
if (local.size() > MaxWorkItemDimensions()) { return false; }
|
||||
return true;
|
||||
}
|
||||
|
||||
// Query for a specific type of device or brand
|
||||
bool IsCPU() const { return false; }
|
||||
bool IsGPU() const { return true; }
|
||||
bool IsAMD() const { return false; }
|
||||
bool IsNVIDIA() const { return true; }
|
||||
bool IsIntel() const { return false; }
|
||||
bool IsARM() const { return false; }
|
||||
|
||||
// Platform specific extensions
|
||||
std::string AMDBoardName() const { return ""; }
|
||||
std::string NVIDIAComputeCapability() const { return Capabilities(); }
|
||||
|
||||
// Accessor to the private data-member
|
||||
const RawDeviceID& operator()() const { return device_; }
|
||||
private:
|
||||
CUdevice device_;
|
||||
|
||||
// Private helper function
|
||||
size_t GetInfo(const CUdevice_attribute info) const {
|
||||
auto result = 0;
|
||||
CheckError(cuDeviceGetAttribute(&result, info, device_));
|
||||
return static_cast<size_t>(result);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw context type
|
||||
using RawContext = CUcontext;
|
||||
|
||||
// C++11 version of 'CUcontext'
|
||||
class Context {
|
||||
public:
|
||||
|
||||
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
|
||||
explicit Context(const CUcontext context):
|
||||
context_(new CUcontext) {
|
||||
*context_ = context;
|
||||
}
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit Context(const Device &device):
|
||||
context_(new CUcontext, [](CUcontext* c) {
|
||||
if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
|
||||
delete c;
|
||||
}) {
|
||||
CheckError(cuCtxCreate(context_.get(), 0, device()));
|
||||
}
|
||||
|
||||
// Accessor to the private data-member
|
||||
const RawContext& operator()() const { return *context_; }
|
||||
RawContext* pointer() const { return &(*context_); }
|
||||
private:
|
||||
std::shared_ptr<CUcontext> context_;
|
||||
};
|
||||
|
||||
// Pointer to a raw CUDA context
|
||||
using ContextPointer = CUcontext*;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code.
|
||||
class Program {
|
||||
public:
|
||||
Program() = default;
|
||||
|
||||
// Note that there is no constructor based on the regular CUDA data-type because of extra state
|
||||
|
||||
// Source-based constructor with memory management
|
||||
explicit Program(const Context &, std::string source):
|
||||
program_(new nvrtcProgram, [](nvrtcProgram* p) {
|
||||
if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
|
||||
delete p;
|
||||
}),
|
||||
source_(std::move(source)),
|
||||
from_binary_(false) {
|
||||
const auto source_ptr = &source_[0];
|
||||
CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr));
|
||||
}
|
||||
|
||||
// PTX-based constructor
|
||||
explicit Program(const Device &device, const Context &context, const std::string &binary):
|
||||
program_(nullptr), // not used
|
||||
source_(binary),
|
||||
from_binary_(true) {
|
||||
}
|
||||
|
||||
// Compiles the device program and checks whether or not there are any warnings/errors
|
||||
void Build(const Device &device, std::vector<std::string> &options) {
|
||||
options.push_back("-arch=" + device.ComputeArch());
|
||||
if (from_binary_) { return; }
|
||||
auto raw_options = std::vector<const char*>();
|
||||
for (const auto &option: options) {
|
||||
raw_options.push_back(option.c_str());
|
||||
}
|
||||
auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data());
|
||||
CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
|
||||
CheckError(cuModuleLoadDataEx(&module_, GetIR().data(), 0, nullptr, nullptr));
|
||||
}
|
||||
|
||||
// Confirms whether a certain status code is an actual compilation error or warning
|
||||
bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
|
||||
return (status == NVRTC_ERROR_COMPILATION);
|
||||
}
|
||||
|
||||
// Retrieves the warning/error message from the compiler (if any)
|
||||
std::string GetBuildInfo(const Device &) const {
|
||||
if (from_binary_) { return std::string{}; }
|
||||
auto bytes = size_t{0};
|
||||
CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
|
||||
auto result = std::string{};
|
||||
result.resize(bytes);
|
||||
CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Retrieves an intermediate representation of the compiled program (i.e. PTX)
|
||||
std::string GetIR() const {
|
||||
if (from_binary_) { return source_; } // holds the PTX
|
||||
auto bytes = size_t{0};
|
||||
CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
|
||||
auto result = std::string{};
|
||||
result.resize(bytes);
|
||||
CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Accessor to the private data-members
|
||||
const CUmodule GetModule() const { return module_; }
|
||||
const nvrtcProgram& operator()() const { return *program_; }
|
||||
private:
|
||||
std::shared_ptr<nvrtcProgram> program_;
|
||||
CUmodule module_;
|
||||
std::string source_;
|
||||
bool from_binary_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Raw command-queue type
|
||||
using RawCommandQueue = CUstream;
|
||||
|
||||
// C++11 version of 'CUstream'
|
||||
class Queue {
|
||||
public:
|
||||
// Note that there is no constructor based on the regular CUDA data-type because of extra state
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit Queue(const Context &context, const Device &device):
|
||||
queue_(new CUstream, [](CUstream* s) {
|
||||
if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
|
||||
delete s;
|
||||
}),
|
||||
context_(context),
|
||||
device_(device) {
|
||||
CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
|
||||
}
|
||||
|
||||
// Synchronizes the queue and optionally also an event
|
||||
void Finish(Event &event) const {
|
||||
CheckError(cuEventSynchronize(event.end()));
|
||||
Finish();
|
||||
}
|
||||
void Finish() const {
|
||||
CheckError(cuStreamSynchronize(*queue_));
|
||||
}
|
||||
|
||||
// Retrieves the corresponding context or device
|
||||
Context GetContext() const { return context_; }
|
||||
Device GetDevice() const { return device_; }
|
||||
|
||||
// Accessor to the private data-member
|
||||
const RawCommandQueue& operator()() const { return *queue_; }
|
||||
private:
|
||||
std::shared_ptr<CUstream> queue_;
|
||||
const Context context_;
|
||||
const Device device_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// C++11 version of page-locked host memory
|
||||
template <typename T>
|
||||
class BufferHost {
|
||||
public:
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit BufferHost(const Context &, const size_t size):
|
||||
buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }),
|
||||
size_(size) {
|
||||
CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
|
||||
}
|
||||
|
||||
// Retrieves the actual allocated size in bytes
|
||||
size_t GetSize() const {
|
||||
return size_*sizeof(T);
|
||||
}
|
||||
|
||||
// Compatibility with std::vector
|
||||
size_t size() const { return size_; }
|
||||
T* begin() { return &static_cast<T*>(*buffer_)[0]; }
|
||||
T* end() { return &static_cast<T*>(*buffer_)[size_-1]; }
|
||||
T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; }
|
||||
T* data() { return static_cast<T*>(*buffer_); }
|
||||
const T* data() const { return static_cast<T*>(*buffer_); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<void*> buffer_;
|
||||
const size_t size_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Enumeration of buffer access types
|
||||
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
|
||||
|
||||
// C++11 version of 'CUdeviceptr'
|
||||
template <typename T>
|
||||
class Buffer {
|
||||
public:
|
||||
|
||||
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
|
||||
explicit Buffer(const CUdeviceptr buffer):
|
||||
buffer_(new CUdeviceptr),
|
||||
access_(BufferAccess::kNotOwned) {
|
||||
*buffer_ = buffer;
|
||||
}
|
||||
|
||||
// Regular constructor with memory management. If this class does not own the buffer object, then
|
||||
// the memory will not be freed automatically afterwards.
|
||||
explicit Buffer(const Context &, const BufferAccess access, const size_t size):
|
||||
buffer_(new CUdeviceptr, [access](CUdeviceptr* m) {
|
||||
if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); }
|
||||
delete m;
|
||||
}),
|
||||
access_(access) {
|
||||
CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
|
||||
}
|
||||
|
||||
// As above, but now with read/write access as a default
|
||||
explicit Buffer(const Context &context, const size_t size):
|
||||
Buffer<T>(context, BufferAccess::kReadWrite, size) {
|
||||
}
|
||||
|
||||
// Constructs a new buffer based on an existing host-container
|
||||
template <typename Iterator>
|
||||
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
|
||||
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
|
||||
auto size = static_cast<size_t>(end - start);
|
||||
auto pointer = &*start;
|
||||
CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
|
||||
queue.Finish();
|
||||
}
|
||||
|
||||
// Copies from device to host: reading the device buffer a-synchronously
|
||||
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||
if (access_ == BufferAccess::kWriteOnly) {
|
||||
throw LogicError("Buffer: reading from a write-only buffer");
|
||||
}
|
||||
CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
|
||||
}
|
||||
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||
const size_t offset = 0) const {
|
||||
if (host.size() < size) {
|
||||
throw LogicError("Buffer: target host buffer is too small");
|
||||
}
|
||||
ReadAsync(queue, size, host.data(), offset);
|
||||
}
|
||||
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||
const size_t offset = 0) const {
|
||||
if (host.size() < size) {
|
||||
throw LogicError("Buffer: target host buffer is too small");
|
||||
}
|
||||
ReadAsync(queue, size, host.data(), offset);
|
||||
}
|
||||
|
||||
// Copies from device to host: reading the device buffer
|
||||
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||
ReadAsync(queue, size, host, offset);
|
||||
queue.Finish();
|
||||
}
|
||||
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||
const size_t offset = 0) const {
|
||||
Read(queue, size, host.data(), offset);
|
||||
}
|
||||
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||
const size_t offset = 0) const {
|
||||
Read(queue, size, host.data(), offset);
|
||||
}
|
||||
|
||||
// Copies from host to device: writing the device buffer a-synchronously
|
||||
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||
if (access_ == BufferAccess::kReadOnly) {
|
||||
throw LogicError("Buffer: writing to a read-only buffer");
|
||||
}
|
||||
if (GetSize() < (offset+size)*sizeof(T)) {
|
||||
throw LogicError("Buffer: target device buffer is too small");
|
||||
}
|
||||
CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
|
||||
}
|
||||
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
|
||||
const size_t offset = 0) {
|
||||
WriteAsync(queue, size, host.data(), offset);
|
||||
}
|
||||
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
|
||||
const size_t offset = 0) {
|
||||
WriteAsync(queue, size, host.data(), offset);
|
||||
}
|
||||
|
||||
// Copies from host to device: writing the device buffer
|
||||
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||
WriteAsync(queue, size, host, offset);
|
||||
queue.Finish();
|
||||
}
|
||||
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
|
||||
const size_t offset = 0) {
|
||||
Write(queue, size, host.data(), offset);
|
||||
}
|
||||
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
|
||||
const size_t offset = 0) {
|
||||
Write(queue, size, host.data(), offset);
|
||||
}
|
||||
|
||||
// Copies the contents of this buffer into another device buffer
|
||||
void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
|
||||
CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
|
||||
}
|
||||
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
|
||||
CopyToAsync(queue, size, destination);
|
||||
queue.Finish();
|
||||
}
|
||||
|
||||
// Retrieves the actual allocated size in bytes
|
||||
size_t GetSize() const {
|
||||
auto result = size_t{0};
|
||||
CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Accessors to the private data-members
|
||||
CUdeviceptr operator()() const { return *buffer_; }
|
||||
CUdeviceptr& operator()() { return *buffer_; }
|
||||
private:
|
||||
std::shared_ptr<CUdeviceptr> buffer_;
|
||||
const BufferAccess access_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// C++11 version of 'CUfunction'
|
||||
class Kernel {
|
||||
public:
|
||||
|
||||
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
|
||||
explicit Kernel(const CUfunction kernel):
|
||||
name_("unknown"),
|
||||
kernel_(kernel) {
|
||||
}
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit Kernel(const Program &program, const std::string &name): name_(name) {
|
||||
CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str()));
|
||||
}
|
||||
|
||||
// Sets a kernel argument at the indicated position. This stores both the value of the argument
|
||||
// (as raw bytes) and the index indicating where this value can be found.
|
||||
template <typename T>
|
||||
void SetArgument(const size_t index, const T &value) {
|
||||
if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); }
|
||||
arguments_indices_[index] = arguments_data_.size();
|
||||
for (auto j=size_t(0); j<sizeof(T); ++j) {
|
||||
arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]);
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void SetArgument(const size_t index, Buffer<T> &value) {
|
||||
SetArgument(index, value());
|
||||
}
|
||||
|
||||
// Sets all arguments in one go using parameter packs. Note that this resets all previously set
|
||||
// arguments using 'SetArgument' or 'SetArguments'.
|
||||
template <typename... Args>
|
||||
void SetArguments(Args&... args) {
|
||||
arguments_indices_.clear();
|
||||
arguments_data_.clear();
|
||||
SetArgumentsRecursive(0, args...);
|
||||
}
|
||||
|
||||
// Retrieves the amount of local memory used per work-group for this kernel. Note that this the
|
||||
// shared memory in CUDA terminology.
|
||||
unsigned long LocalMemUsage(const Device &) const {
|
||||
auto result = 0;
|
||||
CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
|
||||
return static_cast<unsigned long>(result);
|
||||
}
|
||||
|
||||
// Retrieves the name of the kernel
|
||||
std::string GetFunctionName() const {
|
||||
return name_;
|
||||
}
|
||||
|
||||
// Launches a kernel onto the specified queue
|
||||
void Launch(const Queue &queue, const std::vector<size_t> &global,
|
||||
const std::vector<size_t> &local, EventPointer event) {
|
||||
// TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call
|
||||
if (local.size() == 0) {
|
||||
throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
|
||||
}
|
||||
|
||||
// Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
|
||||
auto grid = std::vector<size_t>{1, 1, 1};
|
||||
auto block = std::vector<size_t>{1, 1, 1};
|
||||
if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
|
||||
for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; }
|
||||
for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; }
|
||||
|
||||
// Creates the array of pointers from the arrays of indices & data
|
||||
std::vector<void*> pointers;
|
||||
for (auto &index: arguments_indices_) {
|
||||
pointers.push_back(&arguments_data_[index]);
|
||||
}
|
||||
|
||||
// Launches the kernel, its execution time is recorded by events
|
||||
if (event) { CheckError(cuEventRecord(event->start(), queue())); }
|
||||
CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
|
||||
0, queue(), pointers.data(), nullptr));
|
||||
cuStreamSynchronize(queue());
|
||||
if (event) { CheckError(cuEventRecord(event->end(), queue())); }
|
||||
}
|
||||
|
||||
// As above, but with an event waiting list
|
||||
void Launch(const Queue &queue, const std::vector<size_t> &global,
|
||||
const std::vector<size_t> &local, EventPointer event,
|
||||
const std::vector<Event>& waitForEvents) {
|
||||
for (auto &waitEvent : waitForEvents) {
|
||||
waitEvent.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous
|
||||
}
|
||||
return Launch(queue, global, local, event);
|
||||
}
|
||||
|
||||
// Accessors to the private data-members
|
||||
const CUfunction& operator()() const { return kernel_; }
|
||||
CUfunction operator()() { return kernel_; }
|
||||
private:
|
||||
const std::string name_;
|
||||
CUfunction kernel_;
|
||||
std::vector<size_t> arguments_indices_; // Indices of the arguments
|
||||
std::vector<char> arguments_data_; // The arguments data as raw bytes
|
||||
|
||||
// Internal implementation for the recursive SetArguments function.
|
||||
template <typename T>
|
||||
void SetArgumentsRecursive(const size_t index, T &first) {
|
||||
SetArgument(index, first);
|
||||
}
|
||||
template <typename T, typename... Args>
|
||||
void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
|
||||
SetArgument(index, first);
|
||||
SetArgumentsRecursive(index+1, args...);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_CUPP11_H_
|
||||
#endif
|
|
@ -15,6 +15,7 @@
|
|||
#ifndef CLBLAST_CXPP11_COMMON_H_
|
||||
#define CLBLAST_CXPP11_COMMON_H_
|
||||
|
||||
#include <cstring> // strchr
|
||||
#include <string> // std::string
|
||||
#include <stdexcept> // std::runtime_error
|
||||
|
||||
|
|
|
@ -124,6 +124,15 @@ std::string Database::GetDefines() const {
|
|||
return defines;
|
||||
}
|
||||
|
||||
// ... or just the values as string
|
||||
std::string Database::GetValuesString() const {
|
||||
std::string defines{};
|
||||
for (auto ¶meter: *parameters_) {
|
||||
defines += "_"+ToString(parameter.second);
|
||||
}
|
||||
return defines;
|
||||
}
|
||||
|
||||
// Retrieves the names of all the parameters
|
||||
std::vector<std::string> Database::GetParameterNames() const {
|
||||
auto parameter_names = std::vector<std::string>();
|
||||
|
|
|
@ -53,7 +53,8 @@ class Database {
|
|||
// Obtain a list of OpenCL pre-processor defines based on the parameters
|
||||
std::string GetDefines() const;
|
||||
|
||||
// Retrieves the names of all the parameters
|
||||
// Retrieves the values or names of all the parameters
|
||||
std::string GetValuesString() const;
|
||||
std::vector<std::string> GetParameterNames() const;
|
||||
|
||||
private:
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
#include <string>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
|
||||
namespace clblast {
|
||||
// A special namespace to hold all the global constant variables (including the database entries)
|
||||
|
@ -29,8 +29,8 @@ namespace database {
|
|||
using Name = std::array<char, 51>; // name as stored in database (50 chars + string terminator)
|
||||
using Params = std::array<size_t, 14>; // parameters as stored in database
|
||||
|
||||
// Type alias after extracting from the database (map for improved code readability)
|
||||
using Parameters = std::unordered_map<std::string, size_t>; // parameters after reading from DB
|
||||
// Type alias after extracting from the database (sorted map for improved code readability)
|
||||
using Parameters = std::map<std::string, size_t>; // parameters after reading from DB
|
||||
|
||||
// The OpenCL device types
|
||||
const std::string kDeviceTypeCPU = "CPU";
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry CopySingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry CopySingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -133,6 +135,7 @@ const DatabaseEntry CopySingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
@ -152,7 +155,7 @@ const DatabaseEntry CopySingle = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry CopyComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 16, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -96,7 +97,8 @@ const DatabaseEntry CopyComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -122,11 +124,12 @@ const DatabaseEntry CopyComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -141,7 +144,7 @@ const DatabaseEntry CopyComplexSingle = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,13 +60,14 @@ const DatabaseEntry CopyDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 16, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry CopyDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -110,18 +112,19 @@ const DatabaseEntry CopyDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry CopyComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry CopyComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -110,6 +112,7 @@ const DatabaseEntry CopyComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry PadSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry PadSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -133,6 +135,7 @@ const DatabaseEntry PadSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry PadComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry PadComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -132,11 +134,12 @@ const DatabaseEntry PadComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry PadDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry PadDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -110,11 +112,12 @@ const DatabaseEntry PadDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry PadComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,6 +83,7 @@ const DatabaseEntry PadComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -110,8 +112,9 @@ const DatabaseEntry PadComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry PadtransposeSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,6 +105,7 @@ const DatabaseEntry PadtransposeSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -132,11 +134,12 @@ const DatabaseEntry PadtransposeSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry PadtransposeComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,6 +105,7 @@ const DatabaseEntry PadtransposeComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -132,6 +134,7 @@ const DatabaseEntry PadtransposeComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,13 +60,14 @@ const DatabaseEntry PadtransposeDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -82,6 +83,7 @@ const DatabaseEntry PadtransposeDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -110,6 +112,7 @@ const DatabaseEntry PadtransposeDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry PadtransposeComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,6 +83,7 @@ const DatabaseEntry PadtransposeComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -110,8 +112,9 @@ const DatabaseEntry PadtransposeComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry TransposeSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry TransposeSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 32, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -133,8 +135,9 @@ const DatabaseEntry TransposeSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -68,13 +68,14 @@ const DatabaseEntry TransposeComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -96,6 +97,7 @@ const DatabaseEntry TransposeComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -124,8 +126,9 @@ const DatabaseEntry TransposeComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry TransposeDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,6 +83,7 @@ const DatabaseEntry TransposeDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -110,6 +112,7 @@ const DatabaseEntry TransposeDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,13 +60,14 @@ const DatabaseEntry TransposeComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -74,7 +75,8 @@ const DatabaseEntry TransposeComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -102,8 +104,9 @@ const DatabaseEntry TransposeComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry XaxpySingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry XaxpySingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -133,11 +135,12 @@ const DatabaseEntry XaxpySingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -152,7 +155,7 @@ const DatabaseEntry XaxpySingle = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry XaxpyComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry XaxpyComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -132,6 +134,7 @@ const DatabaseEntry XaxpyComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry XaxpyDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry XaxpyDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -110,6 +112,7 @@ const DatabaseEntry XaxpyDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry XaxpyComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -82,6 +83,7 @@ const DatabaseEntry XaxpyComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -110,8 +112,9 @@ const DatabaseEntry XaxpyComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -56,6 +56,7 @@ const DatabaseEntry XdotSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -81,7 +82,8 @@ const DatabaseEntry XdotSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -108,6 +110,7 @@ const DatabaseEntry XdotSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 128, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -56,6 +56,7 @@ const DatabaseEntry XdotComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -81,6 +82,7 @@ const DatabaseEntry XdotComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -107,11 +109,12 @@ const DatabaseEntry XdotComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1024, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -48,6 +48,7 @@ const DatabaseEntry XdotDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -60,7 +61,8 @@ const DatabaseEntry XdotDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 256, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -86,8 +88,9 @@ const DatabaseEntry XdotDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -97,7 +100,7 @@ const DatabaseEntry XdotDouble = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -48,11 +48,12 @@ const DatabaseEntry XdotComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -60,6 +61,7 @@ const DatabaseEntry XdotComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -86,11 +88,12 @@ const DatabaseEntry XdotComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -68,13 +68,14 @@ const DatabaseEntry XgemmSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 2, 8, 8, 128, 16, 8, 128, 0, 1, 1, 1, 1, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 2, 32, 16, 64, 32, 8, 64, 0, 1, 1, 0, 1, 1 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 32, 32, 64, 32, 16, 64, 1, 1, 1, 0, 2, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 2, 16, 8, 128, 16, 8, 64, 0, 0, 1, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 2, 32, 8, 128, 8, 8, 128, 1, 1, 1, 1, 2, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 16, 2, 8, 8, 128, 8, 8, 128, 1, 1, 1, 0, 1, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 16, 16, 64, 32, 32, 64, 0, 1, 1, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 16, 32, 32, 8, 8, 64, 0, 1, 0, 0, 1, 8 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry XgemmSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 16, 8, 64, 32, 16, 64, 1, 1, 1, 1, 2, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 16, 8, 64, 32, 16, 64, 1, 1, 1, 1, 2, 2 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 2, 32, 8, 128, 16, 32, 64, 1, 1, 1, 0, 4, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 2, 16, 8, 32, 8, 16, 64, 1, 1, 1, 1, 2, 4 } },
|
||||
|
@ -133,8 +135,9 @@ const DatabaseEntry XgemmSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 32, 16, 128, 32, 8, 128, 1, 1, 1, 0, 4, 1 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 2, 16, 8, 64, 8, 8, 64, 1, 1, 1, 1, 4, 8 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 2, 32, 16, 64, 16, 8, 128, 1, 1, 0, 1, 2, 8 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 2 } },
|
||||
|
|
|
@ -68,13 +68,14 @@ const DatabaseEntry XgemmComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 2, 32, 8, 128, 16, 16, 128, 1, 1, 0, 1, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 2, 32, 32, 32, 16, 16, 128, 1, 0, 0, 0, 1, 1 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 2, 32, 16, 32, 16, 16, 64, 0, 1, 1, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 2, 16, 16, 64, 8, 16, 64, 0, 1, 0, 0, 4, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 2, 8, 8, 128, 16, 32, 128, 0, 0, 0, 0, 1, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 2, 8, 8, 128, 32, 8, 128, 0, 0, 0, 0, 1, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 2, 8, 16, 16, 16, 16, 128, 0, 0, 1, 1, 1, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 16, 16, 16, 8, 16, 128, 0, 1, 0, 0, 1, 8 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 4, 2 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -104,7 +105,8 @@ const DatabaseEntry XgemmComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 16, 16, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 16, 16, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 16, 8, 32, 32, 32, 128, 1, 0, 1, 0, 1, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 16, 16, 32, 32, 16, 128, 0, 0, 1, 0, 1, 1 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 8, 32, 32, 64, 32, 16, 128, 1, 0, 1, 0, 1, 4 } },
|
||||
|
@ -132,8 +134,9 @@ const DatabaseEntry XgemmComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 16, 16, 128, 16, 16, 64, 1, 1, 1, 1, 2, 4 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 16, 2, 32, 16, 64, 32, 8, 64, 1, 1, 0, 0, 1, 2 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 2, 8, 16, 32, 16, 8, 64, 1, 1, 0, 0, 1, 1 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 64, 1, 1, 0, 0, 2, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 32, 32, 64, 1, 1, 0, 0, 1, 1 } },
|
||||
|
|
|
@ -60,13 +60,14 @@ const DatabaseEntry XgemmDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 2, 32, 8, 128, 16, 16, 128, 1, 1, 1, 1, 2, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 2, 16, 8, 128, 16, 8, 128, 1, 0, 1, 1, 1, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 2, 32, 16, 128, 16, 16, 64, 0, 1, 1, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 2, 32, 16, 128, 16, 16, 128, 0, 0, 1, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 2, 16, 8, 128, 8, 8, 64, 1, 0, 0, 1, 2, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 2, 16, 8, 128, 8, 8, 128, 1, 0, 0, 0, 2, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 2, 8, 16, 128, 16, 8, 128, 0, 0, 1, 1, 1, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 8, 16, 64, 16, 8, 64, 0, 1, 1, 0, 1, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 1, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 16, 16, 64, 1, 1, 0, 0, 1, 4 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry XgemmDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 8, 16, 32, 32, 8, 64, 1, 1, 1, 0, 1, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 8, 16, 32, 32, 8, 64, 1, 1, 1, 0, 1, 2 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 32, 16, 64, 8, 8, 32, 0, 1, 1, 1, 1, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 32, 16, 32, 32, 8, 32, 0, 1, 1, 0, 1, 2 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 16, 2, 8, 8, 16, 8, 8, 32, 1, 0, 0, 1, 2, 2 } },
|
||||
|
@ -110,18 +112,19 @@ const DatabaseEntry XgemmDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 8, 16, 32, 8, 8, 64, 0, 0, 1, 1, 2, 8 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 2, 16, 16, 16, 16, 16, 64, 0, 0, 1, 0, 1, 4 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 32, 32, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,13 +60,14 @@ const DatabaseEntry XgemmComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 2, 32, 8, 64, 16, 8, 128, 0, 1, 0, 1, 2, 1 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 2, 8, 8, 32, 16, 32, 128, 1, 0, 1, 0, 4, 1 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 2, 16, 32, 128, 16, 16, 64, 0, 1, 0, 0, 2, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 2, 16, 32, 128, 16, 8, 32, 0, 1, 0, 0, 4, 1 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 2, 8, 8, 128, 8, 16, 128, 0, 0, 0, 1, 1, 8 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 2, 8, 8, 128, 32, 8, 128, 0, 0, 0, 0, 1, 4 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 8, 32, 32, 8, 8, 32, 0, 1, 0, 0, 1, 2 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 32, 8, 128, 16, 8, 128, 0, 0, 1, 1, 1, 4 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 8, 8, 32, 1, 1, 0, 0, 1, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 2 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -82,7 +83,8 @@ const DatabaseEntry XgemmComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 1, 0, 1, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 1, 0, 1, 1 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 32, 32, 32, 8, 8, 64, 0, 0, 0, 0, 1, 2 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 0, 0, 1, 1 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 32, 8, 16, 16, 16, 8, 16, 64, 1, 0, 1, 1, 1, 1 } },
|
||||
|
@ -109,18 +111,19 @@ const DatabaseEntry XgemmComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 32, 8, 32, 16, 32, 8, 8, 32, 0, 0, 0, 1, 1, 4 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 2, 16, 16, 16, 8, 8, 16, 0, 0, 0, 0, 1, 2 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 32, 0, 0, 0, 0, 1, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 1 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 2 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -44,10 +44,11 @@ const DatabaseEntry XgemmDirectSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 1, 8, 64, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 16, 16, 16, 0, 0, 1, 1, 64, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 16, 16, 8, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 2, 2, 64, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 16, 8, 0, 0, 4, 4, 64, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -62,6 +63,10 @@ const DatabaseEntry XgemmDirectSingle = {
|
|||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 16, 8, 32, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 16, 8, 32, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GeForce GT 650M "}, Params{ 16, 16, 16, 8, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 16, 16, 8, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
|
@ -76,8 +81,9 @@ const DatabaseEntry XgemmDirectSingle = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 16, 16, 8, 16, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 8, 8, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 8, 8, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
|
||||
|
@ -95,7 +101,7 @@ const DatabaseEntry XgemmDirectSingle = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -40,6 +40,7 @@ const DatabaseEntry XgemmDirectComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 16, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
|
||||
|
@ -58,6 +59,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
|
|||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.5", {
|
||||
{ Name{"GeForce GTX TITAN Black "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
|
@ -68,8 +73,9 @@ const DatabaseEntry XgemmDirectComplexSingle = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 8, 16, 16, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -36,15 +36,20 @@ const DatabaseEntry XgemmDirectDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 8, 8, 8, 8, 0, 1, 1, 1, 8, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 16, 16, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 16, 16, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.5", {
|
||||
{ Name{"GeForce GTX TITAN Black "}, Params{ 8, 16, 16, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 16, 16, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
|
@ -55,8 +60,9 @@ const DatabaseEntry XgemmDirectDouble = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -36,15 +36,20 @@ const DatabaseEntry XgemmDirectComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 16, 16, 8, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 8, 16, 8, 8, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 32, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.5", {
|
||||
{ Name{"GeForce GTX TITAN Black "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
|
||||
|
@ -55,6 +60,7 @@ const DatabaseEntry XgemmDirectComplexDouble = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry XgemvSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -95,7 +96,8 @@ const DatabaseEntry XgemvSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -124,6 +126,7 @@ const DatabaseEntry XgemvSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry XgemvComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -95,6 +96,7 @@ const DatabaseEntry XgemvComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -116,6 +118,7 @@ const DatabaseEntry XgemvComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -52,6 +52,7 @@ const DatabaseEntry XgemvDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgemvDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -101,6 +103,7 @@ const DatabaseEntry XgemvDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
@ -112,7 +115,7 @@ const DatabaseEntry XgemvDouble = {
|
|||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -52,6 +52,7 @@ const DatabaseEntry XgemvComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgemvComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -80,6 +82,10 @@ const DatabaseEntry XgemvComplexDouble = {
|
|||
{ Name{"GeForce GTX 670 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,12 +60,13 @@ const DatabaseEntry XgemvFastSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 1, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -95,7 +96,8 @@ const DatabaseEntry XgemvFastSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -124,6 +126,7 @@ const DatabaseEntry XgemvFastSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -60,12 +60,13 @@ const DatabaseEntry XgemvFastComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -95,6 +96,7 @@ const DatabaseEntry XgemvFastComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -105,6 +107,7 @@ const DatabaseEntry XgemvFastComplexSingle = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
|
|
|
@ -52,12 +52,13 @@ const DatabaseEntry XgemvFastDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgemvFastDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -101,6 +103,7 @@ const DatabaseEntry XgemvFastDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -52,6 +52,7 @@ const DatabaseEntry XgemvFastComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgemvFastComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -80,6 +82,10 @@ const DatabaseEntry XgemvFastComplexDouble = {
|
|||
{ Name{"GeForce GTX 670 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -44,11 +44,12 @@ const DatabaseEntry XgemvFastRotSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -66,6 +67,10 @@ const DatabaseEntry XgemvFastRotSingle = {
|
|||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GeForce GT 650M "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -81,6 +86,7 @@ const DatabaseEntry XgemvFastRotSingle = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -44,11 +44,12 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -64,6 +65,21 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
|
|||
} },
|
||||
}
|
||||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
|
|
|
@ -36,6 +36,7 @@ const DatabaseEntry XgemvFastRotDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -46,6 +47,10 @@ const DatabaseEntry XgemvFastRotDouble = {
|
|||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.5", {
|
||||
{ Name{"GeForce GTX TITAN "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX TITAN Black "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -57,6 +62,7 @@ const DatabaseEntry XgemvFastRotDouble = {
|
|||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
|
|
|
@ -36,6 +36,7 @@ const DatabaseEntry XgemvFastRotComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -44,6 +45,21 @@ const DatabaseEntry XgemvFastRotComplexDouble = {
|
|||
} },
|
||||
}
|
||||
},
|
||||
{ // NVIDIA GPUs
|
||||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
|
|
|
@ -68,6 +68,7 @@ const DatabaseEntry XgerSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -94,6 +95,7 @@ const DatabaseEntry XgerSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -116,11 +118,12 @@ const DatabaseEntry XgerSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 64, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -68,12 +68,13 @@ const DatabaseEntry XgerComplexSingle = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 512, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 512, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -94,7 +95,8 @@ const DatabaseEntry XgerComplexSingle = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
{ Name{"GRID K520 "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -115,8 +117,9 @@ const DatabaseEntry XgerComplexSingle = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 16, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 128, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
|
|
@ -60,6 +60,7 @@ const DatabaseEntry XgerDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgerDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -94,18 +96,19 @@ const DatabaseEntry XgerDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -60,12 +60,13 @@ const DatabaseEntry XgerComplexDouble = {
|
|||
kDeviceTypeCPU, "Intel", {
|
||||
{ "default", {
|
||||
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 512, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
@ -73,6 +74,7 @@ const DatabaseEntry XgerComplexDouble = {
|
|||
kDeviceTypeGPU, "NVIDIA", {
|
||||
{ "SM2.0", {
|
||||
{ Name{"GeForce GTX 480 "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 580 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "SM3.0", {
|
||||
|
@ -94,11 +96,12 @@ const DatabaseEntry XgerComplexDouble = {
|
|||
{ "SM6.1", {
|
||||
{ Name{"GeForce GTX 1070 "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"GeForce GTX 1080 Ti "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ Name{"TITAN X (Pascal) "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
{ "default", {
|
||||
{ kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
{ kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
|
||||
} },
|
||||
}
|
||||
},
|
||||
|
|
|
@ -24,14 +24,16 @@ R"(
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Enable support for double-precision
|
||||
#if PRECISION == 16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16: enable
|
||||
#endif
|
||||
#ifndef CUDA
|
||||
// Enable support for double-precision
|
||||
#if PRECISION == 16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16: enable
|
||||
#endif
|
||||
|
||||
// Enable support for double-precision
|
||||
#if PRECISION == 64 || PRECISION == 6464
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
// Enable support for double-precision
|
||||
#if PRECISION == 64 || PRECISION == 6464
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Half-precision
|
||||
|
@ -117,10 +119,15 @@ R"(
|
|||
#define GetRealArg(x) x
|
||||
#endif
|
||||
|
||||
// Pointers to local memory objects (using a define because CUDA doesn't need them)
|
||||
#ifndef LOCAL_PTR
|
||||
#define LOCAL_PTR __local
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
|
||||
// devices, this is enabled (see src/routine.cc).
|
||||
// devices, this is enabled (see src/routine.cpp).
|
||||
#ifndef USE_CL_MAD
|
||||
#define USE_CL_MAD 0
|
||||
#endif
|
||||
|
@ -254,18 +261,18 @@ R"(
|
|||
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
|
||||
// More details: https://github.com/CNugteren/CLBlast/issues/53
|
||||
#if USE_STAGGERED_INDICES == 1
|
||||
INLINE_FUNC size_t GetGroupIDFlat() {
|
||||
INLINE_FUNC int GetGroupIDFlat() {
|
||||
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
|
||||
}
|
||||
INLINE_FUNC size_t GetGroupID1() {
|
||||
INLINE_FUNC int GetGroupID1() {
|
||||
return (GetGroupIDFlat()) % get_num_groups(1);
|
||||
}
|
||||
INLINE_FUNC size_t GetGroupID0() {
|
||||
INLINE_FUNC int GetGroupID0() {
|
||||
return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
|
||||
}
|
||||
#else
|
||||
INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); }
|
||||
INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); }
|
||||
INLINE_FUNC int GetGroupID1() { return get_group_id(1); }
|
||||
INLINE_FUNC int GetGroupID0() { return get_group_id(0); }
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -34,7 +34,7 @@ R"(
|
|||
|
||||
// Returns an element from a vector
|
||||
INLINE_FUNC real LoadVector(const int id, const int max,
|
||||
__global real* gm, const int offset, const int inc,
|
||||
const __global real* gm, const int offset, const int inc,
|
||||
const int do_conjugate) {
|
||||
if (id < max) {
|
||||
real result = gm[id*inc + offset];
|
||||
|
|
|
@ -164,7 +164,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
|
|||
// =================================================================================================
|
||||
|
||||
// Triple matrix-multiplication kernel: C = A * B
|
||||
INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
|
||||
INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, LOCAL_PTR real* blm, int n,
|
||||
__global const real* agm, __global const real* bgm, __global real* cgm,
|
||||
const int lda, const int ldb, const int ldc,
|
||||
int current_size, int num_pages, const int block_size) {
|
||||
|
@ -250,7 +250,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
|
|||
// =================================================================================================
|
||||
|
||||
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
|
||||
INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
|
||||
INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR real* blm, int n,
|
||||
__global const real* src, const int a_offset, const int lda,
|
||||
__global real* dest, int current_size, int num_pages, const int block_size) {
|
||||
|
||||
|
@ -286,7 +286,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local rea
|
|||
}
|
||||
|
||||
// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
|
||||
INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
|
||||
INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n,
|
||||
__global real* dest, int current_size, int num_pages, const int block_size) {
|
||||
|
||||
// Emulates a 3D grid: NX * (NY * num_pages)
|
||||
|
|
|
@ -84,39 +84,39 @@ void TransposeMatrixFast(const int ld,
|
|||
#if TRA_WPT == 1
|
||||
results[0] = v[0];
|
||||
#elif TRA_WPT == 2
|
||||
results[0] = (realT) {v[0].x, v[1].x};
|
||||
results[1] = (realT) {v[0].y, v[1].y};
|
||||
results[0].x = v[0].x; results[0].y = v[1].x;
|
||||
results[1].x = v[0].y; results[1].y = v[1].y;
|
||||
#elif TRA_WPT == 4
|
||||
results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x};
|
||||
results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y};
|
||||
results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z};
|
||||
results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w};
|
||||
results[0].x = v[0].x; results[0].y = v[1].x; results[0].z = v[2].x; results[0].w = v[3].x;
|
||||
results[1].x = v[0].y; results[1].y = v[1].y; results[1].z = v[2].y; results[1].w = v[3].y;
|
||||
results[2].x = v[0].z; results[2].y = v[1].z; results[2].z = v[2].z; results[2].w = v[3].z;
|
||||
results[3].x = v[0].w; results[3].y = v[1].w; results[3].z = v[2].w; results[3].w = v[3].w;
|
||||
#elif TRA_WPT == 8
|
||||
results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0};
|
||||
results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1};
|
||||
results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2};
|
||||
results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3};
|
||||
results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4};
|
||||
results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5};
|
||||
results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6};
|
||||
results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7};
|
||||
results[0].s0 = v[0].s0; results[0].s1 = v[1].s0; results[0].s2 = v[2].s0; results[0].s3 = v[3].s0; results[0].s4 = v[4].s0; results[0].s5 = v[5].s0; results[0].s6 = v[6].s0; results[0].s7 = v[7].s0;
|
||||
results[1].s0 = v[0].s1; results[1].s1 = v[1].s1; results[1].s2 = v[2].s1; results[1].s3 = v[3].s1; results[1].s4 = v[4].s1; results[1].s5 = v[5].s1; results[1].s6 = v[6].s1; results[1].s7 = v[7].s1;
|
||||
results[2].s0 = v[0].s2; results[2].s1 = v[1].s2; results[2].s2 = v[2].s2; results[2].s3 = v[3].s2; results[2].s4 = v[4].s2; results[2].s5 = v[5].s2; results[2].s6 = v[6].s2; results[2].s7 = v[7].s2;
|
||||
results[3].s0 = v[0].s3; results[3].s1 = v[1].s3; results[3].s2 = v[2].s3; results[3].s3 = v[3].s3; results[3].s4 = v[4].s3; results[3].s5 = v[5].s3; results[3].s6 = v[6].s3; results[3].s7 = v[7].s3;
|
||||
results[4].s0 = v[0].s4; results[4].s1 = v[1].s4; results[4].s2 = v[2].s4; results[4].s3 = v[3].s4; results[4].s4 = v[4].s4; results[4].s5 = v[5].s4; results[4].s6 = v[6].s4; results[4].s7 = v[7].s4;
|
||||
results[5].s0 = v[0].s5; results[5].s1 = v[1].s5; results[5].s2 = v[2].s5; results[5].s3 = v[3].s5; results[5].s4 = v[4].s5; results[5].s5 = v[5].s5; results[5].s6 = v[6].s5; results[5].s7 = v[7].s5;
|
||||
results[6].s0 = v[0].s6; results[6].s1 = v[1].s6; results[6].s2 = v[2].s6; results[6].s3 = v[3].s6; results[6].s4 = v[4].s6; results[6].s5 = v[5].s6; results[6].s6 = v[6].s6; results[6].s7 = v[7].s6;
|
||||
results[7].s0 = v[0].s7; results[7].s1 = v[1].s7; results[7].s2 = v[2].s7; results[7].s3 = v[3].s7; results[7].s4 = v[4].s7; results[7].s5 = v[5].s7; results[7].s6 = v[6].s7; results[7].s7 = v[7].s7;
|
||||
#elif TRA_WPT == 16
|
||||
results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0};
|
||||
results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1};
|
||||
results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2};
|
||||
results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3};
|
||||
results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4};
|
||||
results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5};
|
||||
results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6};
|
||||
results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7};
|
||||
results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8};
|
||||
results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9};
|
||||
results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA};
|
||||
results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB};
|
||||
results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC};
|
||||
results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD};
|
||||
results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE};
|
||||
results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
|
||||
results[ 0].s0 = v[0].s0; results[ 0].s1 = v[1].s0; results[ 0].s2 = v[2].s0; results[ 0].s3 = v[3].s0; results[ 0].s4 = v[4].s0; results[ 0].s5 = v[5].s0; results[ 0].s6 = v[6].s0; results[ 0].s7 = v[7].s0; results[ 0].s8 = v[8].s0; results[ 0].s9 = v[9].s0; results[ 0].sA = v[10].s0; results[ 0].sB = v[11].s0; results[ 0].sC = v[12].s0; results[ 0].sD = v[13].s0; results[ 0].sE = v[14].s0; results[ 0].sF = v[15].s0;
|
||||
results[ 1].s0 = v[0].s1; results[ 1].s1 = v[1].s1; results[ 1].s2 = v[2].s1; results[ 1].s3 = v[3].s1; results[ 1].s4 = v[4].s1; results[ 1].s5 = v[5].s1; results[ 1].s6 = v[6].s1; results[ 1].s7 = v[7].s1; results[ 1].s8 = v[8].s1; results[ 1].s9 = v[9].s1; results[ 1].sA = v[10].s1; results[ 1].sB = v[11].s1; results[ 1].sC = v[12].s1; results[ 1].sD = v[13].s1; results[ 1].sE = v[14].s1; results[ 1].sF = v[15].s1;
|
||||
results[ 2].s0 = v[0].s2; results[ 2].s1 = v[1].s2; results[ 2].s2 = v[2].s2; results[ 2].s3 = v[3].s2; results[ 2].s4 = v[4].s2; results[ 2].s5 = v[5].s2; results[ 2].s6 = v[6].s2; results[ 2].s7 = v[7].s2; results[ 2].s8 = v[8].s2; results[ 2].s9 = v[9].s2; results[ 2].sA = v[10].s2; results[ 2].sB = v[11].s2; results[ 2].sC = v[12].s2; results[ 2].sD = v[13].s2; results[ 2].sE = v[14].s2; results[ 2].sF = v[15].s2;
|
||||
results[ 3].s0 = v[0].s3; results[ 3].s1 = v[1].s3; results[ 3].s2 = v[2].s3; results[ 3].s3 = v[3].s3; results[ 3].s4 = v[4].s3; results[ 3].s5 = v[5].s3; results[ 3].s6 = v[6].s3; results[ 3].s7 = v[7].s3; results[ 3].s8 = v[8].s3; results[ 3].s9 = v[9].s3; results[ 3].sA = v[10].s3; results[ 3].sB = v[11].s3; results[ 3].sC = v[12].s3; results[ 3].sD = v[13].s3; results[ 3].sE = v[14].s3; results[ 3].sF = v[15].s3;
|
||||
results[ 4].s0 = v[0].s4; results[ 4].s1 = v[1].s4; results[ 4].s2 = v[2].s4; results[ 4].s3 = v[3].s4; results[ 4].s4 = v[4].s4; results[ 4].s5 = v[5].s4; results[ 4].s6 = v[6].s4; results[ 4].s7 = v[7].s4; results[ 4].s8 = v[8].s4; results[ 4].s9 = v[9].s4; results[ 4].sA = v[10].s4; results[ 4].sB = v[11].s4; results[ 4].sC = v[12].s4; results[ 4].sD = v[13].s4; results[ 4].sE = v[14].s4; results[ 4].sF = v[15].s4;
|
||||
results[ 5].s0 = v[0].s5; results[ 5].s1 = v[1].s5; results[ 5].s2 = v[2].s5; results[ 5].s3 = v[3].s5; results[ 5].s4 = v[4].s5; results[ 5].s5 = v[5].s5; results[ 5].s6 = v[6].s5; results[ 5].s7 = v[7].s5; results[ 5].s8 = v[8].s5; results[ 5].s9 = v[9].s5; results[ 5].sA = v[10].s5; results[ 5].sB = v[11].s5; results[ 5].sC = v[12].s5; results[ 5].sD = v[13].s5; results[ 5].sE = v[14].s5; results[ 5].sF = v[15].s5;
|
||||
results[ 6].s0 = v[0].s6; results[ 6].s1 = v[1].s6; results[ 6].s2 = v[2].s6; results[ 6].s3 = v[3].s6; results[ 6].s4 = v[4].s6; results[ 6].s5 = v[5].s6; results[ 6].s6 = v[6].s6; results[ 6].s7 = v[7].s6; results[ 6].s8 = v[8].s6; results[ 6].s9 = v[9].s6; results[ 6].sA = v[10].s6; results[ 6].sB = v[11].s6; results[ 6].sC = v[12].s6; results[ 6].sD = v[13].s6; results[ 6].sE = v[14].s6; results[ 6].sF = v[15].s6;
|
||||
results[ 7].s0 = v[0].s7; results[ 7].s1 = v[1].s7; results[ 7].s2 = v[2].s7; results[ 7].s3 = v[3].s7; results[ 7].s4 = v[4].s7; results[ 7].s5 = v[5].s7; results[ 7].s6 = v[6].s7; results[ 7].s7 = v[7].s7; results[ 7].s8 = v[8].s7; results[ 7].s9 = v[9].s7; results[ 7].sA = v[10].s7; results[ 7].sB = v[11].s7; results[ 7].sC = v[12].s7; results[ 7].sD = v[13].s7; results[ 7].sE = v[14].s7; results[ 7].sF = v[15].s7;
|
||||
results[ 8].s0 = v[0].s8; results[ 8].s1 = v[1].s8; results[ 8].s2 = v[2].s8; results[ 8].s3 = v[3].s8; results[ 8].s4 = v[4].s8; results[ 8].s5 = v[5].s8; results[ 8].s6 = v[6].s8; results[ 8].s7 = v[7].s8; results[ 8].s8 = v[8].s8; results[ 8].s9 = v[9].s8; results[ 8].sA = v[10].s8; results[ 8].sB = v[11].s8; results[ 8].sC = v[12].s8; results[ 8].sD = v[13].s8; results[ 8].sE = v[14].s8; results[ 8].sF = v[15].s8;
|
||||
results[ 9].s0 = v[0].s9; results[ 9].s1 = v[1].s9; results[ 9].s2 = v[2].s9; results[ 9].s3 = v[3].s9; results[ 9].s4 = v[4].s9; results[ 9].s5 = v[5].s9; results[ 9].s6 = v[6].s9; results[ 9].s7 = v[7].s9; results[ 9].s8 = v[8].s9; results[ 9].s9 = v[9].s9; results[ 9].sA = v[10].s9; results[ 9].sB = v[11].s9; results[ 9].sC = v[12].s9; results[ 9].sD = v[13].s9; results[ 9].sE = v[14].s9; results[ 9].sF = v[15].s9;
|
||||
results[10].s0 = v[0].sA; results[10].s1 = v[1].sA; results[10].s2 = v[2].sA; results[10].s3 = v[3].sA; results[10].s4 = v[4].sA; results[10].s5 = v[5].sA; results[10].s6 = v[6].sA; results[10].s7 = v[7].sA; results[10].s8 = v[8].sA; results[10].s9 = v[9].sA; results[10].sA = v[10].sA; results[10].sB = v[11].sA; results[10].sC = v[12].sA; results[10].sD = v[13].sA; results[10].sE = v[14].sA; results[10].sF = v[15].sA;
|
||||
results[11].s0 = v[0].sB; results[11].s1 = v[1].sB; results[11].s2 = v[2].sB; results[11].s3 = v[3].sB; results[11].s4 = v[4].sB; results[11].s5 = v[5].sB; results[11].s6 = v[6].sB; results[11].s7 = v[7].sB; results[11].s8 = v[8].sB; results[11].s9 = v[9].sB; results[11].sA = v[10].sB; results[11].sB = v[11].sB; results[11].sC = v[12].sB; results[11].sD = v[13].sB; results[11].sE = v[14].sB; results[11].sF = v[15].sB;
|
||||
results[12].s0 = v[0].sC; results[12].s1 = v[1].sC; results[12].s2 = v[2].sC; results[12].s3 = v[3].sC; results[12].s4 = v[4].sC; results[12].s5 = v[5].sC; results[12].s6 = v[6].sC; results[12].s7 = v[7].sC; results[12].s8 = v[8].sC; results[12].s9 = v[9].sC; results[12].sA = v[10].sC; results[12].sB = v[11].sC; results[12].sC = v[12].sC; results[12].sD = v[13].sC; results[12].sE = v[14].sC; results[12].sF = v[15].sC;
|
||||
results[13].s0 = v[0].sD; results[13].s1 = v[1].sD; results[13].s2 = v[2].sD; results[13].s3 = v[3].sD; results[13].s4 = v[4].sD; results[13].s5 = v[5].sD; results[13].s6 = v[6].sD; results[13].s7 = v[7].sD; results[13].s8 = v[8].sD; results[13].s9 = v[9].sD; results[13].sA = v[10].sD; results[13].sB = v[11].sD; results[13].sC = v[12].sD; results[13].sD = v[13].sD; results[13].sE = v[14].sD; results[13].sF = v[15].sD;
|
||||
results[14].s0 = v[0].sE; results[14].s1 = v[1].sE; results[14].s2 = v[2].sE; results[14].s3 = v[3].sE; results[14].s4 = v[4].sE; results[14].s5 = v[5].sE; results[14].s6 = v[6].sE; results[14].s7 = v[7].sE; results[14].s8 = v[8].sE; results[14].s9 = v[9].sE; results[14].sA = v[10].sE; results[14].sB = v[11].sE; results[14].sC = v[12].sE; results[14].sD = v[13].sE; results[14].sE = v[14].sE; results[14].sF = v[15].sE;
|
||||
results[15].s0 = v[0].sF; results[15].s1 = v[1].sF; results[15].s2 = v[2].sF; results[15].s3 = v[3].sF; results[15].s4 = v[4].sF; results[15].s5 = v[5].sF; results[15].s6 = v[6].sF; results[15].s7 = v[7].sF; results[15].s8 = v[8].sF; results[15].s9 = v[9].sF; results[15].sA = v[10].sF; results[15].sB = v[11].sF; results[15].sC = v[12].sF; results[15].sD = v[13].sF; results[15].sE = v[14].sF; results[15].sF = v[15].sF;
|
||||
#endif
|
||||
|
||||
// Multiplies by alpha and then stores the results into the destination matrix
|
||||
|
|
|
@ -24,7 +24,7 @@ R"(
|
|||
|
||||
// Transposes a matrix from source to destination. The output is padded with zero values in case the
|
||||
// destination matrix dimensions are larger than the transposed source matrix dimensions.
|
||||
INLINE_FUNC void _TransposePadMatrix(__local real* tile,
|
||||
INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
|
||||
const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
|
@ -105,7 +105,7 @@ void TransposePadMatrix(const int src_one, const int src_two,
|
|||
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
|
||||
// padded source matrix, but only the actual data is written back to the transposed destination
|
||||
// matrix. This kernel optionally checks for upper/lower triangular matrices.
|
||||
INLINE_FUNC void _TransposeMatrix(__local real* tile,
|
||||
INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile,
|
||||
const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
|
|
|
@ -19,8 +19,8 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
|
||||
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
|
||||
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
|
||||
|
@ -40,8 +40,8 @@ __kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int
|
|||
}
|
||||
|
||||
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
|
||||
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
|
||||
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
|
||||
|
@ -61,8 +61,8 @@ __kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int
|
|||
}
|
||||
|
||||
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
|
||||
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
|
||||
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
|
||||
|
@ -82,8 +82,8 @@ __kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int
|
|||
}
|
||||
|
||||
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
|
||||
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
|
||||
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
|
||||
|
|
|
@ -184,7 +184,7 @@ INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, rea
|
|||
|
||||
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
|
||||
// for caching the A input matrix.
|
||||
INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
|
||||
INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apm[MWID], const int kg,
|
||||
const int a_transpose) {
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWID; ++mi) {
|
||||
|
@ -195,7 +195,7 @@ INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const
|
|||
}
|
||||
|
||||
// Same as above, but now for the B input matrix
|
||||
INLINE_FUNC void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
|
||||
INLINE_FUNC void LocalToPrivateDirectB(LOCAL_PTR real* blm, real bpm[NWID], const int kg,
|
||||
const int b_transpose) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWID; ++ni) {
|
||||
|
|
|
@ -19,7 +19,7 @@ R"(
|
|||
|
||||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||
// caching the A input matrix.
|
||||
INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
|
||||
INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm,
|
||||
const int a_ld, const int a_offset, const int kwg,
|
||||
const int a_transpose, const int a_conjugate) {
|
||||
#if MDIMCD == MDIMAD
|
||||
|
@ -90,7 +90,7 @@ INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __loc
|
|||
}
|
||||
|
||||
// Same as above, but now for the B input matrix
|
||||
INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
|
||||
INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, LOCAL_PTR real* blm,
|
||||
const int b_ld, const int b_offset, const int kwg,
|
||||
const int b_transpose, const int b_conjugate) {
|
||||
#if MDIMCD == NDIMBD
|
||||
|
@ -165,7 +165,7 @@ INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __loc
|
|||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||
// caching the A input matrix. In contrast to the functions above, this function performs doesn't
|
||||
// use the vector data-types.
|
||||
INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
|
||||
INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, LOCAL_PTR real* alm,
|
||||
const int a_ld, const int a_offset, const int kwg,
|
||||
const int a_transpose, const int a_conjugate) {
|
||||
#if MDIMCD == MDIMAD
|
||||
|
@ -196,7 +196,7 @@ INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __loca
|
|||
}
|
||||
|
||||
// Same as above, but now for the B input matrix
|
||||
INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
|
||||
INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, LOCAL_PTR real* blm,
|
||||
const int b_ld, const int b_offset, const int kwg,
|
||||
const int b_transpose, const int b_conjugate) {
|
||||
#if MDIMCD == NDIMBD
|
||||
|
@ -231,7 +231,7 @@ INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __loca
|
|||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||
// caching the A input matrix. In contrast to the functions above, this function performs bounds
|
||||
// checks and doesn't use the vector data-types.
|
||||
INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
|
||||
INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, LOCAL_PTR real* alm,
|
||||
const int a_ld, const int a_offset, const int kwg,
|
||||
const int a_transpose, const int a_conjugate,
|
||||
const int kSizeM, const int kSizeK) {
|
||||
|
@ -270,7 +270,7 @@ INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __loc
|
|||
}
|
||||
|
||||
// Same as above, but now for the B input matrix
|
||||
INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
|
||||
INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, LOCAL_PTR real* blm,
|
||||
const int b_ld, const int b_offset, const int kwg,
|
||||
const int b_transpose, const int b_conjugate,
|
||||
const int kSizeN, const int kSizeK) {
|
||||
|
|
|
@ -24,7 +24,7 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
|
|||
const __global realMD* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global realND* restrict bgm, const int b_offset, const int b_ld,
|
||||
__global real* cgm, const int c_offset, const int c_ld,
|
||||
__local real* alm, __local real* blm,
|
||||
LOCAL_PTR real* alm, LOCAL_PTR real* blm,
|
||||
const int a_transpose, const int b_transpose, const int c_transpose,
|
||||
const int a_conjugate, const int b_conjugate) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
@ -147,8 +147,8 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
|
|||
// =================================================================================================
|
||||
|
||||
// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realMD* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global realND* restrict bgm, const int b_offset, const int b_ld,
|
||||
|
@ -162,8 +162,8 @@ __kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK
|
|||
}
|
||||
|
||||
// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realMD* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global realND* restrict bgm, const int b_offset, const int b_ld,
|
||||
|
@ -177,8 +177,8 @@ __kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK
|
|||
}
|
||||
|
||||
// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realMD* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global realND* restrict bgm, const int b_offset, const int b_ld,
|
||||
|
@ -192,8 +192,8 @@ __kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK
|
|||
}
|
||||
|
||||
// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
|
||||
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realMD* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global realND* restrict bgm, const int b_offset, const int b_ld,
|
||||
|
|
|
@ -186,7 +186,7 @@ INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
|
|||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||
// caching the A input matrix.
|
||||
#if SA == 1
|
||||
INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
|
||||
INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, LOCAL_PTR realM* alm,
|
||||
const int kSizeM, const int tid, const int kwg) {
|
||||
const int la0 = tid % MDIMA;
|
||||
const int la1 = tid / MDIMA;
|
||||
|
@ -216,7 +216,7 @@ INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local real
|
|||
|
||||
// Same as above, but now for the B input matrix
|
||||
#if SB == 1
|
||||
INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
|
||||
INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR realN* blm,
|
||||
const int kSizeN, const int tid, const int kwg) {
|
||||
const int lb0 = tid % NDIMB;
|
||||
const int lb1 = tid / NDIMB;
|
||||
|
@ -298,7 +298,7 @@ INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[
|
|||
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
|
||||
// for caching the A input matrix.
|
||||
#if SA == 1
|
||||
INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
|
||||
INLINE_FUNC void LocalToPrivateA(LOCAL_PTR realM* alm, realM apm[MWI/VWM], const int kg) {
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||
#if STRM == 0
|
||||
|
@ -313,7 +313,7 @@ INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const i
|
|||
|
||||
// Same as above, but now for the B input matrix
|
||||
#if SB == 1
|
||||
INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
|
||||
INLINE_FUNC void LocalToPrivateB(LOCAL_PTR realN* blm, realN bpm[NWI/VWN], const int kg) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWI/VWN; ++ni) {
|
||||
#if STRN == 0
|
||||
|
|
|
@ -17,16 +17,16 @@ R"(
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
|
||||
// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
|
||||
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __global realM* restrict agm, const __global realN* restrict bgm,
|
||||
__global realM* cgm, realM cpm[NWI][MWI/VWM]
|
||||
#if SA == 1 && SB == 1
|
||||
, __local realM* alm, __local realN* blm
|
||||
, LOCAL_PTR realM* alm, LOCAL_PTR realN* blm
|
||||
#elif SA == 1
|
||||
, __local realM* alm
|
||||
, LOCAL_PTR realM* alm
|
||||
#elif SB == 1
|
||||
, __local realN* blm
|
||||
, LOCAL_PTR realN* blm
|
||||
#endif
|
||||
) {
|
||||
|
||||
|
@ -192,10 +192,15 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
__global realM* cgm,
|
||||
const int b_offset, const int c_offset) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Adds the offsets (in case of use of a single temporary buffer for A, B, and C)
|
||||
bgm = &bgm[b_offset];
|
||||
cgm = &cgm[c_offset];
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
__local realM alm[KWG * MWG/VWM];
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains an (incomplete) header to interpret OpenCL kernels as CUDA kernels.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||
R"(
|
||||
// =================================================================================================
|
||||
|
||||
// CLBlast specific additions
|
||||
#define CUDA 1
|
||||
#define LOCAL_PTR // pointers to local memory don't have to be annotated in CUDA
|
||||
|
||||
// Replaces the OpenCL get_xxx_ID with CUDA equivalents
|
||||
__device__ int get_local_id(const int x) {
|
||||
if (x == 0) { return threadIdx.x; }
|
||||
if (x == 1) { return threadIdx.y; }
|
||||
return threadIdx.z;
|
||||
}
|
||||
__device__ int get_group_id(const int x) {
|
||||
if (x == 0) { return blockIdx.x; }
|
||||
if (x == 1) { return blockIdx.y; }
|
||||
return blockIdx.z;
|
||||
}
|
||||
__device__ int get_local_size(const int x) {
|
||||
if (x == 0) { return blockDim.x; }
|
||||
if (x == 1) { return blockDim.y; }
|
||||
return blockDim.z;
|
||||
}
|
||||
__device__ int get_num_groups(const int x) {
|
||||
if (x == 0) { return gridDim.x; }
|
||||
if (x == 1) { return gridDim.y; }
|
||||
return gridDim.z;
|
||||
}
|
||||
__device__ int get_global_size(const int x) {
|
||||
if (x == 0) { return gridDim.x * blockDim.x; }
|
||||
if (x == 1) { return gridDim.y * blockDim.y; }
|
||||
return gridDim.z * blockDim.z;
|
||||
}
|
||||
__device__ int get_global_id(const int x) {
|
||||
if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; }
|
||||
if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; }
|
||||
return blockIdx.z*blockDim.z + threadIdx.z;
|
||||
}
|
||||
|
||||
// Adds the data-types which are not available natively under CUDA
|
||||
typedef struct { float s0; float s1; float s2; float s3;
|
||||
float s4; float s5; float s6; float s7; } float8;
|
||||
typedef struct { float s0; float s1; float s2; float s3;
|
||||
float s4; float s5; float s6; float s7;
|
||||
float s8; float s9; float s10; float s11;
|
||||
float s12; float s13; float s14; float s15; } float16;
|
||||
typedef struct { double s0; double s1; double s2; double s3;
|
||||
double s4; double s5; double s6; double s7; } double8;
|
||||
typedef struct { double s0; double s1; double s2; double s3;
|
||||
double s4; double s5; double s6; double s7;
|
||||
double s8; double s9; double s10; double s11;
|
||||
double s12; double s13; double s14; double s15; } double16;
|
||||
|
||||
// Replaces the OpenCL keywords with CUDA equivalent
|
||||
#define __kernel __placeholder__
|
||||
#define __global
|
||||
#define __placeholder__ extern "C" __global__
|
||||
#define __local __shared__
|
||||
#define restrict __restrict__
|
||||
#define __constant const
|
||||
#define inline __device__ // assumes all device functions are annotated with inline in OpenCL
|
||||
|
||||
// Kernel attributes (don't replace currently)
|
||||
#define reqd_work_group_size(x, y, z)
|
||||
|
||||
// Replaces OpenCL synchronisation with CUDA synchronisation
|
||||
#define barrier(x) __syncthreads()
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)"
|
||||
|
||||
// =================================================================================================
|
||||
|
|
@ -60,7 +60,6 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
|
|||
event_(event),
|
||||
context_(queue_.GetContext()),
|
||||
device_(queue_.GetDevice()),
|
||||
platform_(device_.Platform()),
|
||||
db_(kernel_names) {
|
||||
|
||||
InitDatabase(userDatabase);
|
||||
|
@ -68,26 +67,35 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
|
|||
}
|
||||
|
||||
void Routine::InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase) {
|
||||
const auto platform_id = device_.PlatformID();
|
||||
for (const auto &kernel_name : kernel_names_) {
|
||||
|
||||
// Queries the cache to see whether or not the kernel parameter database is already there
|
||||
bool has_db;
|
||||
db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ platform_, device_(), precision_, kernel_name },
|
||||
db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ platform_id, device_(), precision_, kernel_name },
|
||||
&has_db);
|
||||
if (has_db) { continue; }
|
||||
|
||||
// Builds the parameter database for this device and routine set and stores it in the cache
|
||||
log_debug("Searching database for kernel '" + kernel_name + "'");
|
||||
db_(kernel_name) = Database(device_, kernel_name, precision_, userDatabase);
|
||||
DatabaseCache::Instance().Store(DatabaseKey{ platform_, device_(), precision_, kernel_name },
|
||||
DatabaseCache::Instance().Store(DatabaseKey{ platform_id, device_(), precision_, kernel_name },
|
||||
Database{ db_(kernel_name) });
|
||||
}
|
||||
}
|
||||
|
||||
void Routine::InitProgram(std::initializer_list<const char *> source) {
|
||||
|
||||
// Determines the identifier for this particular routine call
|
||||
auto routine_info = routine_name_;
|
||||
for (const auto &kernel_name : kernel_names_) {
|
||||
routine_info += "_" + kernel_name + db_(kernel_name).GetValuesString();
|
||||
}
|
||||
log_debug(routine_info);
|
||||
|
||||
// Queries the cache to see whether or not the program (context-specific) is already there
|
||||
bool has_program;
|
||||
program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), device_(), precision_, routine_name_ },
|
||||
program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), device_(), precision_, routine_info },
|
||||
&has_program);
|
||||
if (has_program) { return; }
|
||||
|
||||
|
@ -102,12 +110,12 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
|
|||
// is, a program is created and stored in the cache
|
||||
const auto device_name = GetDeviceName(device_);
|
||||
bool has_binary;
|
||||
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name },
|
||||
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_info, device_name },
|
||||
&has_binary);
|
||||
if (has_binary) {
|
||||
program_ = Program(device_, context_, binary);
|
||||
program_.Build(device_, options);
|
||||
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_name_ },
|
||||
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
|
||||
Program{ program_ });
|
||||
return;
|
||||
}
|
||||
|
@ -115,13 +123,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
|
|||
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
|
||||
// program will be added to the cache.
|
||||
|
||||
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
|
||||
// Inspects whether or not FP64 is supported in case of double precision
|
||||
if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) ||
|
||||
(precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) {
|
||||
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
|
||||
}
|
||||
|
||||
// As above, but for cl_khr_fp16 (half precision)
|
||||
// As above, but for FP16 (half precision)
|
||||
if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) {
|
||||
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
|
||||
}
|
||||
|
@ -159,6 +167,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
|
|||
source_string += "#define GLOBAL_MEM_FENCE 1\n";
|
||||
}
|
||||
|
||||
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
|
||||
#ifdef CUDA_API
|
||||
source_string +=
|
||||
#include "kernels/opencl_to_cuda.h"
|
||||
;
|
||||
#endif
|
||||
|
||||
// Loads the common header (typedefs and defines and such)
|
||||
source_string +=
|
||||
#include "kernels/common.opencl"
|
||||
|
@ -180,8 +195,8 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
|
|||
program_ = Program(context_, source_string);
|
||||
try {
|
||||
program_.Build(device_, options);
|
||||
} catch (const CLError &e) {
|
||||
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
|
||||
} catch (const CLCudaAPIBuildError &e) {
|
||||
if (program_.StatusIsCompilationWarningOrError(e.status())) {
|
||||
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
|
||||
program_.GetBuildInfo(device_).c_str());
|
||||
}
|
||||
|
@ -189,10 +204,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
|
|||
}
|
||||
|
||||
// Store the compiled binary and program in the cache
|
||||
BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name },
|
||||
BinaryCache::Instance().Store(BinaryKey{precision_, routine_info, device_name},
|
||||
program_.GetIR());
|
||||
|
||||
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_name_ },
|
||||
ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
|
||||
Program{ program_ });
|
||||
|
||||
// Prints the elapsed compilation time in case of debugging in verbose mode
|
||||
|
|
|
@ -75,7 +75,6 @@ class Routine {
|
|||
EventPointer event_;
|
||||
const Context context_;
|
||||
const Device device_;
|
||||
const cl_platform_id platform_;
|
||||
|
||||
// Compiled program (either retrieved from cache or compiled in slow path)
|
||||
Program program_;
|
||||
|
|
|
@ -19,8 +19,7 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "clpp11.hpp"
|
||||
#include "clblast.h"
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "database/database.hpp"
|
||||
|
||||
namespace clblast {
|
||||
|
|
|
@ -131,10 +131,13 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
|
|||
if (i > 0) {
|
||||
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
|
||||
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
|
||||
DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
|
||||
a_buffer, a_offset + extra_offset_a, a_ld,
|
||||
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
|
||||
x_buffer, x_offset + extra_offset_b, x_inc );
|
||||
auto gemv_event = Event();
|
||||
auto gemv = Xgemv<T>(queue_, gemv_event.pointer());
|
||||
gemv.DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
|
||||
a_buffer, a_offset + extra_offset_a, a_ld,
|
||||
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
|
||||
x_buffer, x_offset + extra_offset_b, x_inc);
|
||||
gemv_event.WaitForCompletion();
|
||||
}
|
||||
|
||||
// Runs the triangular substitution for the block size
|
||||
|
|
|
@ -161,10 +161,24 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
|
|||
auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
|
||||
c_do_transpose == false;
|
||||
|
||||
// Creates the temporary matrices
|
||||
const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
|
||||
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
|
||||
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
|
||||
// Computes the sizes and offsets for (optional) temporary buffers for the 3 matrices
|
||||
auto temp_size = size_t{0};
|
||||
auto b_temp_offset = size_t{0};
|
||||
auto c_temp_offset = size_t{0};
|
||||
if (!a_no_temp) { temp_size += a_one_i*a_two_i; }
|
||||
if (!b_no_temp) { b_temp_offset = temp_size; temp_size += b_one_i*b_two_i; }
|
||||
if (!c_no_temp) { c_temp_offset = temp_size; temp_size += c_one_i*c_two_i; }
|
||||
if (!IsMultiple(b_temp_offset, db_["VWN"])) { throw BLASError(StatusCode::kUnexpectedError); }
|
||||
if (!IsMultiple(c_temp_offset, db_["VWM"])) { throw BLASError(StatusCode::kUnexpectedError); }
|
||||
|
||||
// Creates the buffer for the (optional) temporary matrices. Note that we use 'a_buffer' in case
|
||||
// when no temporary buffer is needed, but that's just to make it compile: it is never used.
|
||||
const auto temp_buffer = (temp_size > 0) ? Buffer<T>(context_, temp_size) : a_buffer;
|
||||
|
||||
// Sets the buffer pointers for (temp) matrices A, B, and C
|
||||
const auto a_temp = (a_no_temp) ? a_buffer : temp_buffer;
|
||||
const auto b_temp = (b_no_temp) ? b_buffer : temp_buffer;
|
||||
const auto c_temp = (c_no_temp) ? c_buffer : temp_buffer;
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
|
@ -188,7 +202,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
|
|||
auto eventProcessB = Event();
|
||||
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
|
||||
b_one, b_two, b_ld, b_offset, b_buffer,
|
||||
b_one_i, b_two_i, b_one_i, 0, b_temp,
|
||||
b_one_i, b_two_i, b_one_i, b_temp_offset, b_temp,
|
||||
ConstantOne<T>(), program_,
|
||||
true, b_do_transpose, b_conjugate);
|
||||
eventWaitList.push_back(eventProcessB);
|
||||
|
@ -199,7 +213,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
|
|||
auto eventProcessC = Event();
|
||||
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
c_one_i, c_two_i, c_one_i, 0, c_temp,
|
||||
c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp,
|
||||
ConstantOne<T>(), program_,
|
||||
true, c_do_transpose, false);
|
||||
eventWaitList.push_back(eventProcessC);
|
||||
|
@ -217,6 +231,8 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
|
|||
kernel.SetArgument(5, a_temp());
|
||||
kernel.SetArgument(6, b_temp());
|
||||
kernel.SetArgument(7, c_temp());
|
||||
kernel.SetArgument(8, static_cast<int>(b_temp_offset / db_["VWN"]));
|
||||
kernel.SetArgument(9, static_cast<int>(c_temp_offset / db_["VWM"]));
|
||||
|
||||
// Computes the global and local thread sizes
|
||||
const auto global = std::vector<size_t>{
|
||||
|
@ -234,7 +250,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
|
|||
if (!c_no_temp) {
|
||||
eventWaitList.push_back(eventKernel);
|
||||
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
|
||||
c_one_i, c_two_i, c_one_i, 0, c_temp,
|
||||
c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
ConstantOne<T>(), program_,
|
||||
false, c_do_transpose, false);
|
||||
|
|
|
@ -73,7 +73,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
|
||||
|
||||
// Settings
|
||||
constexpr auto block_size = size_t{32}; // tuneable
|
||||
constexpr auto block_size = size_t{16}; // tuneable
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
@ -128,18 +128,25 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
for (auto i = size_t{0}; i < m; i += block_size) {
|
||||
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
|
||||
const auto current_block_size = std::min(m - i, block_size);
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, b_offset + i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i, x_ld);
|
||||
auto gemm1_event = Event();
|
||||
auto gemm1 = Xgemm<T>(queue_, gemm1_event.pointer());
|
||||
gemm1.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, b_offset + i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i, x_ld);
|
||||
gemm1_event.WaitForCompletion();
|
||||
if (i + block_size >= m) { break; }
|
||||
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
m - i - block_size, n, block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset, a_ld,
|
||||
x_buffer, x_offset + i, x_ld, gemm_alpha,
|
||||
b_buffer, b_offset + i + block_size, b_ld);
|
||||
auto gemm2_event = Event();
|
||||
auto gemm2 = Xgemm<T>(queue_, gemm2_event.pointer());
|
||||
gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
m - i - block_size, n, block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset + a_offset, a_ld,
|
||||
x_buffer, x_offset + i, x_ld, gemm_alpha,
|
||||
b_buffer, b_offset + i + block_size, b_ld);
|
||||
gemm2_event.WaitForCompletion();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,18 +157,25 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
|
||||
const auto current_block_size = (i == i_start) ? special_block_size : block_size;
|
||||
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, b_offset + i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i, x_ld);
|
||||
auto gemm1_event = Event();
|
||||
auto gemm1 = Xgemm<T>(queue_, gemm1_event.pointer());
|
||||
gemm1.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, b_offset + i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i, x_ld);
|
||||
gemm1_event.WaitForCompletion();
|
||||
if (i - static_cast<int>(block_size) < 0) { break; }
|
||||
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
i, n, current_block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset, a_ld,
|
||||
x_buffer, x_offset + i, x_ld, gemm_alpha,
|
||||
b_buffer, b_offset, b_ld);
|
||||
auto gemm2_event = Event();
|
||||
auto gemm2 = Xgemm<T>(queue_, gemm2_event.pointer());
|
||||
gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
i, n, current_block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset + a_offset, a_ld,
|
||||
x_buffer, x_offset + i, x_ld, gemm_alpha,
|
||||
b_buffer, b_offset, b_ld);
|
||||
gemm2_event.WaitForCompletion();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -176,18 +190,25 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
|
||||
const auto current_block_size = (i == i_start) ? special_block_size : block_size;
|
||||
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, b_offset + i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld);
|
||||
auto gemm1_event = Event();
|
||||
auto gemm1 = Xgemm<T>(queue_, gemm1_event.pointer());
|
||||
gemm1.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, b_offset + i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld);
|
||||
gemm1_event.WaitForCompletion();
|
||||
if (i - static_cast<int>(block_size) < 0) { break; }
|
||||
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, i, current_block_size, ConstantNegOne<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset, a_ld, gemm_alpha,
|
||||
b_buffer, b_offset, b_ld);
|
||||
auto gemm2_event = Event();
|
||||
auto gemm2 = Xgemm<T>(queue_, gemm2_event.pointer());
|
||||
gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, i, current_block_size, ConstantNegOne<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha,
|
||||
b_buffer, b_offset, b_ld);
|
||||
gemm2_event.WaitForCompletion();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -196,18 +217,25 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
for (auto i = size_t{0}; i < n; i += block_size) {
|
||||
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
|
||||
const auto current_block_size = std::min(n - i, block_size);
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, b_offset + i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld);
|
||||
auto gemm1_event = Event();
|
||||
auto gemm1 = Xgemm<T>(queue_, gemm1_event.pointer());
|
||||
gemm1.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, b_offset + i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld);
|
||||
gemm1_event.WaitForCompletion();
|
||||
if (i + block_size >= n) { break; }
|
||||
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, n - i - block_size, block_size, ConstantNegOne<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset, a_ld, gemm_alpha,
|
||||
b_buffer, b_offset + (i + block_size) * b_ld, b_ld);
|
||||
auto gemm2_event = Event();
|
||||
auto gemm2 = Xgemm<T>(queue_, gemm2_event.pointer());
|
||||
gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, n - i - block_size, block_size, ConstantNegOne<T>(),
|
||||
x_buffer, x_offset + i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha,
|
||||
b_buffer, b_offset + (i + block_size) * b_ld, b_ld);
|
||||
gemm2_event.WaitForCompletion();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -59,9 +59,9 @@ void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas
|
|||
x_offsets_int[batch] = static_cast<int>(x_offsets[batch]);
|
||||
y_offsets_int[batch] = static_cast<int>(y_offsets[batch]);
|
||||
}
|
||||
auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
x_offsets_device.Write(queue_, batch_count, x_offsets_int);
|
||||
y_offsets_device.Write(queue_, batch_count, y_offsets_int);
|
||||
alphas_device.Write(queue_, batch_count, alphas);
|
||||
|
|
|
@ -100,8 +100,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
|
|||
}
|
||||
|
||||
// Upload the scalar arguments to the device
|
||||
auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto betas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
alphas_device.Write(queue_, batch_count, alphas);
|
||||
betas_device.Write(queue_, batch_count, betas);
|
||||
|
||||
|
@ -200,8 +200,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
|
|||
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
|
||||
// case nothing has to be done, these kernels can be skipped.
|
||||
if (!a_no_temp) {
|
||||
auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
a_offsets_device.Write(queue_, batch_count, a_offsets);
|
||||
a_offsets_i_device.Write(queue_, batch_count, a_offsets_i);
|
||||
auto eventProcessA = Event();
|
||||
|
@ -214,8 +214,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
|
|||
|
||||
// As above, but now for matrix B
|
||||
if (!b_no_temp) {
|
||||
auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
b_offsets_device.Write(queue_, batch_count, b_offsets);
|
||||
b_offsets_i_device.Write(queue_, batch_count, b_offsets_i);
|
||||
auto eventProcessB = Event();
|
||||
|
@ -227,8 +227,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
|
|||
}
|
||||
|
||||
// As above, but now for matrix C
|
||||
auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
if (!c_no_temp) {
|
||||
c_offsets_device.Write(queue_, batch_count, c_offsets);
|
||||
c_offsets_i_device.Write(queue_, batch_count, c_offsets_i);
|
||||
|
@ -297,9 +297,9 @@ void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const si
|
|||
const size_t batch_count) {
|
||||
|
||||
// Uploads the offsets to the device
|
||||
auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
|
||||
a_offsets_device.Write(queue_, batch_count, a_offsets);
|
||||
b_offsets_device.Write(queue_, batch_count, b_offsets);
|
||||
c_offsets_device.Write(queue_, batch_count, c_offsets);
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains all the includes of all the routines in CLBlast.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_ROUTINES_H_
|
||||
#define CLBLAST_ROUTINES_ROUTINES_H_
|
||||
|
||||
// BLAS level-1 includes
|
||||
#include "routines/level1/xswap.hpp"
|
||||
#include "routines/level1/xscal.hpp"
|
||||
#include "routines/level1/xcopy.hpp"
|
||||
#include "routines/level1/xaxpy.hpp"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
#include "routines/level1/xdotu.hpp"
|
||||
#include "routines/level1/xdotc.hpp"
|
||||
#include "routines/level1/xnrm2.hpp"
|
||||
#include "routines/level1/xasum.hpp"
|
||||
#include "routines/level1/xsum.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xamax.hpp"
|
||||
#include "routines/level1/xamin.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xmax.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xmin.hpp" // non-BLAS routine
|
||||
|
||||
// BLAS level-2 includes
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
#include "routines/level2/xgbmv.hpp"
|
||||
#include "routines/level2/xhemv.hpp"
|
||||
#include "routines/level2/xhbmv.hpp"
|
||||
#include "routines/level2/xhpmv.hpp"
|
||||
#include "routines/level2/xsymv.hpp"
|
||||
#include "routines/level2/xsbmv.hpp"
|
||||
#include "routines/level2/xspmv.hpp"
|
||||
#include "routines/level2/xtrmv.hpp"
|
||||
#include "routines/level2/xtbmv.hpp"
|
||||
#include "routines/level2/xtpmv.hpp"
|
||||
#include "routines/level2/xtrsv.hpp"
|
||||
#include "routines/level2/xger.hpp"
|
||||
#include "routines/level2/xgeru.hpp"
|
||||
#include "routines/level2/xgerc.hpp"
|
||||
#include "routines/level2/xher.hpp"
|
||||
#include "routines/level2/xhpr.hpp"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
#include "routines/level2/xhpr2.hpp"
|
||||
#include "routines/level2/xsyr.hpp"
|
||||
#include "routines/level2/xspr.hpp"
|
||||
#include "routines/level2/xsyr2.hpp"
|
||||
#include "routines/level2/xspr2.hpp"
|
||||
|
||||
// BLAS level-3 includes
|
||||
#include "routines/level3/xgemm.hpp"
|
||||
#include "routines/level3/xsymm.hpp"
|
||||
#include "routines/level3/xhemm.hpp"
|
||||
#include "routines/level3/xsyrk.hpp"
|
||||
#include "routines/level3/xherk.hpp"
|
||||
#include "routines/level3/xsyr2k.hpp"
|
||||
#include "routines/level3/xher2k.hpp"
|
||||
#include "routines/level3/xtrmm.hpp"
|
||||
#include "routines/level3/xtrsm.hpp"
|
||||
|
||||
// Level-x includes (non-BLAS)
|
||||
#include "routines/levelx/xomatcopy.hpp"
|
||||
#include "routines/levelx/xim2col.hpp"
|
||||
#include "routines/levelx/xaxpybatched.hpp"
|
||||
#include "routines/levelx/xgemmbatched.hpp"
|
||||
|
||||
// CLBLAST_ROUTINES_ROUTINES_H_
|
||||
#endif
|
|
@ -25,70 +25,64 @@ template <typename T>
|
|||
class TuneCopy {
|
||||
public:
|
||||
|
||||
// The representative kernel and the source code
|
||||
static std::string KernelFamily() { return "copy"; }
|
||||
static std::string KernelName() { return "CopyMatrixFast"; }
|
||||
static std::string GetSources() {
|
||||
return
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/copy_fast.opencl"
|
||||
;
|
||||
// Settings for this kernel (default command-line arguments)
|
||||
static TunerDefaults GetTunerDefaults() {
|
||||
auto settings = TunerDefaults();
|
||||
settings.options = {kArgM, kArgN, kArgAlpha};
|
||||
settings.default_m = 1024;
|
||||
settings.default_n = 1024;
|
||||
return settings;
|
||||
}
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
|
||||
// Settings for this kernel (general)
|
||||
static TunerSettings GetTunerSettings(const Arguments<T> &args) {
|
||||
auto settings = TunerSettings();
|
||||
|
||||
// Identification of the kernel
|
||||
settings.kernel_family = "copy";
|
||||
settings.kernel_name = "CopyMatrixFast";
|
||||
settings.sources =
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/copy_fast.opencl"
|
||||
;
|
||||
|
||||
// Buffer sizes
|
||||
settings.size_a = args.m * args.n;
|
||||
settings.size_b = args.m * args.n;
|
||||
|
||||
// Sets the base thread configuration
|
||||
settings.global_size = {args.m, args.n};
|
||||
settings.global_size_ref = settings.global_size;
|
||||
settings.local_size = {1, 1};
|
||||
settings.local_size_ref = {8, 8};
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}};
|
||||
settings.div_global = {{"COPY_VW", "COPY_WPT"}};
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
settings.parameters = {
|
||||
{"COPY_DIMX", {8, 16, 32}},
|
||||
{"COPY_DIMY", {8, 16, 32}},
|
||||
{"COPY_WPT", {1, 2, 4, 8}},
|
||||
{"COPY_VW", {1, 2, 4, 8}},
|
||||
};
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
|
||||
settings.performance_unit = "GB/s";
|
||||
|
||||
return settings;
|
||||
}
|
||||
|
||||
// Tests for valid arguments
|
||||
static void TestValidArguments(const Arguments<T> &) { }
|
||||
|
||||
// Sets the default values for the arguments
|
||||
static size_t DefaultM() { return 1024; }
|
||||
static size_t DefaultN() { return 1024; }
|
||||
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
|
||||
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||
static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
|
||||
static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel
|
||||
static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel
|
||||
static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel
|
||||
static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel
|
||||
static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);}
|
||||
static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||
tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32});
|
||||
tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32});
|
||||
tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8});
|
||||
tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8});
|
||||
}
|
||||
|
||||
// Sets the constraints and local memory size
|
||||
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
|
||||
|
||||
// Sets the base thread configuration
|
||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
|
||||
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||
static std::vector<size_t> LocalSize() { return {1, 1}; }
|
||||
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
using TransformVector = std::vector<std::vector<std::string>>;
|
||||
static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; }
|
||||
static TransformVector DivLocal() { return {}; }
|
||||
static TransformVector MulGlobal() { return {}; }
|
||||
static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; }
|
||||
|
||||
// Sets the kernel's arguments
|
||||
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||
std::vector<T> &, std::vector<T> &,
|
||||
|
@ -99,17 +93,6 @@ class TuneCopy {
|
|||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
static size_t GetMetric(const Arguments<T> &args) {
|
||||
return 2 * args.m * args.n * GetBytes(args.precision);
|
||||
}
|
||||
static std::string PerformanceUnit() { return "GB/s"; }
|
||||
|
||||
// Returns which Heuristic to run
|
||||
static size_t GetHeuristic(const Arguments<T> &args){
|
||||
return static_cast<size_t> (cltune::SearchMethod::FullSearch);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -25,70 +25,64 @@ template <typename T>
|
|||
class TunePad {
|
||||
public:
|
||||
|
||||
// The representative kernel and the source code
|
||||
static std::string KernelFamily() { return "pad"; }
|
||||
static std::string KernelName() { return "CopyPadMatrix"; }
|
||||
static std::string GetSources() {
|
||||
return
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/copy_pad.opencl"
|
||||
;
|
||||
// Settings for this kernel (default command-line arguments)
|
||||
static TunerDefaults GetTunerDefaults() {
|
||||
auto settings = TunerDefaults();
|
||||
settings.options = {kArgM, kArgN, kArgAlpha};
|
||||
settings.default_m = 1024;
|
||||
settings.default_n = 1024;
|
||||
return settings;
|
||||
}
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
|
||||
// Settings for this kernel (general)
|
||||
static TunerSettings GetTunerSettings(const Arguments<T> &args) {
|
||||
auto settings = TunerSettings();
|
||||
|
||||
// Identification of the kernel
|
||||
settings.kernel_family = "pad";
|
||||
settings.kernel_name = "CopyPadMatrix";
|
||||
settings.sources =
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/copy_pad.opencl"
|
||||
;
|
||||
|
||||
// Buffer sizes
|
||||
settings.size_a = args.m * args.n;
|
||||
settings.size_b = args.m * args.n;
|
||||
|
||||
// Sets the base thread configuration
|
||||
settings.global_size = {args.m, args.n};
|
||||
settings.global_size_ref = settings.global_size;
|
||||
settings.local_size = {1, 1};
|
||||
settings.local_size_ref = {8, 8};
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}};
|
||||
settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}};
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
settings.parameters = {
|
||||
{"PAD_DIMX", {8, 16, 32}},
|
||||
{"PAD_DIMY", {8, 16, 32}},
|
||||
{"PAD_WPTX", {1, 2, 4}},
|
||||
{"PAD_WPTY", {1, 2, 4}},
|
||||
};
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
|
||||
settings.performance_unit = "GB/s";
|
||||
|
||||
return settings;
|
||||
}
|
||||
|
||||
// Tests for valid arguments
|
||||
static void TestValidArguments(const Arguments<T> &) { }
|
||||
|
||||
// Sets the default values for the arguments
|
||||
static size_t DefaultM() { return 1024; }
|
||||
static size_t DefaultN() { return 1024; }
|
||||
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
|
||||
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||
static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
|
||||
static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel
|
||||
static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel
|
||||
static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel
|
||||
static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel
|
||||
static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);}
|
||||
static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||
tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32});
|
||||
tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32});
|
||||
tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4});
|
||||
tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4});
|
||||
}
|
||||
|
||||
// Sets the constraints and local memory size
|
||||
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
|
||||
|
||||
// Sets the base thread configuration
|
||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
|
||||
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||
static std::vector<size_t> LocalSize() { return {1, 1}; }
|
||||
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
using TransformVector = std::vector<std::vector<std::string>>;
|
||||
static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; }
|
||||
static TransformVector DivLocal() { return {}; }
|
||||
static TransformVector MulGlobal() { return {}; }
|
||||
static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; }
|
||||
|
||||
// Sets the kernel's arguments
|
||||
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||
std::vector<T> &, std::vector<T> &,
|
||||
|
@ -107,17 +101,6 @@ class TunePad {
|
|||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(0);
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
static size_t GetMetric(const Arguments<T> &args) {
|
||||
return 2 * args.m * args.n * GetBytes(args.precision);
|
||||
}
|
||||
static std::string PerformanceUnit() { return "GB/s"; }
|
||||
|
||||
// Returns which Heuristic to run
|
||||
static size_t GetHeuristic(const Arguments<T> &args){
|
||||
return static_cast<size_t> (cltune::SearchMethod::FullSearch);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -25,53 +25,60 @@ template <typename T>
|
|||
class TuneTranspose {
|
||||
public:
|
||||
|
||||
// The representative kernel and the source code
|
||||
static std::string KernelFamily() { return "transpose"; }
|
||||
static std::string KernelName() { return "TransposeMatrixFast"; }
|
||||
static std::string GetSources() {
|
||||
return
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/transpose_fast.opencl"
|
||||
;
|
||||
// Settings for this kernel (default command-line arguments)
|
||||
static TunerDefaults GetTunerDefaults() {
|
||||
auto settings = TunerDefaults();
|
||||
settings.options = {kArgM, kArgN, kArgAlpha};
|
||||
settings.default_m = 1024;
|
||||
settings.default_n = 1024;
|
||||
return settings;
|
||||
}
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
|
||||
// Settings for this kernel (general)
|
||||
static TunerSettings GetTunerSettings(const Arguments<T> &args) {
|
||||
auto settings = TunerSettings();
|
||||
|
||||
// Identification of the kernel
|
||||
settings.kernel_family = "transpose";
|
||||
settings.kernel_name = "TransposeMatrixFast";
|
||||
settings.sources =
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/transpose_fast.opencl"
|
||||
;
|
||||
|
||||
// Buffer sizes
|
||||
settings.size_a = args.m * args.n;
|
||||
settings.size_b = args.m * args.n;
|
||||
|
||||
// Sets the base thread configuration
|
||||
settings.global_size = {args.m, args.n};
|
||||
settings.global_size_ref = settings.global_size;
|
||||
settings.local_size = {1, 1};
|
||||
settings.local_size_ref = {8, 8};
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
settings.mul_local = {{"TRA_DIM", "TRA_DIM"}};
|
||||
settings.div_global = {{"TRA_WPT", "TRA_WPT"}};
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
settings.parameters = {
|
||||
{"TRA_DIM", {4, 8, 16, 32, 64}},
|
||||
{"TRA_WPT", {1, 2, 4, 8, 16}},
|
||||
{"TRA_PAD", {0, 1}},
|
||||
{"TRA_SHUFFLE", {0, 1}},
|
||||
};
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
|
||||
settings.performance_unit = "GB/s";
|
||||
|
||||
return settings;
|
||||
}
|
||||
|
||||
// Tests for valid arguments
|
||||
static void TestValidArguments(const Arguments<T> &) { }
|
||||
|
||||
// Sets the default values for the arguments
|
||||
static size_t DefaultM() { return 1024; }
|
||||
static size_t DefaultN() { return 1024; }
|
||||
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
|
||||
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||
static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
|
||||
static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel
|
||||
static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel
|
||||
static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel
|
||||
static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel
|
||||
static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);}
|
||||
static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||
tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
|
||||
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
|
||||
tuner.AddParameter(id, "TRA_PAD", {0, 1});
|
||||
tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
|
||||
}
|
||||
|
||||
// Sets the constraints and local memory size
|
||||
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
|
||||
|
@ -81,19 +88,6 @@ class TuneTranspose {
|
|||
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
|
||||
}
|
||||
|
||||
// Sets the base thread configuration
|
||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
|
||||
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||
static std::vector<size_t> LocalSize() { return {1, 1}; }
|
||||
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
using TransformVector = std::vector<std::vector<std::string>>;
|
||||
static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; }
|
||||
static TransformVector DivLocal() { return {}; }
|
||||
static TransformVector MulGlobal() { return {}; }
|
||||
static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; }
|
||||
|
||||
// Sets the kernel's arguments
|
||||
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||
std::vector<T> &, std::vector<T> &,
|
||||
|
@ -104,17 +98,6 @@ class TuneTranspose {
|
|||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
static size_t GetMetric(const Arguments<T> &args) {
|
||||
return 2 * args.m * args.n * GetBytes(args.precision);
|
||||
}
|
||||
static std::string PerformanceUnit() { return "GB/s"; }
|
||||
|
||||
// Returns which Heuristic to run
|
||||
static size_t GetHeuristic(const Arguments<T> &args){
|
||||
return static_cast<size_t> (cltune::SearchMethod::FullSearch);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -25,52 +25,59 @@ template <typename T>
|
|||
class TunePadTranspose {
|
||||
public:
|
||||
|
||||
// The representative kernel and the source code
|
||||
static std::string KernelFamily() { return "padtranspose"; }
|
||||
static std::string KernelName() { return "TransposePadMatrix"; }
|
||||
static std::string GetSources() {
|
||||
return
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/transpose_pad.opencl"
|
||||
;
|
||||
// Settings for this kernel (default command-line arguments)
|
||||
static TunerDefaults GetTunerDefaults() {
|
||||
auto settings = TunerDefaults();
|
||||
settings.options = {kArgM, kArgN, kArgAlpha};
|
||||
settings.default_m = 1024;
|
||||
settings.default_n = 1024;
|
||||
return settings;
|
||||
}
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
|
||||
// Settings for this kernel (general)
|
||||
static TunerSettings GetTunerSettings(const Arguments<T> &args) {
|
||||
auto settings = TunerSettings();
|
||||
|
||||
// Identification of the kernel
|
||||
settings.kernel_family = "padtranspose";
|
||||
settings.kernel_name = "TransposePadMatrix";
|
||||
settings.sources =
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level3/level3.opencl"
|
||||
#include "../src/kernels/level3/transpose_pad.opencl"
|
||||
;
|
||||
|
||||
// Buffer sizes
|
||||
settings.size_a = args.m * args.n;
|
||||
settings.size_b = args.m * args.n;
|
||||
|
||||
// Sets the base thread configuration
|
||||
settings.global_size = {args.m, args.n};
|
||||
settings.global_size_ref = settings.global_size;
|
||||
settings.local_size = {1, 1};
|
||||
settings.local_size_ref = {8, 8};
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}};
|
||||
settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}};
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
settings.parameters = {
|
||||
{"PADTRA_TILE", {8, 16, 32, 64}},
|
||||
{"PADTRA_WPT", {1, 2, 4, 8, 16}},
|
||||
{"PADTRA_PAD", {0, 1}},
|
||||
};
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
|
||||
settings.performance_unit = "GB/s";
|
||||
|
||||
return settings;
|
||||
}
|
||||
|
||||
// Tests for valid arguments
|
||||
static void TestValidArguments(const Arguments<T> &) { }
|
||||
|
||||
// Sets the default values for the arguments
|
||||
static size_t DefaultM() { return 1024; }
|
||||
static size_t DefaultN() { return 1024; }
|
||||
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
|
||||
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||
static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
|
||||
static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel
|
||||
static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel
|
||||
static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel
|
||||
static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel
|
||||
static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);}
|
||||
static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
|
||||
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||
tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64});
|
||||
tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16});
|
||||
tuner.AddParameter(id, "PADTRA_PAD", {0, 1});
|
||||
}
|
||||
|
||||
// Sets the constraints and local memory size
|
||||
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
|
||||
|
@ -80,19 +87,6 @@ class TunePadTranspose {
|
|||
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
|
||||
}
|
||||
|
||||
// Sets the base thread configuration
|
||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
|
||||
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||
static std::vector<size_t> LocalSize() { return {1, 1}; }
|
||||
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
using TransformVector = std::vector<std::vector<std::string>>;
|
||||
static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; }
|
||||
static TransformVector DivLocal() { return {}; }
|
||||
static TransformVector MulGlobal() { return {}; }
|
||||
static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; }
|
||||
|
||||
// Sets the kernel's arguments
|
||||
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||
std::vector<T> &, std::vector<T> &,
|
||||
|
@ -111,17 +105,6 @@ class TunePadTranspose {
|
|||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(0);
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
static size_t GetMetric(const Arguments<T> &args) {
|
||||
return 2 * args.m * args.n * GetBytes(args.precision);
|
||||
}
|
||||
static std::string PerformanceUnit() { return "GB/s"; }
|
||||
|
||||
// Returns which Heuristic to run
|
||||
static size_t GetHeuristic(const Arguments<T> &args){
|
||||
return static_cast<size_t> (cltune::SearchMethod::FullSearch);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -25,19 +25,54 @@ template <typename T>
|
|||
class TuneXaxpy {
|
||||
public:
|
||||
|
||||
// The representative kernel and the source code
|
||||
static std::string KernelFamily() { return "xaxpy"; }
|
||||
static std::string KernelName() { return "XaxpyFastest"; }
|
||||
static std::string GetSources() {
|
||||
return
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level1/level1.opencl"
|
||||
#include "../src/kernels/level1/xaxpy.opencl"
|
||||
;
|
||||
// Settings for this kernel (default command-line arguments)
|
||||
static TunerDefaults GetTunerDefaults() {
|
||||
auto settings = TunerDefaults();
|
||||
settings.options = {kArgN, kArgAlpha};
|
||||
settings.default_n = 4096*1024;
|
||||
return settings;
|
||||
}
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() { return {kArgN, kArgAlpha}; }
|
||||
// Settings for this kernel (general)
|
||||
static TunerSettings GetTunerSettings(const Arguments<T> &args) {
|
||||
auto settings = TunerSettings();
|
||||
|
||||
// Identification of the kernel
|
||||
settings.kernel_family = "xaxpy";
|
||||
settings.kernel_name = "XaxpyFastest";
|
||||
settings.sources =
|
||||
#include "../src/kernels/common.opencl"
|
||||
#include "../src/kernels/level1/level1.opencl"
|
||||
#include "../src/kernels/level1/xaxpy.opencl"
|
||||
;
|
||||
|
||||
// Buffer sizes
|
||||
settings.size_x = args.n;
|
||||
settings.size_y = args.n;
|
||||
|
||||
// Sets the base thread configuration
|
||||
settings.global_size = {args.n};
|
||||
settings.global_size_ref = settings.global_size;
|
||||
settings.local_size = {1};
|
||||
settings.local_size_ref = {64};
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
settings.mul_local = {{"WGS"}};
|
||||
settings.div_global = {{"WPT"},{"VW"}};
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
settings.parameters = {
|
||||
{"WGS", {64, 128, 256, 512, 1024, 2048}},
|
||||
{"WPT", {1, 2, 4, 8}},
|
||||
{"VW", {1, 2, 4, 8}},
|
||||
};
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
settings.metric_amount = 3 * args.n * GetBytes(args.precision);
|
||||
settings.performance_unit = "GB/s";
|
||||
|
||||
return settings;
|
||||
}
|
||||
|
||||
// Tests for valid arguments
|
||||
static void TestValidArguments(const Arguments<T> &args) {
|
||||
|
@ -46,52 +81,10 @@ class TuneXaxpy {
|
|||
}
|
||||
}
|
||||
|
||||
// Sets the default values for the arguments
|
||||
static size_t DefaultM() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultN() { return 4096*1024; }
|
||||
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
|
||||
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||
static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
|
||||
static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel
|
||||
static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel
|
||||
static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel
|
||||
static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel
|
||||
static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);}
|
||||
static double DefaultMaxTempAnn(){ return 1.0;} // N/A for this kernel
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
|
||||
static size_t GetSizeY(const Arguments<T> &args) { return args.n; }
|
||||
static size_t GetSizeA(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||
|
||||
// Sets the tuning parameters and their possible values
|
||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||
tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048});
|
||||
tuner.AddParameter(id, "WPT", {1, 2, 4, 8});
|
||||
tuner.AddParameter(id, "VW", {1, 2, 4, 8});
|
||||
}
|
||||
|
||||
// Sets the constraints and local memory size
|
||||
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
|
||||
|
||||
// Sets the base thread configuration
|
||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.n}; }
|
||||
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||
static std::vector<size_t> LocalSize() { return {1}; }
|
||||
static std::vector<size_t> LocalSizeRef() { return {64}; }
|
||||
|
||||
// Transforms the thread configuration based on the parameters
|
||||
using TransformVector = std::vector<std::vector<std::string>>;
|
||||
static TransformVector MulLocal() { return {{"WGS"}}; }
|
||||
static TransformVector DivLocal() { return {}; }
|
||||
static TransformVector MulGlobal() { return {}; }
|
||||
static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; }
|
||||
|
||||
// Sets the kernel's arguments
|
||||
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
|
@ -102,17 +95,6 @@ class TuneXaxpy {
|
|||
tuner.AddArgumentInput(x_vec);
|
||||
tuner.AddArgumentOutput(y_vec);
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
static size_t GetMetric(const Arguments<T> &args) {
|
||||
return 3 * args.n * GetBytes(args.precision);
|
||||
}
|
||||
static std::string PerformanceUnit() { return "GB/s"; }
|
||||
|
||||
// Returns which Heuristic to run
|
||||
static size_t GetHeuristic(const Arguments<T> &args){
|
||||
return static_cast<size_t> (cltune::SearchMethod::FullSearch);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue