Merge pull request #31 from CNugteren/development

Update to version 0.6.0
2024-08-27 23:37:02 +02:00 · 2016-03-13 11:05:51 +01:00 · 2016-03-13 11:05:51 +01:00 · d190becd89
parent 4678fd371d bf4bd072e2
commit d190becd89
137 changed files with 6198 additions and 1463 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@ build
 stash
 .*
 *.pyc
 *.db
--- a/.travis.yml
+++ b/.travis.yml
@ -1,29 +1,69 @@
 language: cpp
 sudo: required
 dist: trusty
 compiler:
  - gcc
  - clang
 addons:
  apt:
    sources:
      # kubuntu-backports contains newer versions of cmake to install
      - kubuntu-backports
    packages:
      - cmake
 env:
  global:
    - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
    - OPENCL_REGISTRY=https://www.khronos.org/registry/cl
    - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
 before_install:
-  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  - cmake --version;
-  - sudo add-apt-repository -y ppa:kalakris/cmake
+  - ${CC} --version;
-  - sudo apt-get update -qq
+  - ${CXX} --version;
-  - sudo apt-get install -qq gcc-4.8 g++-4.8 clang
+
  - sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers
  - sudo apt-get install -qq cmake
 install:
-  - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
+  # The following linux logic is necessary because of Travis's move to the GCE platform, which does not
  # currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
  # We build our own linkable .so file
  - if [ ${TRAVIS_OS_NAME} == "linux" ]; then
      mkdir -p ${OPENCL_ROOT};
      pushd ${OPENCL_ROOT};
      travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
      mv ./OpenCL-ICD-Loader/* .;
      travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
      pushd inc/CL;
      travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
      popd;
      mkdir -p lib;
      pushd lib;
      cmake -G "Unix Makefiles" ..;
      make;
      cp ./bin/libOpenCL.so .;
      popd;
      pushd inc/CL;
      travis_retry git fetch origin opencl12:opencl12;
      git checkout opencl12;
      popd;
      mv inc/ include/;
      popd;
    fi
 before_script:
-  - mkdir install
+  - mkdir -p ${CLBLAST_ROOT}
-  - export PATH=`pwd`/install/bin:${PATH}
+  - pushd ${CLBLAST_ROOT}
-  - export LD_LIBRARY_PATH=`pwd`/install/lib64:`pwd`/install/lib:${LD_LIBRARY_PATH}
+  - cmake -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
-  - mkdir build
+
  - cd build
  - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install ..
 script:
  - make
-  - make install
+
 branches:
  only:
    - master
    - development
 notifications:
  email: false
--- a/17
+++ b/17
@ -1,4 +1,21 @@
 Version 0.6.0
 - Added support for MSVC (Visual Studio) 2015
 - Added tuned parameters for various devices (see README)
 - Now automatically generates C++ code from JSON tuning results
 - Added level-2 routines:
  * SGER/DGER
  * CGERU/ZGERU
  * CGERC/ZGERC
  * CHER/ZHER
  * CHPR/ZHPR
  * CHER2/ZHER2
  * CHPR2/ZHPR2
  * CSYR/ZSYR
  * CSPR/ZSPR
  * CSYR2/ZSYR2
  * CSPR2/ZSPR2
 Version 0.5.0
 - Improved structure and performance of level-2 routines (xSYMV/xHEMV)
 - Reduced compilation time of level-3 OpenCL kernels
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,7 +13,7 @@
 cmake_minimum_required(VERSION 2.8.10)
 project("clblast" C CXX)
 set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 5)
+set(clblast_VERSION_MINOR 6)
 set(clblast_VERSION_PATCH 0)
 # Options and their default values
@ -55,16 +55,21 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
 endif()
 # C++ compiler settings
-set(FLAGS "-O3 -std=c++11")
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(FLAGS "/Ox")
-  set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
+  set(FLAGS "${FLAGS} /wd4715")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
+else ()
-    set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
+  set(FLAGS "-O3 -std=c++11")
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
      set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
    endif()
  elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
    set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
    set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
  endif()
 elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
  set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
  set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
  set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
 endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
@ -102,14 +107,15 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
 # ==================================================================================================
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
-set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
+set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                    xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
-set(PRECISIONS 32 3232 64 6464)
+set(PRECISIONS 32 64 3232 6464)
 # ==================================================================================================
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and might not be mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details (and how to tune the library yourself).
 Why CLBlast and not clBLAS or cuBLAS?
@ -17,6 +17,9 @@ Use CLBlast instead of clBLAS:
 * When you care about achieving maximum performance.
 * When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
 * When you run on exotic OpenCL devices which you need to tune yourself.
 * When you are still running on OpenCL 1.1 hardware.
 * When you value an organized and modern C++ codebase.
 * When you target Intel CPUs and GPUs or embedded devices.
 Use CLBlast instead of cuBLAS:
@ -41,10 +44,13 @@ The pre-requisites for compilation of CLBlast are:
  - Clang 3.3 or newer
  - AppleClang 5.0 or newer
  - ICC 14.0 or newer
  - MSVC (Visual Studio) 2015 or newer
 * An OpenCL 1.1 or newer library, for example:
  - Apple OpenCL
  - NVIDIA CUDA SDK
  - AMD APP SDK
  - Intel OpenCL
  - Beignet
 An example of an out-of-source build (starting from the root of the CLBlast folder):
@ -79,13 +85,27 @@ Using the tuners (optional)
 The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
 * NVIDIA GPUs:
-  - GeForce GTX480
+  - GeForce GTX 480
  - GeForce GTX 680
  - GeForce GTX 750 Ti
  - GeForce GTX 980
  - GeForce GTX Titan
  - GeForce GTX Titan X
  - Tesla K20m
  - Tesla K40m
 * AMD GPUs:
  - Tahiti
  - R9 M370X
 * Intel GPUs:
  - Iris
  - Iris Pro
 * Intel CPUs:
  - Core i5-6200U
  - Core i7-3770K
  - Core i7-5930K
 * Other devices:
  - ARM Mali-T628 GPU
  - Intel MIC
 If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
@ -93,9 +113,19 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
-Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
+Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
-The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the JSON output in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
+The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
 In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
    mkdir build
    cd build
    cmake -DTUNERS=ON ..
    make
    make alltuners
    python ../scripts/database/database.py . ..
    make
 Compiling the tests (optional)
@ -127,10 +157,11 @@ These graphs can be generated automatically on your own device. First, compile C
    Rscript path/to/test/performance/graphs/xgemm.r 0 1
 Supported routines
 -------------
-CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
+CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@ -149,7 +180,6 @@ CLBlast is in active development but already supports the majority of BLAS routi
 | xASUM    |   |   | - | - | +SC +DZ |
 | IxAMAX   |   |   |   |   |         |
 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
 | xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
@ -166,17 +196,17 @@ CLBlast is in active development but already supports the majority of BLAS routi
 | xTRSV    |   |   |   |   |         |
 | xTBSV    |   |   |   |   |         |
 | xTPSV    |   |   |   |   |         |
-| xGER     |   |   | - | - |         |
+| xGER     | ✔ | ✔ | - | - |         |
-| xGERU    | - | - |   |   |         |
+| xGERU    | - | - | ✔ | ✔ |         |
-| xGERC    | - | - |   |   |         |
+| xGERC    | - | - | ✔ | ✔ |         |
-| xHER     | - | - |   |   |         |
+| xHER     | - | - | ✔ | ✔ |         |
-| xHPR     | - | - |   |   |         |
+| xHPR     | - | - | ✔ | ✔ |         |
-| xHER2    | - | - |   |   |         |
+| xHER2    | - | - | ✔ | ✔ |         |
-| xHPR2    | - | - |   |   |         |
+| xHPR2    | - | - | ✔ | ✔ |         |
-| xSYR     |   |   | - | - |         |
+| xSYR     | ✔ | ✔ | - | - |         |
-| xSPR     |   |   | - | - |         |
+| xSPR     | ✔ | ✔ | - | - |         |
-| xSYR2    |   |   | - | - |         |
+| xSYR2    | ✔ | ✔ | - | - |         |
-| xSPR2    |   |   | - | - |         |
+| xSPR2    | ✔ | ✔ | - | - |         |
 | Level-3  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@ -200,6 +230,12 @@ The contributing authors so far are:
 * [Cedric Nugteren](http://www.cedricnugteren.nl)
 Tuning and testing on a variety of OpenCL devices was made possible by:
 * [TU/e ES research group](http://www.es.ele.tue.nl/)
 * [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
 * [Dividiti](http://www.dividiti.com)
 * [SURFsara HPC center](http://www.surfsara.com)
 Support us
 -------------
@ -210,20 +246,8 @@ This project started in March 2015 as an evenings and weekends free-time project
 To-do list before release of version 1.0
 -------------
- Increase the functionality:
+- Support all routines supported by clBLAS
-  * Support all routines supported by clBLAS
+- Allow the user control over events and synchronization
-  * Allow the user control over events and synchronization
+- Add half-precision routines (e.g. HGEMM)
-  * Add half-precision routines (e.g. HGEMM)
+- Enable correctness and performance testing against a CPU-based BLAS library
- Improve host performance:
+- Test in multi-threaded environments
  * Allow initialization to pre-compile kernels and store to disk
 - Improve device performance:
  * Tune for a wider range of devices
  * Allow users to define custom tuned parameters
 - Improve the tuning
  * Make the tuners upload their data to a central server
 - Improve the performance comparisons:
  * Enable comparison against optionally: ViennaCL, cuBLAS, MAGMA OpenCL
 - Further reduce the likelihood of crashes:
  * Add checks for proper command-line arguments in the tuner, tester and client
  * Add checks for valid database parameters
  * Test in multi-threaded environments
--- a/cmake/Modules/FindOpenCL.cmake
+++ b/cmake/Modules/FindOpenCL.cmake
@ -34,6 +34,7 @@ set(OPENCL_HINTS
 set(OPENCL_PATHS
  /usr/local/cuda
  /opt/cuda
  /opt/intel/opencl
  /usr
  /usr/local
 )
@ -52,7 +53,7 @@ mark_as_advanced(OPENCL_INCLUDE_DIRS)
 find_library(OPENCL_LIBRARIES
  NAMES OpenCL
  HINTS ${OPENCL_HINTS}
-  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
+  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
  PATHS ${OPENCL_PATHS}
  DOC "OpenCL library"
 )
--- a/cmake/Modules/FindclBLAS.cmake
+++ b/cmake/Modules/FindclBLAS.cmake
@ -45,7 +45,7 @@ mark_as_advanced(CLBLAS_INCLUDE_DIRS)
 find_library(CLBLAS_LIBRARIES
  NAMES clBLAS
  HINTS ${CLBLAS_HINTS}
-  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
+  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
  PATHS ${CLBLAS_PATHS}
  DOC "clBLAS library"
 )
--- a/doc/performance/Intel_Iris/SAXPY.pdf
+++ b/doc/performance/Intel_Iris/SAXPY.pdf
--- a/doc/performance/Intel_Iris/SGEMM.pdf
+++ b/doc/performance/Intel_Iris/SGEMM.pdf
--- a/doc/performance/Intel_Iris/SGEMV.pdf
+++ b/doc/performance/Intel_Iris/SGEMV.pdf
--- a/doc/performance/Intel_Iris/SSYMM.pdf
+++ b/doc/performance/Intel_Iris/SSYMM.pdf
--- a/doc/performance/Intel_Iris/SSYRK.pdf
+++ b/doc/performance/Intel_Iris/SSYRK.pdf
--- a/doc/performance/Intel_IrisPro/SAXPY.pdf
+++ b/doc/performance/Intel_IrisPro/SAXPY.pdf
--- a/doc/performance/Intel_IrisPro/SGEMM.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMM.pdf
--- a/doc/performance/Intel_IrisPro/SGEMV.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMV.pdf
--- a/doc/performance/Intel_IrisPro/SSYMM.pdf
+++ b/doc/performance/Intel_IrisPro/SSYMM.pdf
--- a/doc/performance/Intel_IrisPro/SSYRK.pdf
+++ b/doc/performance/Intel_IrisPro/SSYRK.pdf
--- a/doc/performance/Radeon_M370X/SAXPY.pdf
+++ b/doc/performance/Radeon_M370X/SAXPY.pdf
--- a/doc/performance/Radeon_M370X/SGEMM.pdf
+++ b/doc/performance/Radeon_M370X/SGEMM.pdf
--- a/doc/performance/Radeon_M370X/SGEMV.pdf
+++ b/doc/performance/Radeon_M370X/SGEMV.pdf
--- a/doc/performance/Radeon_M370X/SSYMM.pdf
+++ b/doc/performance/Radeon_M370X/SSYMM.pdf
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@ -76,7 +76,7 @@ class Event {
  explicit Event(const cl_event event): event_(event) { }
  // Regular constructor
-  explicit Event() { }
+  explicit Event(): event_(nullptr) { }
  // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
  // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
@ -119,6 +119,13 @@ class Platform {
    platform_ = platforms[platform_id];
  }
  // Returns the number of devices on this platform, counting devices of all types
  // ('CL_DEVICE_TYPE_ALL'). Only the count is queried: 'clGetDeviceIDs' is called with zero entries
  // and no output list. The resulting OpenCL status is passed on to 'CheckError'.
  size_t NumDevices() const {
    auto result = cl_uint{0};
    CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
    return static_cast<size_t>(result); // Converts the OpenCL 'cl_uint' count to 'size_t'
  }
  // Accessor to the private data-member
  const cl_platform_id& operator()() const { return platform_; }
 private:
@ -136,11 +143,11 @@ class Device {
  // Initialize the device. Note that this constructor can throw exceptions!
  explicit Device(const Platform &platform, const size_t device_id) {
-    auto num_devices = cl_uint{0};
+    auto num_devices = platform.NumDevices();
    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices));
    if (num_devices == 0) { Error("no devices found"); }
    auto devices = std::vector<cl_device_id>(num_devices);
-    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr));
+    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
                              devices.data(), nullptr));
    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
    device_ = devices[device_id];
  }
@ -172,6 +179,7 @@ class Device {
  size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
  size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
  size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
  size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
  size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
  size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
@ -225,7 +233,7 @@ class Device {
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
-    return std::string{result.c_str()};
+    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
  }
 };
@ -342,7 +350,12 @@ class Queue {
      queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
                                                             delete s; }) {
    auto status = CL_SUCCESS;
-    *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+    #ifdef CL_VERSION_2_0
      cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
      *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
    #else
      *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
    #endif
    CheckError(status);
  }
@ -408,7 +421,7 @@ class BufferHost {
 // =================================================================================================
 // Enumeration of buffer access types
-enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
+enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
 // C++11 version of 'cl_mem'
 template <typename T>
@ -418,13 +431,17 @@ class Buffer {
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Buffer(const cl_mem buffer):
      buffer_(new cl_mem),
-      access_(BufferAccess::kReadWrite) {
+      access_(BufferAccess::kNotOwned) {
    *buffer_ = buffer;
  }
-  // Regular constructor with memory management
+  // Regular constructor with memory management. If this class does not own the buffer object, then
  // the memory will not be freed automatically afterwards.
  explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
-      buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }),
+      buffer_(new cl_mem, [access](cl_mem* m) {
        if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
        delete m;
      }),
      access_(access) {
    auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
    if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
@ -439,57 +456,74 @@ class Buffer {
    Buffer<T>(context, BufferAccess::kReadWrite, size) {
  }
  // Constructs a new buffer based on an existing host-container. The iterator range [start, end)
  // determines the number of elements; a read-write device buffer of that size is created through
  // the delegated constructor and the host data is then copied into it.
  // NOTE(review): the copy takes the address of the first element and writes 'size' consecutive
  // elements from there, so this presumably requires iterators over contiguous storage (e.g.
  // std::vector) and a non-empty range -- confirm against the callers.
  template <typename Iterator>
  explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
    Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
    auto size = static_cast<size_t>(end - start);
    auto pointer = &*start; // Raw pointer to the first host element
    // The write is enqueued as non-blocking ('CL_FALSE'), starting at device offset 0
    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
                                    nullptr, nullptr));
    queue.Finish(); // Called right after the non-blocking enqueue, before the constructor returns
  }
  // Copies from device to host: reading the device buffer asynchronously
-  void ReadAsync(const Queue &queue, const size_t size, T* host) {
+  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
    if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
-    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
+    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
-                                   nullptr, nullptr));
+                                   host, 0, nullptr, nullptr));
  }
-  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host) {
+  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
                 const size_t offset = 0) {
    if (host.size() < size) { Error("target host buffer is too small"); }
-    ReadAsync(queue, size, host.data());
+    ReadAsync(queue, size, host.data(), offset);
  }
-  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host) {
+  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
                 const size_t offset = 0) {
    if (host.size() < size) { Error("target host buffer is too small"); }
-    ReadAsync(queue, size, host.data());
+    ReadAsync(queue, size, host.data(), offset);
  }
  // Copies from device to host: reading the device buffer
-  void Read(const Queue &queue, const size_t size, T* host) {
+  void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
-    ReadAsync(queue, size, host);
+    ReadAsync(queue, size, host, offset);
    queue.Finish();
  }
-  void Read(const Queue &queue, const size_t size, std::vector<T> &host) {
+  void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
-    Read(queue, size, host.data());
+    Read(queue, size, host.data(), offset);
  }
-  void Read(const Queue &queue, const size_t size, BufferHost<T> &host) {
+  void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
-    Read(queue, size, host.data());
+    Read(queue, size, host.data(), offset);
  }
  // Copies from host to device: writing the device buffer asynchronously
-  void WriteAsync(const Queue &queue, const size_t size, const T* host) {
+  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
    if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
-    if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); }
+    if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
-    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
+    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
-                                    nullptr, nullptr));
+                                    host, 0, nullptr, nullptr));
  }
-  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host) {
+  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
-    WriteAsync(queue, size, host.data());
+                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }
-  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host) {
+  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
-    WriteAsync(queue, size, host.data());
+                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }
  // Copies from host to device: writing the device buffer
-  void Write(const Queue &queue, const size_t size, const T* host) {
+  void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
-    WriteAsync(queue, size, host);
+    WriteAsync(queue, size, host, offset);
    queue.Finish();
  }
-  void Write(const Queue &queue, const size_t size, const std::vector<T> &host) {
+  void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
-    Write(queue, size, host.data());
+             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }
-  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host) {
+  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
-    Write(queue, size, host.data());
+             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }
  // Copies the contents of this buffer into another device buffer
@ -573,6 +607,13 @@ class Kernel {
                                      0, nullptr, &(event())));
  }
  // As above, but with the default local workgroup size: a null pointer is passed as the local
  // work size, which leaves the choice of workgroup dimensions to the OpenCL implementation. The
  // number of dimensions is taken from the length of 'global'. No global offset and no wait-list
  // are given (both null); the enqueue call fills in 'event'.
  // NOTE(review): nothing here waits for the kernel to complete, so callers presumably
  // synchronise through 'event' or the queue -- confirm.
  void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                      nullptr, global.data(), nullptr,
                                      0, nullptr, &(event())));
  }
  // Accessor to the private data-member
  const cl_kernel& operator()() const { return *kernel_; }
 private:
--- a/include/internal/database.h
+++ b/include/internal/database.h
@ -56,24 +56,26 @@ class Database {
  static constexpr auto kDeviceTypeAll = "default";
  // The OpenCL device vendors
  static constexpr auto kDeviceVendorNVIDIA = "NVIDIA Corporation";
  static constexpr auto kDeviceVendorAMD = "Advanced Micro Devices, Inc.";
  static constexpr auto kDeviceVendorIntel = "Intel";
  static constexpr auto kDeviceVendorAll = "default";
-  // The OpenCL device names
+  // Alternative names for some OpenCL vendors
-  static constexpr auto kDefaultDevice = "default";
+  const std::unordered_map<std::string,std::string> kVendorNames {
-
+    {"Intel(R) Corporation", "Intel"},
    {"GenuineIntel", "Intel"},
    {"Advanced Micro Devices, Inc.", "AMD"},
    {"NVIDIA Corporation", "NVIDIA"},
  };
  // The database consists of separate database entries, stored together in a vector
  static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
  static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
  static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
  static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
  static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
  static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
  static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
-  static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
+  static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
-  static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
+  static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
  static const std::vector<DatabaseEntry> database;
  // The constructor
--- a/include/internal/database/copy.h
+++ b/include/internal/database/copy.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Copy kernels.
+// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
 //
 // =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
 const Database::DatabaseEntry Database::CopySingle = {
  "Copy", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
-        { "Tesla K20m",       { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+        { "Tahiti",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "Tesla K40m",       { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+        { "Mali-T628",                                       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
+        { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
      }
    },
-    { // Default
+    { // Intel accelerators
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAccelerator, "Intel", {
-        { kDefaultDevice,     { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::CopyDouble = {
  "Copy", Precision::kDouble, {
    { // NVIDIA GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "NVIDIA", {
-        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
-        { "Tesla K20m",       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+        { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
-        { "Tesla K40m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-      }
+        { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-    },
+        { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
-    { // AMD GPUs
+        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
-        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
-      }
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
  }
@ -73,26 +75,100 @@ const Database::DatabaseEntry Database::CopyDouble = {
 const Database::DatabaseEntry Database::CopyComplexSingle = {
  "Copy", Precision::kComplexSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "Tesla K20m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
+        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "Tesla K40m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
-    { // AMD GPUs
+    { // Intel CPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeCPU, "Intel", {
-        { "Tahiti",           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::CopyDouble = {
  "Copy", Precision::kDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
  }
@ -102,25 +178,49 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
 const Database::DatabaseEntry Database::CopyComplexDouble = {
  "Copy", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
        { "Tesla K20m",       { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
        { "Tesla K40m",       { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
        { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
      }
    },
  }
--- a/include/internal/database/pad.h
+++ b/include/internal/database/pad.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Pad kernels.
+// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
 //
 // =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
 const Database::DatabaseEntry Database::PadSingle = {
  "Pad", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
-        { "Tesla K40m",       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Mali-T628",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
-    { // Default
+    { // Intel accelerators
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAccelerator, "Intel", {
-        { kDefaultDevice,     { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::PadDouble = {
  "Pad", Precision::kDouble, {
    { // NVIDIA GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "NVIDIA", {
-        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 480",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
-        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
-        { "Tesla K40m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
-      }
+        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-    },
+        { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-    { // AMD GPUs
+        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Tesla K40m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-      }
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
  }
@ -73,26 +75,108 @@ const Database::DatabaseEntry Database::PadDouble = {
 const Database::DatabaseEntry Database::PadComplexSingle = {
  "Pad", Precision::kComplexSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Tahiti",                                          { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "Tesla K40m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Mali-T628",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
        { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::PadDouble = {
  "Pad", Precision::kDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 680",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
  }
@ -102,25 +186,49 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
 const Database::DatabaseEntry Database::PadComplexDouble = {
  "Pad", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",       { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K40m",       { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tahiti",                                          { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "Tesla K20m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
        { "Tesla K40m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
      }
    },
  }
--- a/include/internal/database/padtranspose.h
+++ b/include/internal/database/padtranspose.h
@ -5,37 +5,67 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
+// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
 //
 // =================================================================================================
 namespace clblast {
 // =================================================================================================
-const Database::DatabaseEntry Database::PadTraSingle = {
+const Database::DatabaseEntry Database::PadtransposeSingle = {
-  "PadTranspose", Precision::kSingle, {
+  "Padtranspose", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
-        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
-        { "Tesla K40m",       { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+        { "Mali-T628",                                       { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+        { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
  }
@ -43,27 +73,58 @@ const Database::DatabaseEntry Database::PadTraSingle = {
 // =================================================================================================
-const Database::DatabaseEntry Database::PadTraDouble = {
+const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
-  "PadTranspose", Precision::kDouble, {
+  "Padtranspose", Precision::kComplexSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
-        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
-        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+        { "Mali-T628",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
        { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
  }
@ -71,28 +132,51 @@ const Database::DatabaseEntry Database::PadTraDouble = {
 // =================================================================================================
-const Database::DatabaseEntry Database::PadTraComplexSingle = {
+const Database::DatabaseEntry Database::PadtransposeDouble = {
-  "PadTranspose", Precision::kComplexSingle, {
+  "Padtranspose", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
-        { "Iris",             { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+        { "Mali-T628",                                       { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
  }
@ -100,27 +184,51 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
 // =================================================================================================
-const Database::DatabaseEntry Database::PadTraComplexDouble = {
+const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
-  "PadTranspose", Precision::kComplexDouble, {
+  "Padtranspose", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
        { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
      }
    },
  }
--- a/include/internal/database/transpose.h
+++ b/include/internal/database/transpose.h
@ -5,37 +5,67 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Transpose kernels.
+// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
 //
 // =================================================================================================
 namespace clblast {
 // =================================================================================================
-const Database::DatabaseEntry Database::TraSingle = {
+const Database::DatabaseEntry Database::TransposeSingle = {
  "Transpose", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
+        { "Mali-T628",                                       { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Tesla K20m",                                      { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Tesla K40m",                                      { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
  }
@ -43,56 +73,52 @@ const Database::DatabaseEntry Database::TraSingle = {
 // =================================================================================================
-const Database::DatabaseEntry Database::TraDouble = {
+const Database::DatabaseEntry Database::TransposeComplexSingle = {
  "Transpose", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
      kDeviceTypeGPU, kDeviceVendorAMD, {
        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
      }
    },
    { // Default
      kDeviceTypeAll, kDeviceVendorAll, {
        { kDefaultDevice,     { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::TraComplexSingle = {
  "Transpose", Precision::kComplexSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
-        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
-        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
+        { "Mali-T628",                                       { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
+        { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
  }
@ -100,27 +126,97 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
 // =================================================================================================
-const Database::DatabaseEntry Database::TraComplexDouble = {
+const Database::DatabaseEntry Database::TransposeDouble = {
-  "Transpose", Precision::kComplexDouble, {
+  "Transpose", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
        { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::TransposeComplexDouble = {
  "Transpose", Precision::kComplexDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
        { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX 980",                                 { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
      }
    },
  }
--- a/include/internal/database/xaxpy.h
+++ b/include/internal/database/xaxpy.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
+// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
 //
 // =================================================================================================
@ -16,26 +16,115 @@ namespace clblast {
 const Database::DatabaseEntry Database::XaxpySingle = {
  "Xaxpy", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",1}, {"VW",2} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "Tesla K20m",       { {"WGS",128}, {"WPT",2}, {"VW",2} } },
+        { "Tahiti",                                          { {"VW",2}, {"WGS",64}, {"WPT",1} } },
-        { "Tesla K40m",       { {"WGS",128}, {"WPT",1}, {"VW",4} } },
+        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+        { "Mali-T628",                                       { {"VW",4}, {"WGS",256}, {"WPT",1} } },
        { "default",                                         { {"VW",4}, {"WGS",256}, {"WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",4}, {"WGS",256}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"WGS",512}, {"WPT",1}, {"VW",1} } },
+        { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
        { "default",                                         { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",4}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 680",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XaxpyComplexSingle = {
  "Xaxpy", Precision::kComplexSingle, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",2}, {"WGS",64}, {"WPT",8} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",256}, {"WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",4}, {"WGS",256}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",256}, {"WPT",1} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
  }
@ -45,53 +134,49 @@ const Database::DatabaseEntry Database::XaxpySingle = {
 const Database::DatabaseEntry Database::XaxpyDouble = {
  "Xaxpy", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",1}, {"VW",1} } },
        { "Tesla K20m",       { {"WGS",512}, {"WPT",1}, {"VW",2} } },
        { "Tesla K40m",       { {"WGS",64}, {"WPT",1}, {"VW",2} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",2}, {"WGS",128}, {"WPT",2} } },
        { "default",                                         { {"VW",2}, {"WGS",128}, {"WPT",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",8}, {"WGS",64}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
        { "default",                                         { {"VW",2}, {"WGS",512}, {"WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XaxpyComplexSingle = {
  "Xaxpy", Precision::kComplexSingle, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS",256}, {"WPT",1}, {"VW",1} } },
        { "Tesla K20m",       { {"WGS",128}, {"WPT",1}, {"VW",1} } },
        { "Tesla K40m",       { {"WGS",128}, {"WPT",2}, {"VW",1} } },
      }
    },
    { // AMD GPUs
      kDeviceTypeGPU, kDeviceVendorAMD, {
        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
        { "Iris",             { {"WGS",256}, {"WPT",1}, {"VW",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, kDeviceVendorAll, {
        { kDefaultDevice,     { {"WGS",128}, {"WPT",1}, {"VW",1} } },
      }
    },
  }
@ -101,25 +186,49 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
 const Database::DatabaseEntry Database::XaxpyComplexDouble = {
  "Xaxpy", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",2}, {"VW",1} } },
        { "Tesla K20m",       { {"WGS",256}, {"WPT",1}, {"VW",1} } },
        { "Tesla K40m",       { {"WGS",64}, {"WPT",2}, {"VW",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS",64}, {"WPT",8} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",8} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",8}, {"WGS",128}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",8}, {"WGS",512}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS",256}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS",64}, {"WPT",1} } },
        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
      }
    },
  }
--- a/include/internal/database/xdot.h
+++ b/include/internal/database/xdot.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Xdot kernels.
+// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
 //
 // =================================================================================================
@ -16,22 +16,115 @@ namespace clblast {
 const Database::DatabaseEntry Database::XdotSingle = {
  "Xdot", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
        { "default",                                         { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"WGS1",512}, {"WGS2",512} } },
+        { "Iris",                                            { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
        { "Iris Pro",                                        { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
        { "default",                                         { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WGS2",64} } },
+        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XdotComplexSingle = {
  "Xdot", Precision::kComplexSingle, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
        { "default",                                         { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Iris",                                            { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "Iris Pro",                                        { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
      }
    },
  }
@ -41,45 +134,49 @@ const Database::DatabaseEntry Database::XdotSingle = {
 const Database::DatabaseEntry Database::XdotDouble = {
  "Xdot", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WGS2",64} } },
+        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XdotComplexSingle = {
  "Xdot", Precision::kComplexSingle, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
      }
    },
    { // AMD GPUs
      kDeviceTypeGPU, kDeviceVendorAMD, {
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
        { "Iris",             { {"WGS1",512}, {"WGS2",512} } },
      }
    },
    { // Default
      kDeviceTypeAll, kDeviceVendorAll, {
        { kDefaultDevice,     { {"WGS1",64}, {"WGS2",64} } },
      }
    },
  }
@ -89,21 +186,49 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
 const Database::DatabaseEntry Database::XdotComplexDouble = {
  "Xdot", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
        { "Tahiti",                                          { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
        { "GeForce GTX 980",                                 { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
        { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
        { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
        { "Tesla K20m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "Tesla K40m",                                      { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WGS2",64} } },
+        { "default",                                         { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
      }
    },
  }
--- a/include/internal/database/xgemm.h
+++ b/include/internal/database/xgemm.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Xgemm kernels.
+// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
 //
 // =================================================================================================
@ -16,56 +16,56 @@ namespace clblast {
 const Database::DatabaseEntry Database::XgemmSingle = {
  "Xgemm", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
-        { "Tesla K20m",       { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
-        { "Tesla K40m",       { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
        { kDefaultDevice,     { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "Mali-T628",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+        { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
        { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
        { "default",                                         { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
      }
    },
-    { // Default
+    { // Intel accelerators
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAccelerator, "Intel", {
-        { kDefaultDevice,     { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XgemmDouble = {
  "Xgemm", Precision::kDouble, {
    { // NVIDIA GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "NVIDIA", {
-        { "GeForce GTX 480",  { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
-        { "Tesla K20m",       { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
-        { "Tesla K40m",       { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
-        { kDefaultDevice,     { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
-      }
+        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
-    },
+        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
-    { // AMD GPUs
+        { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
-        { "Tahiti",           { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
      }
    },
  }
@ -75,27 +75,108 @@ const Database::DatabaseEntry Database::XgemmDouble = {
 const Database::DatabaseEntry Database::XgemmComplexSingle = {
  "Xgemm", Precision::kComplexSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
-        { "Tesla K20m",       { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+        { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
-        { "Tesla K40m",       { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
        { kDefaultDevice,     { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
      }
    },
-    { // AMD GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "ARM", {
-        { "Tahiti",           { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+        { "Mali-T628",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
        { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
        { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XgemmDouble = {
  "Xgemm", Precision::kDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
        { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "default",                                         { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
        { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
        { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
        { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K40m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
  }
@ -105,29 +186,52 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
 const Database::DatabaseEntry Database::XgemmComplexDouble = {
  "Xgemm", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
        { "Tesla K20m",       { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
        { "Tesla K40m",       { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
        { kDefaultDevice,     { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
        { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
      }
    },
-    { // Intel GPUs
+    { // ARM GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
        { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
        { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
        { "default",                                         { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+        { "default",                                         { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
      }
    },
  }
 };
 // =================================================================================================
 } // namespace clblast
--- a/include/internal/database/xgemv.h
+++ b/include/internal/database/xgemv.h
@ -5,9 +5,9 @@
 // width of 100 characters per line.
 //
 // Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
+//   Database generator <database.py>
 //
-// This file populates the database with best-found tuning parameters for the Xgemv kernels.
+// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
 //
 // =================================================================================================
@ -16,26 +16,97 @@ namespace clblast {
 const Database::DatabaseEntry Database::XgemvSingle = {
  "Xgemv", Precision::kSingle, {
-    { // NVIDIA GPUs
+    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
+      kDeviceTypeGPU, "AMD", {
-        { "GeForce GTX 480",  { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Tesla K20m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Tesla K40m",       { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
-    { // AMD GPUs
+    { // Intel CPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeCPU, "Intel", {
-        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Intel GPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeGPU, "Intel", {
-        { "Iris",             { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
+        { "Iris",                                            { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
        { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
        { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
        { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "Tesla K20m",                                      { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XgemvComplexSingle = {
  "Xgemv", Precision::kComplexSingle, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Iris",                                            { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        // NOTE(review): the generated rows for the next two devices carried only a subset of the
        // eight Xgemv parameters (five for the 750 Ti, two for the TITAN). The absent ones are
        // filled in with this group's 'default' values so that every row defines the same keys.
        // Presumably a lookup of an absent key is not handled by the database class -- confirm.
        // TODO: fix the incomplete results at the source (database.py), this file is generated.
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
  }
@ -45,53 +116,42 @@ const Database::DatabaseEntry Database::XgemvSingle = {
 const Database::DatabaseEntry Database::XgemvDouble = {
  "Xgemv", Precision::kDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
-    { // Intel GPUs
+    { // Intel CPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
        { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
        { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "Tesla K20m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
  }
 };
 // =================================================================================================
 const Database::DatabaseEntry Database::XgemvComplexSingle = {
  "Xgemv", Precision::kComplexSingle, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    { // AMD GPUs
      kDeviceTypeGPU, kDeviceVendorAMD, {
        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, kDeviceVendorIntel, {
        { "Iris",             { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, kDeviceVendorAll, {
        { kDefaultDevice,     { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
  }
@ -101,25 +161,35 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
 const Database::DatabaseEntry Database::XgemvComplexDouble = {
  "Xgemv", Precision::kComplexDouble, {
    { // NVIDIA GPUs
      kDeviceTypeGPU, kDeviceVendorNVIDIA, {
        { "GeForce GTX 480",  { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m",       { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    { // AMD GPUs
-      kDeviceTypeGPU, kDeviceVendorAMD, {
+      kDeviceTypeGPU, "AMD", {
-        { "Tahiti",           { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
-    { // Intel GPUs
+    { // Intel CPUs
-      kDeviceTypeGPU, kDeviceVendorIntel, {
+      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
      }
    },
    { // Intel accelerators
      kDeviceTypeAccelerator, "Intel", {
        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Default
-      kDeviceTypeAll, kDeviceVendorAll, {
+      kDeviceTypeAll, "default", {
-        { kDefaultDevice,     { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
  }
--- a/include/internal/database/xger.h
+++ b/include/internal/database/xger.h
@ -0,0 +1,188 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Database generator <database.py>
 //
 // This file populates the database with best-found tuning parameters for the 'Xger' kernels.
 //
 // =================================================================================================
 namespace clblast {
 // =================================================================================================
 // Tuning parameters for the 'Xger' kernels at single precision (Precision::kSingle). Rows are
 // grouped per device type and vendor string; each group lists named devices followed by a
 // "default" row, and the last group (kDeviceTypeAll, "default") holds the overall defaults.
 // NOTE(review): WGS1/WGS2/WPT are presumably work-group sizes and work-per-thread -- confirm
 // against the kernel. The file header names database.py as author, so avoid hand-edited values.
 const Database::DatabaseEntry Database::XgerSingle = {
  "Xger", Precision::kSingle, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
        { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
        { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 // Tuning parameters for the 'Xger' kernels at complex single precision
 // (Precision::kComplexSingle). Rows are grouped per device type and vendor string; each group
 // lists named devices followed by a "default" row, and the last group (kDeviceTypeAll,
 // "default") holds the overall defaults.
 // NOTE(review): WGS1/WGS2/WPT are presumably work-group sizes and work-per-thread -- confirm.
 const Database::DatabaseEntry Database::XgerComplexSingle = {
  "Xger", Precision::kComplexSingle, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
      }
    },
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
        { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 // Tuning parameters for the 'Xger' kernels at double precision (Precision::kDouble). Rows are
 // grouped per device type and vendor string; each group lists named devices followed by a
 // "default" row, and the last group (kDeviceTypeAll, "default") holds the overall defaults.
 // Unlike the single-precision tables, this one has no "Intel GPUs" group.
 // NOTE(review): WGS1/WGS2/WPT are presumably work-group sizes and work-per-thread -- confirm.
 const Database::DatabaseEntry Database::XgerDouble = {
  "Xger", Precision::kDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
        { "default",                                         { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
        { "default",                                         { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
        { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
        { "default",                                         { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 // Tuning parameters for the 'Xger' kernels at complex double precision
 // (Precision::kComplexDouble). Rows are grouped per device type and vendor string; each group
 // lists named devices followed by a "default" row, and the last group (kDeviceTypeAll,
 // "default") holds the overall defaults. This table has no "Intel GPUs" group.
 // NOTE(review): WGS1/WGS2/WPT are presumably work-group sizes and work-per-thread -- confirm.
 const Database::DatabaseEntry Database::XgerComplexDouble = {
  "Xger", Precision::kComplexDouble, {
    { // AMD GPUs
      kDeviceTypeGPU, "AMD", {
        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
        { "Tahiti",                                          { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
      }
    },
    { // ARM GPUs
      kDeviceTypeGPU, "ARM", {
        { "Mali-T628",                                       { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
      }
    },
    { // Intel CPUs
      kDeviceTypeCPU, "Intel", {
        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
        { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
        { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
        { "default",                                         { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
      }
    },
  }
 };
 // =================================================================================================
 } // namespace clblast
--- a/include/internal/routines/level2/xger.h
+++ b/include/internal/routines/level2/xger.h
@ -0,0 +1,58 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xger routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XGER_H_
 #define CLBLAST_ROUTINES_XGER_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // Declaration of the Xger routine class; the precision follows from the template argument T
 template <typename T>
 class Xger: public Routine<T> {
 public:
  // Routine<T> is a dependent base class, so the names used from it are brought into scope here
  using Routine<T>::ErrorIn;
  using Routine<T>::RunKernel;
  using Routine<T>::GetProgramFromCache;
  using Routine<T>::TestMatrixA;
  using Routine<T>::TestVectorX;
  using Routine<T>::TestVectorY;
  using Routine<T>::queue_;
  using Routine<T>::source_string_;
  using Routine<T>::db_;
  // Constructor; the routine name is optional and falls back to "GER"
  Xger(Queue &queue, Event &event, const std::string &name = "GER");
  // The routine itself, with its precision determined by T: takes the layout, the sizes m and n,
  // the scalar alpha, and a buffer/offset/increment (or leading dimension) triple for x, y and a
  StatusCode DoGer(const Layout layout, const size_t m, const size_t n, const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 private:
  // Class-wide constant holding the precision
  static const Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XGER_H_
 #endif
--- a/include/internal/routines/level2/xgerc.h
+++ b/include/internal/routines/level2/xgerc.h
@ -0,0 +1,46 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xgerc routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XGERC_H_
 #define CLBLAST_ROUTINES_XGERC_H_
 #include "internal/routines/level2/xger.h"
 namespace clblast {
 // =================================================================================================
 // Declaration of the Xgerc routine class, derived from Xger<T>; precision follows from T
 template <typename T>
 class Xgerc: public Xger<T> {
 public:
  // Xger<T> is a dependent base class: make its DoGer visible so it can be called from here
  using Xger<T>::DoGer;
  // Constructor; the routine name is optional and falls back to "GERC"
  Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
  // The routine itself, with its precision determined by T (same argument list as DoGer)
  StatusCode DoGerc(const Layout layout, const size_t m, const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XGERC_H_
 #endif
--- a/include/internal/routines/level2/xgeru.h
+++ b/include/internal/routines/level2/xgeru.h
@ -0,0 +1,46 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xgeru routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XGERU_H_
 #define CLBLAST_ROUTINES_XGERU_H_
 #include "internal/routines/level2/xger.h"
 namespace clblast {
 // =================================================================================================
 // Declaration of the Xgeru routine class, derived from Xger<T>; precision follows from T
 template <typename T>
 class Xgeru: public Xger<T> {
 public:
  // Xger<T> is a dependent base class: make its DoGer visible so it can be called from here
  using Xger<T>::DoGer;
  // Constructor; the routine name is optional and falls back to "GERU"
  Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
  // The routine itself, with its precision determined by T (same argument list as DoGer)
  StatusCode DoGeru(const Layout layout, const size_t m, const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XGERU_H_
 #endif
--- a/include/internal/routines/level2/xher.h
+++ b/include/internal/routines/level2/xher.h
@ -0,0 +1,61 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHER_H_
 #define CLBLAST_ROUTINES_XHER_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the HER routine on top of Routine<T>. T is the element type of the x and a buffers; U is
 // the type in which the caller supplies alpha (GetAlpha converts a U into a T). This class is also
 // the base of Xhpr, Xspr and Xsyr.
 template <typename T, typename U>
 class Xher: public Routine<T> {
 public:
  // Members and methods from the base class. The using-declarations are needed because Routine<T>
  // is a dependent base class, so these names are not found by unqualified lookup.
  using Routine<T>::db_;
  using Routine<T>::source_string_;
  using Routine<T>::queue_;
  using Routine<T>::GetProgramFromCache;
  using Routine<T>::TestVectorX;
  using Routine<T>::TestMatrixA;
  using Routine<T>::TestMatrixAP;
  using Routine<T>::RunKernel;
  using Routine<T>::ErrorIn;
  // Constructor: takes the queue and event to work with; 'name' defaults to "HER"
  Xher(Queue &queue, Event &event, const std::string &name = "HER");
  // Translates alpha of type 'U' into type 'T'
  T GetAlpha(const U alpha);
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha (type U), the vector x (offset, increment) and the matrix a
  // (offset, leading dimension). NOTE(review): 'packed' (default false) presumably selects the
  // packed-storage variant used by the HPR/SPR subclasses -- confirm in the implementation.
  StatusCode DoHer(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const U alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                   const bool packed = false);
 private:
  // Static variable to get the precision
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHER_H_
 #endif
--- a/include/internal/routines/level2/xher2.h
+++ b/include/internal/routines/level2/xher2.h
@ -0,0 +1,60 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher2 routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHER2_H_
 #define CLBLAST_ROUTINES_XHER2_H_
 #include "internal/routine.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the HER2 routine on top of Routine<T>. T is the element type of the x, y and a buffers
 // and of alpha. This class is also the base of Xhpr2, Xspr2 and Xsyr2.
 template <typename T>
 class Xher2: public Routine<T> {
 public:
  // Members and methods from the base class. The using-declarations are needed because Routine<T>
  // is a dependent base class, so these names are not found by unqualified lookup.
  using Routine<T>::db_;
  using Routine<T>::source_string_;
  using Routine<T>::queue_;
  using Routine<T>::GetProgramFromCache;
  using Routine<T>::TestVectorX;
  using Routine<T>::TestVectorY;
  using Routine<T>::TestMatrixA;
  using Routine<T>::TestMatrixAP;
  using Routine<T>::RunKernel;
  using Routine<T>::ErrorIn;
  // Constructor: takes the queue and event to work with; 'name' defaults to "HER2"
  Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vectors x and y (offset, increment) and the matrix a (offset,
  // leading dimension). NOTE(review): 'packed' (default false) presumably selects the
  // packed-storage variant used by the HPR2/SPR2 subclasses -- confirm in the implementation.
  StatusCode DoHer2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const bool packed = false);
 private:
  // Static variable to get the precision
  const static Precision precision_;
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHER2_H_
 #endif
--- a/include/internal/routines/level2/xhpr.h
+++ b/include/internal/routines/level2/xhpr.h
@ -0,0 +1,45 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhpr routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHPR_H_
 #define CLBLAST_ROUTINES_XHPR_H_
 #include "internal/routines/level2/xher.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the HPR routine as a thin class on top of Xher<T,U>. T is the buffer element type and U
 // the type of alpha. Unlike DoHer, DoHpr takes an 'ap' buffer with an offset only (no leading
 // dimension). NOTE(review): DoHpr presumably forwards to DoHer with packed = true; its definition
 // is not part of this header -- confirm in the implementation file.
 template <typename T, typename U>
 class Xhpr: public Xher<T,U> {
 public:
  // Uses the regular Xher routine. The using-declaration is needed because Xher<T,U> is a
  // dependent base class, so DoHer would otherwise not be found by unqualified lookup.
  using Xher<T,U>::DoHer;
  // Constructor: takes the queue and event to work with; 'name' defaults to "HPR"
  Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha (type U), the vector x (offset, increment) and the ap buffer
  // (offset). Returns a StatusCode.
  StatusCode DoHpr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const U alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &ap_buffer, const size_t ap_offset);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHPR_H_
 #endif
--- a/include/internal/routines/level2/xhpr2.h
+++ b/include/internal/routines/level2/xhpr2.h
@ -0,0 +1,46 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhpr2 routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XHPR2_H_
 #define CLBLAST_ROUTINES_XHPR2_H_
 #include "internal/routines/level2/xher2.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the HPR2 routine as a thin class on top of Xher2<T>. Unlike DoHer2, DoHpr2 takes an
 // 'ap' buffer with an offset only (no leading dimension). NOTE(review): DoHpr2 presumably forwards
 // to DoHer2 with packed = true; its definition is not part of this header -- confirm in the
 // implementation file.
 template <typename T>
 class Xhpr2: public Xher2<T> {
 public:
  // Uses the regular Xher2 routine. The using-declaration is needed because Xher2<T> is a
  // dependent base class, so DoHer2 would otherwise not be found by unqualified lookup.
  using Xher2<T>::DoHer2;
  // Constructor: takes the queue and event to work with; 'name' defaults to "HPR2"
  Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vectors x and y (offset, increment) and the ap buffer
  // (offset). Returns a StatusCode.
  StatusCode DoHpr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &ap_buffer, const size_t ap_offset);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XHPR2_H_
 #endif
--- a/include/internal/routines/level2/xspr.h
+++ b/include/internal/routines/level2/xspr.h
@ -0,0 +1,45 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xspr routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSPR_H_
 #define CLBLAST_ROUTINES_XSPR_H_
 #include "internal/routines/level2/xher.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the SPR routine as a thin class on top of Xher<T,T>, i.e. the Xher base is instantiated
 // with alpha having the same type as the buffer elements. DoSpr takes an 'ap' buffer with an offset
 // only (no leading dimension). NOTE(review): DoSpr presumably forwards to DoHer with
 // packed = true; its definition is not part of this header -- confirm in the implementation file.
 template <typename T>
 class Xspr: public Xher<T,T> {
 public:
  // Uses the regular Xher routine. The using-declaration is needed because Xher<T,T> is a
  // dependent base class, so DoHer would otherwise not be found by unqualified lookup.
  using Xher<T,T>::DoHer;
  // Constructor: takes the queue and event to work with; 'name' defaults to "SPR"
  Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vector x (offset, increment) and the ap buffer (offset).
  // Returns a StatusCode.
  StatusCode DoSpr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &ap_buffer, const size_t ap_offset);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSPR_H_
 #endif
--- a/include/internal/routines/level2/xspr2.h
+++ b/include/internal/routines/level2/xspr2.h
@ -0,0 +1,46 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xspr2 routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSPR2_H_
 #define CLBLAST_ROUTINES_XSPR2_H_
 #include "internal/routines/level2/xher2.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the SPR2 routine as a thin class on top of Xher2<T>. DoSpr2 takes an 'ap' buffer with
 // an offset only (no leading dimension). NOTE(review): DoSpr2 presumably forwards to DoHer2 with
 // packed = true; its definition is not part of this header -- confirm in the implementation file.
 template <typename T>
 class Xspr2: public Xher2<T> {
 public:
  // Uses the regular Xher2 routine. The using-declaration is needed because Xher2<T> is a
  // dependent base class, so DoHer2 would otherwise not be found by unqualified lookup.
  using Xher2<T>::DoHer2;
  // Constructor: takes the queue and event to work with; 'name' defaults to "SPR2"
  Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vectors x and y (offset, increment) and the ap buffer
  // (offset). Returns a StatusCode.
  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &ap_buffer, const size_t ap_offset);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSPR2_H_
 #endif
--- a/include/internal/routines/level2/xsyr.h
+++ b/include/internal/routines/level2/xsyr.h
@ -0,0 +1,45 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSYR_H_
 #define CLBLAST_ROUTINES_XSYR_H_
 #include "internal/routines/level2/xher.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the SYR routine as a thin class on top of Xher<T,T>, i.e. the Xher base is instantiated
 // with alpha having the same type as the buffer elements. NOTE(review): DoSyr presumably forwards
 // to DoHer; its definition is not part of this header -- confirm in the implementation file.
 template <typename T>
 class Xsyr: public Xher<T,T> {
 public:
  // Uses the regular Xher routine. The using-declaration is needed because Xher<T,T> is a
  // dependent base class, so DoHer would otherwise not be found by unqualified lookup.
  using Xher<T,T>::DoHer;
  // Constructor: takes the queue and event to work with; 'name' defaults to "SYR"
  Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vector x (offset, increment) and the matrix a (offset,
  // leading dimension). Returns a StatusCode.
  StatusCode DoSyr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSYR_H_
 #endif
--- a/include/internal/routines/level2/xsyr2.h
+++ b/include/internal/routines/level2/xsyr2.h
@ -0,0 +1,46 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr2 routine. The precision is implemented using a template argument.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XSYR2_H_
 #define CLBLAST_ROUTINES_XSYR2_H_
 #include "internal/routines/level2/xher2.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Declares the SYR2 routine as a thin class on top of Xher2<T>. NOTE(review): DoSyr2 presumably
 // forwards to DoHer2; its definition is not part of this header -- confirm in the implementation
 // file.
 template <typename T>
 class Xsyr2: public Xher2<T> {
 public:
  // Uses the regular Xher2 routine. The using-declaration is needed because Xher2<T> is a
  // dependent base class, so DoHer2 would otherwise not be found by unqualified lookup.
  using Xher2<T>::DoHer2;
  // Constructor: takes the queue and event to work with; 'name' defaults to "SYR2"
  Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
  // Templated-precision implementation of the routine. Arguments: layout and triangle selectors,
  // the size n, the scalar alpha, the vectors x and y (offset, increment) and the matrix a (offset,
  // leading dimension). Returns a StatusCode.
  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XSYR2_H_
 #endif
--- a/include/internal/tuning.h
+++ b/include/internal/tuning.h
@ -127,9 +127,11 @@ void Tuner(int argc, char* argv[]) {
    {"precision", precision_string}
  };
  for (auto &o: C::GetOptions()) {
-    if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
+    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
-    if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
+    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
-    if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
    if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
    if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
  }
  tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
 }
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@ -125,7 +125,7 @@ struct Arguments {
  // Tuner-specific arguments
  double fraction = 1.0;
  // Client-specific arguments
-  bool compare_clblas = 1;
+  int compare_clblas = 1;
  size_t step = 1;
  size_t num_steps = 0;
  size_t num_runs = 10;
@ -171,7 +171,8 @@ T GetArgument(const int argc, char *argv[], std::string &help,
              const std::string &option, const T default_value);
 // Returns the precision only
-Precision GetPrecision(const int argc, char *argv[]);
+Precision GetPrecision(const int argc, char *argv[],
                       const Precision default_precision = Precision::kSingle);
 // As in "GetArgument", but now only checks whether an argument is given or not
 bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@ -15,12 +15,36 @@ import os.path
 import glob
 import re
 import json
 try:
 	from urllib.request import urlopen # Python 3
 except ImportError:
 	from urllib2 import urlopen # Python 2
 # Additional modules
 import pandas as pd
 # Server storing a copy of the database
 DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db"
 # Constants
-ATTRIBUTES = ["device", "type", "vendor", "precision", "kernel_family", "arg_m", "arg_n", "arg_k"]
+VENDOR_DEFAULT = "default"
 DEVICETYPE_DEFAULT = "All"
 DEVICENAME_DEFAULT = "default"
 # Attributes
 DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
 DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
 KERNEL_ATTRIBUTES = ["precision", "kernel_family",
                     "arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
 ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
 # OpenCL vendor names and their short name
 # Nested mapping in the {column: {old_value: new_value}} form accepted by DataFrame.replace: only
 # values in the "device_vendor" column are rewritten. It is passed to df.replace() by
 # SanitizeVendorNames.
 VENDOR_NAMES = { "device_vendor": {
  "GenuineIntel": "Intel",
  "Intel(R) Corporation": "Intel",
  "Advanced Micro Devices, Inc.": "AMD",
  "NVIDIA Corporation": "NVIDIA",
 }}
 # Pandas options
 pd.set_option('display.width', 1000)
@ -29,6 +53,14 @@ pd.set_option('display.width', 1000)
 # Database operations
 # ==================================================================================================
 # Downloads the database and saves it to disk
 def DownloadDatabase(filename):
 	# Fetches the database stored at DATABASE_SERVER_URL and writes the raw bytes to 'filename'.
 	#   filename: path of the local file to create or overwrite
 	# Errors from urlopen/read/open/write propagate to the caller.
 	print("## Downloading database from '"+DATABASE_SERVER_URL+"'...")
 	response = urlopen(DATABASE_SERVER_URL)
 	# The complete payload is read before the output file is opened, so that a failed download
 	# does not leave an empty file behind (the caller only checks whether the file exists).
 	# try/finally rather than 'with': the Python 2 urllib2 response is not a context manager.
 	try:
 		contents = response.read()
 	finally:
 		response.close()
 	# Writes to the path given by the caller rather than to the module-level 'file_db'
 	with open(filename, 'wb') as output:
 		output.write(contents)
 # Loads the database from disk
 def LoadDatabase(filename):
 	# Deserialises the pandas object stored at 'filename' with pd.read_pickle and returns it.
 	# NOTE(review): unpickling can execute arbitrary code, and this script fills the file from
 	# DATABASE_SERVER_URL (plain http) when no local copy exists -- only load trusted files.
 	return pd.read_pickle(filename)
@ -60,15 +92,58 @@ def ConcatenateData(df1, df2):
 def RemoveDuplicates(df):
 	# Returns a dataframe in which each distinct row of 'df' occurs once (first occurrence kept,
 	# original index labels retained); 'df' itself is left untouched
 	unique_rows = df.drop_duplicates()
 	return unique_rows
-# Bests
+def RemoveEntriesByDevice(df, devicename):
 	# Returns the rows of 'df' whose "device" column differs from 'devicename', i.e. drops every
 	# entry recorded for that device. The selection yields a new object; 'df' is not modified.
 	return df[df["device"] != devicename]
 def GetEntriesByField(df, field, value):
 	# Builds a boolean mask over the 'field' column and returns only the rows where that column
 	# equals 'value'
 	matches = (df[field] == value)
 	return df[matches]
 # Fixes the problem that some vendors use multiple different names
 def SanitizeVendorNames(df):
 	# Maps the vendor spellings listed in VENDOR_NAMES onto their short names and returns the
 	# resulting dataframe (the replacement is not done in place)
 	return df.replace(VENDOR_NAMES)
 # Retrieves the results with the lowest execution times
 def GetBestResults(df):
 	# Returns a new dataframe holding, for every combination of ATTRIBUTES plus "kernel" present
 	# in 'df', a single row: the one with the smallest "time" value.
 	dfbest = pd.DataFrame()
 	grouped = df.groupby(ATTRIBUTES+["kernel"])
 	for name, dfgroup in grouped:
-		bestcase = dfgroup.loc[[dfgroup["time"].idxmin()]]
+		besttime = dfgroup["time"].min()
-		dfbest = ConcatenateData(dfbest, bestcase)
 		# Several rows can share the minimum time; .iloc[0] keeps the first of them
+		bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0]
 		# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas
 		dfbest = dfbest.append(bestcase, ignore_index=True)
 	return dfbest
 # Sets defaults for devices of the same type/vendor based on the smallest values of all known
 # entries. The average might be better for performance but some parameters might not be supported
 # on other devices.
 def CalculateDefaults(df):
 	# Builds a dataframe that contains only synthetic "default" entries derived from 'df':
 	# first one entry per device type/vendor and kernel configuration, then one fully generic
 	# entry per kernel configuration. Every entry is the column-wise minimum of its group.

 	# Overwrites the device-specific fields, which carry no meaning for a default entry
 	def reset_device_fields(row):
 		row["device"] = DEVICENAME_DEFAULT
 		row["device_compute_units"] = 0
 		row["device_core_clock"] = 0
 		row["time"] = 0.0
 		return row

 	result = pd.DataFrame()

 	# Pass 1: defaults per type/vendor (vendor and type are kept as found in the group)
 	per_type_groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
 	for _, members in per_type_groups:
 		row = reset_device_fields(members.min(axis=0))
 		result = result.append(row, ignore_index=True)

 	# Pass 2: defaults in general (vendor and type are replaced by their default markers)
 	general_groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
 	for _, members in general_groups:
 		row = members.min(axis=0)
 		row["device_vendor"] = VENDOR_DEFAULT
 		row["device_type"] = DEVICETYPE_DEFAULT
 		row = reset_device_fields(row)
 		result = result.append(row, ignore_index=True)

 	# Only the two kinds of defaults are returned, not the original entries
 	return result
 # ==================================================================================================
 # C++ header generation
 # ==================================================================================================
@ -110,27 +185,28 @@ def GetPrecision(family, precision):
 # The C++ device type and vendor
 def GetDeviceVendor(vendor, devtype):
 	# Returns the C++ text that opens a device-type/vendor section of the generated database:
 	# a comment line followed by "kDeviceType<Type>, "<vendor>", {".
-	return("    { // %s %ss\n      kDeviceType%s, kDeviceVendor%s, {\n"
 	# The generic fallback entry gets a plain "Default" comment and uses devtype unchanged
+	if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT:
-	       % (vendor, devtype, devtype, vendor))
+		return("    { // Default\n      kDeviceType%s, \"%s\", {\n" % (devtype, vendor))
 	# Otherwise only the first character of devtype is upper-cased for the kDeviceType suffix
 	return("    { // %s %ss\n      kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor))
 # Prints the data to a C++ database
-def PrintData(df):
+def PrintData(df, outputdir):
 	# Iterates over the kernel families: creates a new file per family
 	for family, dffamily in df.groupby(["kernel_family"]):
 		dffamily = dffamily.dropna(axis=1, how='all')
-		f = open(family+'.h', 'w+')
+		f = open(os.path.join(outputdir, family+'.h'), 'w+')
 		f.write(GetHeader(family))
 		# Loops over the different entries for this family and prints their headers
 		for precision, dfprecision in dffamily.groupby(["precision"]):
 			f.write(GetPrecision(family, precision))
-			for vendor, dfvendor in dfprecision.groupby(["vendor"]):
+			for vendor, dfvendor in dfprecision.groupby(["device_vendor"]):
-				for devtype, dfdevtype in dfvendor.groupby(["type"]):
+				for devtype, dfdevtype in dfvendor.groupby(["device_type"]):
 					f.write(GetDeviceVendor(vendor, devtype))
 					for device, dfdevice in dfdevtype.groupby(["device"]):
 						devicename = "\"%s\"," % device
-						f.write("        { %-20s { " % devicename)
+						f.write("        { %-50s { " % devicename)
 						# Collects the parameters for this case and prints them
 						parameters = []
@ -152,57 +228,70 @@ def PrintData(df):
 # Checks for the number of command-line arguments
 # NOTE(review): sys.exit() without an argument terminates with exit status 0, so a calling
 # script cannot distinguish the error paths below from a successful run
 if len(sys.argv) != 3:
-	print "[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>"
+	print("[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>")
 	sys.exit()
 # Parses the command-line arguments
 path_json = sys.argv[1]
 path_clblast = sys.argv[2]
-file_db = path_clblast+"/src/database.db"
+file_db = os.path.join(path_clblast, "scripts", "database", "database.db")
-glob_json = path_json+"/*.json"
+glob_json = os.path.join(path_json, "*.json")
 # Checks whether the command-line arguments are valid; exits otherwise
-clblast_h = path_clblast+"/include/clblast.h" # Not used but just for validation
+clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation
 if not os.path.isfile(clblast_h):
-	print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library"
+	print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
 	sys.exit()
 # Without any JSON file the script stops here, before the database is downloaded or loaded
 if len(glob.glob(glob_json)) < 1:
-	print "[ERROR] The path '"+path_json+"' does not contain any JSON files"
+	print("## The path '"+path_json+"' does not contain any JSON files")
 	sys.exit()
 # ==================================================================================================
 # The main body of the script
 # ==================================================================================================
-# Loads the database if it exists. If not, a new database is initialized
+# Downloads the database if a local copy is not present
 # Only the existence of the file is checked, not its contents
 db_exists = os.path.isfile(file_db)
-database = LoadDatabase(file_db) if db_exists else pd.DataFrame()
+if not db_exists:
 	DownloadDatabase(file_db)
 # Loads the database from disk
 print("## Loading the database from disk...")
 database = LoadDatabase(file_db)
 # Loops over all JSON files in the supplied folder
 for file_json in glob.glob(glob_json):
 	# Loads the newly imported data
-	print "## Processing '"+file_json+"'",
 	# sys.stdout.write emits no newline, so the item count printed below ends up on the same line
+	sys.stdout.write("## Processing '"+file_json+"' ")
 	imported_data = ImportDataFromFile(file_json)
 	imported_data = SanitizeVendorNames(imported_data)
 	# Adds the new data to the database
 	old_size = len(database.index)
 	database = ConcatenateData(database, imported_data)
 	database = RemoveDuplicates(database)
 	# The difference in row count is the number of entries that were not yet in the database
 	new_size = len(database.index)
-	print "with "+str(new_size-old_size)+" new items"
+	print("with "+str(new_size-old_size)+" new items")
-# Stores the new database back to disk
+
-SaveDatabase(database, file_db)
+# Stores the modified database back to disk
 # NOTE(review): the script already exits during argument validation when no JSON files are
 # found, so this condition appears to be always true at this point
 if len(glob.glob(glob_json)) >= 1:
 	print("## Storing the database to disk...")
 	SaveDatabase(database, file_db)
 # Retrieves the best performing results
 print("## Calculating the best results per device/kernel...")
 bests = GetBestResults(database)
-# TODO: Determines the defaults for other vendors and per vendor
+# Determines the defaults for other vendors and per vendor
-#defaults = CalculateDefaults(bests)
+defaults = CalculateDefaults(bests)
-#bests = ConcatenateData(bests, defaults)
+bests = ConcatenateData(bests, defaults)
 # Outputs the data as a C++ database
-PrintData(bests)
 # The generated headers are written into the CLBlast source tree (one file per kernel family)
+path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
 print("## Producing a C++ database in '"+path_cpp_database+"'...")
 PrintData(bests, path_cpp_database)
 print("## All done")
 # ==================================================================================================
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -78,17 +78,17 @@ routines = [
  Routine(False, "2a", "tbsv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
  Routine(False, "2a", "tpsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
  # Level 2: matrix update
-  Routine(False, "2b", "ger",   T,  [S,D],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
+  Routine(True,  "2b", "ger",   T,  [S,D],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
-  Routine(False, "2b", "geru",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
+  Routine(True,  "2b", "geru",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
-  Routine(False, "2b", "gerc",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
+  Routine(True,  "2b", "gerc",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
-  Routine(False, "2b", "her",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
+  Routine(True,  "2b", "her",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
-  Routine(False, "2b", "hpr",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
+  Routine(True,  "2b", "hpr",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
-  Routine(False, "2b", "her2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
+  Routine(True,  "2b", "her2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
-  Routine(False, "2b", "hpr2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
+  Routine(True,  "2b", "hpr2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
-  Routine(False, "2b", "syr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
+  Routine(True,  "2b", "syr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
-  Routine(False, "2b", "spr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
+  Routine(True,  "2b", "spr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
-  Routine(False, "2b", "syr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
+  Routine(True,  "2b", "syr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
-  Routine(False, "2b", "spr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
+  Routine(True,  "2b", "spr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
 ],
 [ # Level 3: matrix-matrix
  Routine(True,  "3", "gemm",  T,  [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
@ -103,7 +103,17 @@ routines = [
 ]]
 # ==================================================================================================
 # Translates an option name to a CLBlast data-type
 def PrecisionToFullName(x):
 	# Lookup table from the single-letter precision flag to the name that the generator appends
 	# to "clblast::Precision::k" when emitting C++ code
 	full_names = dict(H="Half", S="Single", D="Double", C="ComplexSingle", Z="ComplexDouble")
 	# An unknown letter raises KeyError, exactly like indexing a dict literal would
 	return full_names[x]
 # ==================================================================================================
 # Separators for the BLAS levels
 separators = ["""
 // =================================================================================================
@ -237,7 +247,7 @@ files = [
  path_clblast+"/src/clblast_c.cc",
  path_clblast+"/test/wrapper_clblas.h",
 ]
-header_lines = [84, 52, 80, 24, 22]
+header_lines = [84, 63, 80, 24, 22]
 footer_lines = [6, 3, 5, 2, 6]
 # Checks whether the command-line arguments are valid; exits otherwise
@ -315,16 +325,10 @@ for level in [1,2,3]:
 			body += "using double2 = clblast::double2;\n\n"
 			body += "// Main function (not within the clblast namespace)\n"
 			body += "int main(int argc, char *argv[]) {\n"
-			body += "  switch(clblast::GetPrecision(argc, argv)) {\n"
+			default = PrecisionToFullName(routine.flavours[0].name)
 			body += "  switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
 			for precision in ["H","S","D","C","Z"]:
-				enum = {
+				body += "    case clblast::Precision::k"+PrecisionToFullName(precision)+":"
 				    'H': "Half",
 				    'S': "Single",
 				    'D': "Double",
 				    'C': "ComplexSingle",
 				    'Z': "ComplexDouble",
 				}[precision]
 				body += "    case clblast::Precision::k"+enum+":"
 				found = False
 				for flavour in routine.flavours:
 					if flavour.name == precision:
--- a/src/clblast.cc
+++ b/src/clblast.cc
@ -38,6 +38,17 @@
 #include "internal/routines/level2/xtrmv.h"
 #include "internal/routines/level2/xtbmv.h"
 #include "internal/routines/level2/xtpmv.h"
 #include "internal/routines/level2/xger.h"
 #include "internal/routines/level2/xgeru.h"
 #include "internal/routines/level2/xgerc.h"
 #include "internal/routines/level2/xher.h"
 #include "internal/routines/level2/xhpr.h"
 #include "internal/routines/level2/xher2.h"
 #include "internal/routines/level2/xhpr2.h"
 #include "internal/routines/level2/xsyr.h"
 #include "internal/routines/level2/xspr.h"
 #include "internal/routines/level2/xsyr2.h"
 #include "internal/routines/level2/xspr2.h"
 // BLAS level-3 includes
 #include "internal/routines/level3/xgemm.h"
@ -835,14 +846,24 @@ template StatusCode Tpsv<double2>(const Layout, const Triangle, const Transpose,
 // General rank-1 matrix update: SGER/DGER
 template <typename T>
-StatusCode Ger(const Layout,
+StatusCode Ger(const Layout layout,
-               const size_t, const size_t,
+               const size_t m, const size_t n,
-               const T,
+               const T alpha,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-               cl_mem, const size_t, const size_t,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-               cl_command_queue*, cl_event*) {
+               cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xger<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoGer(layout,
                       m, n,
                       alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(y_buffer), y_offset, y_inc,
                       Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Ger<float>(const Layout,
                               const size_t, const size_t,
@ -861,14 +882,24 @@ template StatusCode Ger<double>(const Layout,
 // General rank-1 complex matrix update: CGERU/ZGERU
 template <typename T>
-StatusCode Geru(const Layout,
+StatusCode Geru(const Layout layout,
-                const size_t, const size_t,
+                const size_t m, const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t, const size_t,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xgeru<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoGeru(layout,
                        m, n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Geru<float2>(const Layout,
                                 const size_t, const size_t,
@ -887,14 +918,24 @@ template StatusCode Geru<double2>(const Layout,
 // General rank-1 complex conjugated matrix update: CGERC/ZGERC
 template <typename T>
-StatusCode Gerc(const Layout,
+StatusCode Gerc(const Layout layout,
-                const size_t, const size_t,
+                const size_t m, const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t, const size_t,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xgerc<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoGerc(layout,
                        m, n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Gerc<float2>(const Layout,
                                 const size_t, const size_t,
@ -913,13 +954,22 @@ template StatusCode Gerc<double2>(const Layout,
 // Hermitian rank-1 matrix update: CHER/ZHER
 template <typename T>
-StatusCode Her(const Layout, const Triangle,
+StatusCode Her(const Layout layout, const Triangle triangle,
-               const size_t,
+               const size_t n,
-               const T,
+               const T alpha,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-               cl_mem, const size_t, const size_t,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-               cl_command_queue*, cl_event*) {
+               cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoHer(layout, triangle,
                       n,
                       alpha,
                       Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                       Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Her<float>(const Layout, const Triangle,
                               const size_t,
@ -936,13 +986,22 @@ template StatusCode Her<double>(const Layout, const Triangle,
 // Hermitian packed rank-1 matrix update: CHPR/ZHPR
 template <typename T>
-StatusCode Hpr(const Layout, const Triangle,
+StatusCode Hpr(const Layout layout, const Triangle triangle,
-               const size_t,
+               const size_t n,
-               const T,
+               const T alpha,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-               cl_mem, const size_t,
+               cl_mem ap_buffer, const size_t ap_offset,
-               cl_command_queue*, cl_event*) {
+               cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoHpr(layout, triangle,
                       n,
                       alpha,
                       Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                       Buffer<std::complex<T>>(ap_buffer), ap_offset);
 }
 template StatusCode Hpr<float>(const Layout, const Triangle,
                               const size_t,
@ -959,14 +1018,24 @@ template StatusCode Hpr<double>(const Layout, const Triangle,
 // Hermitian rank-2 matrix update: CHER2/ZHER2
 template <typename T>
-StatusCode Her2(const Layout, const Triangle,
+StatusCode Her2(const Layout layout, const Triangle triangle,
-                const size_t,
+                const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t, const size_t,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xher2<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoHer2(layout, triangle,
                        n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Her2<float2>(const Layout, const Triangle,
                                 const size_t,
@ -985,14 +1054,24 @@ template StatusCode Her2<double2>(const Layout, const Triangle,
 // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
 template <typename T>
-StatusCode Hpr2(const Layout, const Triangle,
+StatusCode Hpr2(const Layout layout, const Triangle triangle,
-                const size_t,
+                const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t,
+                cl_mem ap_buffer, const size_t ap_offset,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xhpr2<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoHpr2(layout, triangle,
                        n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(ap_buffer), ap_offset);
 }
 template StatusCode Hpr2<float2>(const Layout, const Triangle,
                                 const size_t,
@ -1011,13 +1090,22 @@ template StatusCode Hpr2<double2>(const Layout, const Triangle,
 // Symmetric rank-1 matrix update: SSYR/DSYR
 template <typename T>
-StatusCode Syr(const Layout, const Triangle,
+StatusCode Syr(const Layout layout, const Triangle triangle,
-               const size_t,
+               const size_t n,
-               const T,
+               const T alpha,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-               cl_mem, const size_t, const size_t,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-               cl_command_queue*, cl_event*) {
+               cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xsyr<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoSyr(layout, triangle,
                       n,
                       alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Syr<float>(const Layout, const Triangle,
                               const size_t,
@ -1034,13 +1122,22 @@ template StatusCode Syr<double>(const Layout, const Triangle,
 // Symmetric packed rank-1 matrix update: SSPR/DSPR
 template <typename T>
-StatusCode Spr(const Layout, const Triangle,
+StatusCode Spr(const Layout layout, const Triangle triangle,
-               const size_t,
+               const size_t n,
-               const T,
+               const T alpha,
-               const cl_mem, const size_t, const size_t,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-               cl_mem, const size_t,
+               cl_mem ap_buffer, const size_t ap_offset,
-               cl_command_queue*, cl_event*) {
+               cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xspr<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoSpr(layout, triangle,
                       n,
                       alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(ap_buffer), ap_offset);
 }
 template StatusCode Spr<float>(const Layout, const Triangle,
                               const size_t,
@ -1057,14 +1154,24 @@ template StatusCode Spr<double>(const Layout, const Triangle,
 // Symmetric rank-2 matrix update: SSYR2/DSYR2
 template <typename T>
-StatusCode Syr2(const Layout, const Triangle,
+StatusCode Syr2(const Layout layout, const Triangle triangle,
-                const size_t,
+                const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t, const size_t,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xsyr2<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoSyr2(layout, triangle,
                        n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
 }
 template StatusCode Syr2<float>(const Layout, const Triangle,
                                const size_t,
@ -1083,14 +1190,24 @@ template StatusCode Syr2<double>(const Layout, const Triangle,
 // Symmetric packed rank-2 matrix update: SSPR2/DSPR2
 template <typename T>
-StatusCode Spr2(const Layout, const Triangle,
+StatusCode Spr2(const Layout layout, const Triangle triangle,
-                const size_t,
+                const size_t n,
-                const T,
+                const T alpha,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                const cl_mem, const size_t, const size_t,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                cl_mem, const size_t,
+                cl_mem ap_buffer, const size_t ap_offset,
-                cl_command_queue*, cl_event*) {
+                cl_command_queue* queue, cl_event* event) {
-  return StatusCode::kNotImplemented;
+  auto queue_cpp = Queue(*queue);
  auto event_cpp = Event(*event);
  auto routine = Xspr2<T>(queue_cpp, event_cpp);
  auto status = routine.SetUp();
  if (status != StatusCode::kSuccess) { return status; }
  return routine.DoSpr2(layout, triangle,
                        n,
                        alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(ap_buffer), ap_offset);
 }
 template StatusCode Spr2<float>(const Layout, const Triangle,
                                const size_t,
--- a/src/database.cc
+++ b/src/database.cc
@ -15,6 +15,7 @@
 #include "internal/database/xaxpy.h"
 #include "internal/database/xdot.h"
 #include "internal/database/xgemv.h"
 #include "internal/database/xger.h"
 #include "internal/database/xgemm.h"
 #include "internal/database/copy.h"
 #include "internal/database/pad.h"
@ -31,11 +32,12 @@ const std::vector<Database::DatabaseEntry> Database::database = {
  XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
  XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
  XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
  XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
  XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
  CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
  PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
-  TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
+  TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
-  PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
+  PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
 };
 // =================================================================================================
@ -77,19 +79,29 @@ Database::Parameters Database::Search(const std::string &this_kernel,
                                      const std::string &this_vendor,
                                      const std::string &this_device,
                                      const Precision this_precision) const {
  // Set the short vendor name
  auto this_short_vendor = this_vendor;
  for (auto &combination : kVendorNames) {
    if (this_vendor == combination.first) {
      this_short_vendor = combination.second;
    }
  }
  // Selects the right kernel
  for (auto &db: database) {
    if (db.kernel == this_kernel && db.precision == this_precision) {
      // Searches for the right vendor and device type, or selects the default if unavailable. This
      // assumes that the default vendor / device type is last in the database.
      for (auto &vendor: db.vendors) {
-        if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
+        if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
-            (vendor.type == this_type   || vendor.type == kDeviceTypeAll)) {
+            (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
          // Searches for the right device. If the current device is unavailable, selects the vendor
          // default parameters. This assumes the default is last in the database.
          for (auto &device: vendor.devices) {
-            if (device.name == this_device || device.name == kDefaultDevice) {
+
            if (device.name == this_device || device.name == "default") {
              // Sets the parameters accordingly
              return device.parameters;
--- a/src/kernels/level2/level2.opencl
+++ b/src/kernels/level2/level2.opencl
@ -0,0 +1,158 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains common functions for matrix update kernels (Xger, Xher).
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef WGS1
  #define WGS1 8    // The local work-group size in first dimension
 #endif
 #ifndef WGS2
  #define WGS2 8    // The local work-group size in second dimension
 #endif
 #ifndef WPT
  #define WPT 1     // The amount of work-per-thread in both dimensions
 #endif
 // =================================================================================================
 // Reads element 'id' of a strided vector from global memory, i.e. gm[id*inc + offset]. An index
 // at or beyond 'max' yields a zero value (through SetToZero) without reading 'gm'. A non-zero
 // 'do_conjugate' passes the loaded value through COMPLEX_CONJUGATE, but only when compiled for
 // one of the routines named in the pre-processor guard below.
 inline real LoadVector(const int id, const int max,
                        __global real* gm, const int offset, const int inc,
                        const int do_conjugate) {
   // Out of range: hand back a zero rather than touching memory
   if (id >= max) {
     real zero_value;
     SetToZero(zero_value);
     return zero_value;
   }
   // In range: strided load, optionally followed by a conjugation
   real element = gm[id*inc + offset];
   #if defined(ROUTINE_GERC) || defined(ROUTINE_HER) || defined(ROUTINE_HPR) || defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
     if (do_conjugate) { COMPLEX_CONJUGATE(element); }
   #endif
   return element;
 }
 // Performs the rank-1 matrix update for a single element: reads A at position (id1, id2), adds
 // alpha * xvalue * yvalue to it and writes the result back to 'agm'. Nothing is read or written
 // when (id1, id2) falls outside (max1, max2).
 //   id1, id2        position of the element to update
 //   max1, max2      exclusive upper bounds for id1 and id2
 //   agm             the matrix A in global memory, addressed with 'a_offset' and 'a_ld'
 //   alpha           scalar multiplier
 //   xvalue, yvalue  the already-loaded vector values for this position
 //   is_upper        selects the upper or lower index formula; only read for ROUTINE_SPR/ROUTINE_HPR
 inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
                          __global real* agm, const int a_offset, const int a_ld,
                          const real alpha, const real xvalue, const real yvalue,
                          const int is_upper) {
   // Bounds of a regular matrix
   if (id1 < max1 && id2 < max2) {
     // The packed routines compute a triangular index; the ternaries swap the roles of id1 and id2
     // so that (id1, id2) and (id2, id1) map onto the same stored element
     #if defined(ROUTINE_SPR) || defined(ROUTINE_HPR)
       int a_index;
       if (is_upper) {
         a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
       }
       else {
         a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
       }
       a_index += a_offset;
     #else
       const int a_index = id2*a_ld + id1 + a_offset;
     #endif
     // Loads the current value of the A matrix
     const real avalue = agm[a_index];
     // Computes result = alpha * x[i] * y[j] + a[i][j]. Precisions 3232 and 6464 use the
     // component-wise (.x/.y) path built from MulReal/MulImag; all others use plain arithmetic.
     #if PRECISION == 3232 || PRECISION == 6464
       real ax;
       ax.x = MulReal(alpha, xvalue);
       ax.y = MulImag(alpha, xvalue);
       real result;
       result.x = MulReal(ax, yvalue) + avalue.x;
       result.y = MulImag(ax, yvalue) + avalue.y;
     #else
       real result = alpha * xvalue * yvalue + avalue;
     #endif
     // For Hermitian matrices: the .y component of a diagonal element is forced to ZERO
     #if defined(ROUTINE_HER) || defined(ROUTINE_HPR)
       if (id1 == id2) { result.y = ZERO; }
     #endif
     // Stores the final result
     agm[a_index] = result;
   }
 }
 // Performs the rank-2 matrix update for a single element: reads A at position (id1, id2), adds
 // two scaled products (one of xvalue and yvalue, one of xtvalue and ytvalue) to it and writes
 // the result back to 'agm'. Nothing is read or written when (id1, id2) falls outside (max1, max2).
 //   id1, id2          position of the element to update
 //   max1, max2        exclusive upper bounds for id1 and id2
 //   agm               the matrix A in global memory, addressed with 'a_offset' and 'a_ld'
 //   alpha1, alpha2    scalar multipliers for the two products
 //   xvalue, yvalue    the already-loaded vector values for the first product
 //   xtvalue, ytvalue  the already-loaded vector values for the second product
 //   is_upper          selects the upper or lower index formula; only read for ROUTINE_SPR2/HPR2
 inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
                           __global real* agm, const int a_offset, const int a_ld,
                           const real alpha1, const real xvalue, const real yvalue,
                           const real alpha2, const real xtvalue, const real ytvalue,
                           const int is_upper) {
   // Bounds of a regular matrix
   if (id1 < max1 && id2 < max2) {
     // The packed routines compute a triangular index; the ternaries swap the roles of id1 and id2
     // so that (id1, id2) and (id2, id1) map onto the same stored element
     #if defined(ROUTINE_SPR2) || defined(ROUTINE_HPR2)
       int a_index;
       if (is_upper) {
         a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
       }
       else {
         a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
       }
       a_index += a_offset;
     #else
       const int a_index = id2*a_ld + id1 + a_offset;
     #endif
     // Loads the current value of the A matrix
     const real avalue = agm[a_index];
     // Computes result = alpha * x[i] * y[j] + alpha * x[j] * y[i] + a[i][j]
     // NOTE(review): the 3232/6464 branch pairs alpha2 with xvalue and alpha1 with xtvalue, whereas
     // the other branch pairs alpha1 with xvalue and alpha2 with xtvalue. Confirm against the
     // callers that this difference is intended.
     #if PRECISION == 3232 || PRECISION == 6464
       real ax;
       ax.x = MulReal(alpha2, xvalue);
       ax.y = MulImag(alpha2, xvalue);
       real atx;
       atx.x = MulReal(alpha1, xtvalue);
       atx.y = MulImag(alpha1, xtvalue);
       real result;
       result.x = MulReal(ax, yvalue) + MulReal(atx, ytvalue) + avalue.x;
       result.y = MulImag(ax, yvalue) + MulImag(atx, ytvalue) + avalue.y;
     #else
       real result = alpha1 * xvalue * yvalue + alpha2 * xtvalue * ytvalue + avalue;
     #endif
     // For Hermitian matrices: the .y component of a diagonal element is forced to ZERO
     #if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
       if (id1 == id2) { result.y = ZERO; }
     #endif
     // Stores the final result
     agm[a_index] = result;
   }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains the Xgemv kernel for matrix-vector multiplication.
+// This file contains the Xgemv kernel (generic version) for matrix-vector multiplication.
 //
 // =================================================================================================
@ -27,56 +27,11 @@ R"(
 #ifndef WPT1
  #define WPT1 1      // The amount of work-per-thread
 #endif
-
+#ifndef UNROLL1
-// 2: For the fast version
+  #define UNROLL1 32  // Unroll factor (must be a divider of WGS1)
 #ifndef WGS2
  #define WGS2 64     // The local work-group size
 #endif
 #ifndef WPT2
  #define WPT2 1      // The amount of work-per-thread
 #endif
 #ifndef VW2
  #define VW2 1       // Vector width of matrix A loads
 #endif
-// 3: For the fast rotated version
+// 2 and 3: For the fast versions, see 'xgemv_fast.opencl'
 #ifndef WGS3
  #define WGS3 64     // The local work-group size
 #endif
 #ifndef WPT3
  #define WPT3 1      // The amount of work-per-thread
 #endif
 #ifndef VW3
  #define VW3 1       // Vector width of matrix A loads
 #endif
 // =================================================================================================
 // Data-widths for the 'fast' kernel
 #if VW2 == 1
  typedef real realVF;
 #elif VW2 == 2
  typedef real2 realVF;
 #elif VW2 == 4
  typedef real4 realVF;
 #elif VW2 == 8
  typedef real8 realVF;
 #elif VW2 == 16
  typedef real16 realVF;
 #endif
 // Data-widths for the 'fast' kernel with rotated matrix
 #if VW3 == 1
  typedef real realVFR;
 #elif VW3 == 2
  typedef real2 realVFR;
 #elif VW3 == 4
  typedef real4 realVFR;
 #elif VW3 == 8
  typedef real8 realVFR;
 #elif VW3 == 16
  typedef real16 realVFR;
 #endif
 // =================================================================================================
@ -252,18 +207,6 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
  return result;
 }
 // Loads a vector input value (1/2)
 inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                            const int a_ld) {
  return agm[a_ld*y + x];
 }
 // Loads a vector input value (2/2): as before, but different data-type
 inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                              const int a_ld) {
  return agm[a_ld*y + x];
 }
 // =================================================================================================
 // Full version of the kernel
@ -301,28 +244,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
    barrier(CLK_LOCAL_MEM_FENCE);
    // Loops over the work per thread, and checks whether in bounds
    #pragma unroll
    for (int w=0; w<WPT1; ++w) {
      const int gid = w*get_global_size(0) + get_global_id(0);
      if (gid < m) {
        // The multiply-add function for the main part (divisible by WGS1)
        if (a_rotated == 0) { // Not rotated
-          #pragma unroll
+          for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
-          for (int kloop=0; kloop<WGS1; ++kloop) {
+            #pragma unroll
-            const int k = kwg + kloop;
+            for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
-            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
+              const int k = kwg + kloop + kunroll;
-            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+              real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
-            MultiplyAdd(acc[w], xlm[kloop], value);
+              if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
              MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
            }
          }
        }
        else { // Transposed
-          #pragma unroll
+          for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
-          for (int kloop=0; kloop<WGS1; ++kloop) {
+            #pragma unroll
-            const int k = kwg + kloop;
+            for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
-            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
+              const int k = kwg + kloop + kunroll;
-            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+              real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
-            MultiplyAdd(acc[w], xlm[kloop], value);
+              if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
              MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
            }
          }
        }
      }
@ -365,200 +311,6 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
 // =================================================================================================
 // Faster version of the matrix-vector multiplication kernel, which assumes that:
 // --> 'm' and 'n' are multiples of WGS2
 // --> 'a_offset' is 0
 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
 // Every work-item produces WPT2 consecutive elements of Y and reads A as realVF vectors of VW2
 // values each. The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and
 // 'ku' are not read by the kernel body.
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 __kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
  // Local memory holding one work-group sized tile of the vector X
  __local real xlm[WGS2];
  // Per-work-item accumulators, one for each produced element of Y
  real acc[WPT2];
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    SetToZero(acc[w]);
  }
  // Walks over X (and the matching part of A) in tiles of WGS2 elements
  for (int kwg=0; kwg<n; kwg+=WGS2) {
    // Each work-item copies one element of X into local memory
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
    // Waits until the whole tile is available to every work-item in the work-group
    barrier(CLK_LOCAL_MEM_FENCE);
    // The multiply-add function (not rotated). The loop counter is named 'kloop' such that it does
    // not shadow the kernel argument 'kl'.
    #pragma unroll
    for (int kloop=0; kloop<WGS2; ++kloop) {
      const int k = kwg + kloop;
      #pragma unroll
      for (int w=0; w<WPT2/VW2; ++w) {
        const int gid = (WPT2/VW2)*get_global_id(0) + w;
        realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
        // Component 'i' of the loaded vector contributes to accumulator VW2*w + i
        #if VW2 == 1
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec);
        #elif VW2 == 2
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
        #elif VW2 == 4
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.z);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.w);
        #elif VW2 == 8
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
        #elif VW2 == 16
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
          MultiplyAdd(acc[VW2*w+8], xlm[kloop], avec.s8);
          MultiplyAdd(acc[VW2*w+9], xlm[kloop], avec.s9);
          MultiplyAdd(acc[VW2*w+10], xlm[kloop], avec.sA);
          MultiplyAdd(acc[VW2*w+11], xlm[kloop], avec.sB);
          MultiplyAdd(acc[VW2*w+12], xlm[kloop], avec.sC);
          MultiplyAdd(acc[VW2*w+13], xlm[kloop], avec.sD);
          MultiplyAdd(acc[VW2*w+14], xlm[kloop], avec.sE);
          MultiplyAdd(acc[VW2*w+15], xlm[kloop], avec.sF);
        #endif
      }
    }
    // Waits until every work-item is done with the tile before the next iteration overwrites it
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // Combines the accumulated values with the existing contents of Y and stores the final result
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    const int gid = WPT2*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
 }
 // =================================================================================================
 // Faster version of the matrix-vector multiplication kernel for a rotated A, which assumes that:
 // --> 'm' and 'n' are multiples of WGS3
 // --> 'a_offset' is 0
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
 // Every work-item produces WPT3 consecutive elements of Y and reads A as realVFR vectors of VW3
 // values each. The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and
 // 'ku' are not read by the kernel body.
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
 __kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
  // Local memory holding one work-group sized tile of the vector X
  __local real xlm[WGS3];
  // Per-work-item accumulators, one for each produced element of Y
  real acc[WPT3];
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    SetToZero(acc[w]);
  }
  // Walks over X (and the matching part of A) in tiles of WGS3 elements
  for (int kwg=0; kwg<n; kwg+=WGS3) {
    // Each work-item copies one element of X into local memory
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
    // Waits until the whole tile is available to every work-item in the work-group
    barrier(CLK_LOCAL_MEM_FENCE);
    // The multiply-add function (rotated). The loop counter is named 'kloop' such that it does not
    // shadow the kernel argument 'kl'. It advances in units of one realVFR vector.
    #pragma unroll
    for (int kloop=0; kloop<WGS3/VW3; ++kloop) {
      const int k = (kwg/VW3) + kloop;
      #pragma unroll
      for (int w=0; w<WPT3; ++w) {
        const int gid = WPT3*get_global_id(0) + w;
        realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
        // Component 'i' of the loaded vector is paired with element VW3*kloop + i of the X tile
        #if VW3 == 1
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec);
        #elif VW3 == 2
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
        #elif VW3 == 4
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.z);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.w);
        #elif VW3 == 8
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
        #elif VW3 == 16
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
          MultiplyAdd(acc[w], xlm[VW3*kloop+8], avec.s8);
          MultiplyAdd(acc[w], xlm[VW3*kloop+9], avec.s9);
          MultiplyAdd(acc[w], xlm[VW3*kloop+10], avec.sA);
          MultiplyAdd(acc[w], xlm[VW3*kloop+11], avec.sB);
          MultiplyAdd(acc[w], xlm[VW3*kloop+12], avec.sC);
          MultiplyAdd(acc[w], xlm[VW3*kloop+13], avec.sD);
          MultiplyAdd(acc[w], xlm[VW3*kloop+14], avec.sE);
          MultiplyAdd(acc[w], xlm[VW3*kloop+15], avec.sF);
        #endif
      }
    }
    // Waits until every work-item is done with the tile before the next iteration overwrites it
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // Combines the accumulated values with the existing contents of Y and stores the final result
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    const int gid = WPT3*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@ -0,0 +1,288 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains the Xgemv kernel (fast versions) for matrix-vector multiplication.
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Tuning parameters. The tuner or the database normally defines these; the values below are only
 // basic defaults for the case that this kernel file is used outside of the CLBlast library.
 // 1: For the full version, see 'xgemv.opencl'
 // 2: For the fast version
 // --> WGS2: the local work-group size
 #if !defined(WGS2)
  #define WGS2 64
 #endif
 // --> WPT2: the amount of work-per-thread
 #if !defined(WPT2)
  #define WPT2 1
 #endif
 // --> VW2: the vector width of matrix A loads
 #if !defined(VW2)
  #define VW2 1
 #endif
 // 3: For the fast rotated version
 // --> WGS3: the local work-group size
 #if !defined(WGS3)
  #define WGS3 64
 #endif
 // --> WPT3: the amount of work-per-thread
 #if !defined(WPT3)
  #define WPT3 1
 #endif
 // --> VW3: the vector width of matrix A loads
 #if !defined(VW3)
  #define VW3 1
 #endif
 // =================================================================================================
 // Data-widths for the 'fast' kernel: realVF is the type used to load VW2 values of matrix A at
 // once. Only the widths listed below are supported; any other value is rejected explicitly
 // rather than leaving the type undefined.
 #if VW2 == 1
  typedef real realVF;
 #elif VW2 == 2
  typedef real2 realVF;
 #elif VW2 == 4
  typedef real4 realVF;
 #elif VW2 == 8
  typedef real8 realVF;
 #elif VW2 == 16
  typedef real16 realVF;
 #else
  #error "VW2 must be 1, 2, 4, 8 or 16"
 #endif
 // Data-widths for the 'fast' kernel with rotated matrix: realVFR is the type used to load VW3
 // values of matrix A at once. The same set of widths is supported.
 #if VW3 == 1
  typedef real realVFR;
 #elif VW3 == 2
  typedef real2 realVFR;
 #elif VW3 == 4
  typedef real4 realVFR;
 #elif VW3 == 8
  typedef real8 realVFR;
 #elif VW3 == 16
  typedef real16 realVFR;
 #else
  #error "VW3 must be 1, 2, 4, 8 or 16"
 #endif
 // =================================================================================================
 // Loads a vector input value (1/2): returns the realVF element of 'agm' at position 'x' within
 // line 'y', where consecutive lines are 'a_ld' realVF elements apart
 inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                             const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
 }
 // Loads a vector input value (2/2): returns the realVFR element of 'agm' at position 'x' within
 // line 'y', where consecutive lines are 'a_ld' realVFR elements apart
 inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                               const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
 }
 // =================================================================================================
 // Faster version of the matrix-vector multiplication kernel, which assumes that:
 // --> 'm' and 'n' are multiples of WGS2
 // --> 'a_offset' is 0
 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
 // Every work-item produces WPT2 consecutive elements of Y and reads A as realVF vectors of VW2
 // values each. The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and
 // 'ku' are not read by the kernel body.
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 __kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
  // Local memory holding one work-group sized tile of the vector X
  __local real xlm[WGS2];
  // Per-work-item accumulators, one for each produced element of Y
  real acc[WPT2];
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    SetToZero(acc[w]);
  }
  // Walks over X (and the matching part of A) in tiles of WGS2 elements
  for (int kwg=0; kwg<n; kwg+=WGS2) {
    // Each work-item copies one element of X into local memory
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
    // Waits until the whole tile is available to every work-item in the work-group
    barrier(CLK_LOCAL_MEM_FENCE);
    // The multiply-add function (not rotated). The loop counter is named 'kloop' such that it does
    // not shadow the kernel argument 'kl'.
    #pragma unroll
    for (int kloop=0; kloop<WGS2; ++kloop) {
      const int k = kwg + kloop;
      #pragma unroll
      for (int w=0; w<WPT2/VW2; ++w) {
        const int gid = (WPT2/VW2)*get_global_id(0) + w;
        realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
        // Component 'i' of the loaded vector contributes to accumulator VW2*w + i
        #if VW2 == 1
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec);
        #elif VW2 == 2
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
        #elif VW2 == 4
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.z);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.w);
        #elif VW2 == 8
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
        #elif VW2 == 16
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
          MultiplyAdd(acc[VW2*w+8], xlm[kloop], avec.s8);
          MultiplyAdd(acc[VW2*w+9], xlm[kloop], avec.s9);
          MultiplyAdd(acc[VW2*w+10], xlm[kloop], avec.sA);
          MultiplyAdd(acc[VW2*w+11], xlm[kloop], avec.sB);
          MultiplyAdd(acc[VW2*w+12], xlm[kloop], avec.sC);
          MultiplyAdd(acc[VW2*w+13], xlm[kloop], avec.sD);
          MultiplyAdd(acc[VW2*w+14], xlm[kloop], avec.sE);
          MultiplyAdd(acc[VW2*w+15], xlm[kloop], avec.sF);
        #endif
      }
    }
    // Waits until every work-item is done with the tile before the next iteration overwrites it
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // Combines the accumulated values with the existing contents of Y and stores the final result
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    const int gid = WPT2*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
 }
 // =================================================================================================
 // Faster version of the matrix-vector multiplication kernel for a rotated A, which assumes that:
 // --> 'm' and 'n' are multiples of WGS3
 // --> 'a_offset' is 0
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
 // Every work-item produces WPT3 consecutive elements of Y and reads A as realVFR vectors of VW3
 // values each. The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and
 // 'ku' are not read by the kernel body.
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
 __kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
  // Local memory holding one work-group sized tile of the vector X
  __local real xlm[WGS3];
  // Per-work-item accumulators, one for each produced element of Y
  real acc[WPT3];
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    SetToZero(acc[w]);
  }
  // Walks over X (and the matching part of A) in tiles of WGS3 elements
  for (int kwg=0; kwg<n; kwg+=WGS3) {
    // Each work-item copies one element of X into local memory
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
    // Waits until the whole tile is available to every work-item in the work-group
    barrier(CLK_LOCAL_MEM_FENCE);
    // The multiply-add function (rotated). The loop counter is named 'kloop' such that it does not
    // shadow the kernel argument 'kl'. It advances in units of one realVFR vector.
    #pragma unroll
    for (int kloop=0; kloop<WGS3/VW3; ++kloop) {
      const int k = (kwg/VW3) + kloop;
      #pragma unroll
      for (int w=0; w<WPT3; ++w) {
        const int gid = WPT3*get_global_id(0) + w;
        realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
        // Component 'i' of the loaded vector is paired with element VW3*kloop + i of the X tile
        #if VW3 == 1
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec);
        #elif VW3 == 2
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
        #elif VW3 == 4
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.z);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.w);
        #elif VW3 == 8
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
        #elif VW3 == 16
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
          MultiplyAdd(acc[w], xlm[VW3*kloop+8], avec.s8);
          MultiplyAdd(acc[w], xlm[VW3*kloop+9], avec.s9);
          MultiplyAdd(acc[w], xlm[VW3*kloop+10], avec.sA);
          MultiplyAdd(acc[w], xlm[VW3*kloop+11], avec.sB);
          MultiplyAdd(acc[w], xlm[VW3*kloop+12], avec.sC);
          MultiplyAdd(acc[w], xlm[VW3*kloop+13], avec.sD);
          MultiplyAdd(acc[w], xlm[VW3*kloop+14], avec.sE);
          MultiplyAdd(acc[w], xlm[VW3*kloop+15], avec.sF);
        #endif
      }
    }
    // Waits until every work-item is done with the tile before the next iteration overwrites it
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // Combines the accumulated values with the existing contents of Y and stores the final result
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    const int gid = WPT3*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@ -0,0 +1,106 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains the Xger kernels for rank-1 matrix update.
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC). Every work-item handles
 // WPT x WPT elements of A, spaced by the global size in each dimension. The required values of X
 // and Y are loaded once per work-item and re-used for all of its elements. In the row-major case
 // X is indexed by the second thread dimension and Y by the first; col-major is the other way
 // around. The final LoadVector argument is 'false' for X and 'true' for Y.
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xger(const int max1, const int max2, const real alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* ygm, const int y_offset, const int y_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_rowmajor) {
  // Position of this work-item and the distance between its elements, per dimension
  const size_t tid1 = get_global_id(0);
  const size_t tid2 = get_global_id(1);
  const size_t stride1 = get_global_size(0);
  const size_t stride2 = get_global_size(1);
  // Per-work-item storage for the values of X and Y
  real xvalues[WPT];
  real yvalues[WPT];
  if (!is_rowmajor) {
    // Col-major: X follows the first dimension
    #pragma unroll
    for (int wx=0; wx<WPT; ++wx) {
      const int id1 = wx*stride1 + tid1;
      xvalues[wx] = LoadVector(id1, max1, xgm, x_offset, x_inc, false);
    }
    // Col-major: Y follows the second dimension
    #pragma unroll
    for (int wy=0; wy<WPT; ++wy) {
      const int id2 = wy*stride2 + tid2;
      yvalues[wy] = LoadVector(id2, max2, ygm, y_offset, y_inc, true);
    }
    // Updates all WPT x WPT elements of A owned by this work-item
    #pragma unroll
    for (int w1=0; w1<WPT; ++w1) {
      const int id1 = w1*stride1 + tid1;
      #pragma unroll
      for (int w2=0; w2<WPT; ++w2) {
        const int id2 = w2*stride2 + tid2;
        MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
                     alpha, xvalues[w1], yvalues[w2], false);
      }
    }
  }
  else {
    // Row-major: X follows the second dimension
    #pragma unroll
    for (int wx=0; wx<WPT; ++wx) {
      const int id2 = wx*stride2 + tid2;
      xvalues[wx] = LoadVector(id2, max2, xgm, x_offset, x_inc, false);
    }
    // Row-major: Y follows the first dimension
    #pragma unroll
    for (int wy=0; wy<WPT; ++wy) {
      const int id1 = wy*stride1 + tid1;
      yvalues[wy] = LoadVector(id1, max1, ygm, y_offset, y_inc, true);
    }
    // Updates all WPT x WPT elements of A owned by this work-item
    #pragma unroll
    for (int w1=0; w1<WPT; ++w1) {
      const int id1 = w1*stride1 + tid1;
      #pragma unroll
      for (int w2=0; w2<WPT; ++w2) {
        const int id2 = w2*stride2 + tid2;
        MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
                     alpha, xvalues[w2], yvalues[w1], false);
      }
    }
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@ -0,0 +1,73 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains the Xher kernels for rank-1 matrix update.
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR). Every work-item
 // handles WPT x WPT elements of A, spaced by the global size in each dimension, and only touches
 // those elements that lie in the selected triangle (diagonal included). The vector X is loaded
 // twice: once indexed by the second thread dimension and once by the first, with opposite values
 // of the final LoadVector argument.
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xher(const int n, const real alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_upper, const int is_rowmajor) {
  // Position of this work-item and the distance between its elements, per dimension
  const size_t tid1 = get_global_id(0);
  const size_t tid2 = get_global_id(1);
  const size_t stride1 = get_global_size(0);
  const size_t stride2 = get_global_size(1);
  // Per-work-item storage for X and XT
  real xvalues[WPT];
  real xtvalues[WPT];
  // X-vector: indexed by the second dimension
  #pragma unroll
  for (int wx=0; wx<WPT; ++wx) {
    const int id2 = wx*stride2 + tid2;
    xvalues[wx] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
  }
  // X-transposed-vector: indexed by the first dimension
  #pragma unroll
  for (int wt=0; wt<WPT; ++wt) {
    const int id1 = wt*stride1 + tid1;
    xtvalues[wt] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
  }
  // Visits all WPT x WPT elements owned by this work-item
  #pragma unroll
  for (int w1=0; w1<WPT; ++w1) {
    const int id1 = w1*stride1 + tid1;
    #pragma unroll
    for (int w2=0; w2<WPT; ++w2) {
      const int id2 = w2*stride2 + tid2;
      // Only elements inside the requested triangle are updated, all others are left untouched
      const bool in_triangle = is_upper ? (id1 <= id2) : (id2 <= id1);
      if (in_triangle) {
        // Loads A, performs the operation, and stores the result into A
        MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
      }
    }
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@ -0,0 +1,104 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains the Xher2 kernels for rank-2 matrix update.
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2). Every work-item
 // handles WPT x WPT elements of A, spaced by the global size in each dimension, and only touches
 // those elements that lie in the selected triangle (diagonal included). Both X and Y are loaded
 // twice, once per thread dimension. For the HER2 and HPR2 routines one of the two copies of
 // alpha is conjugated, depending on the memory layout.
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xher2(const int n, const real alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     const __global real* restrict ygm, const int y_offset, const int y_inc,
                     __global real* restrict agm, const int a_offset, const int a_ld,
                     const int is_upper, const int is_rowmajor) {
  // Position of this work-item and the distance between its elements, per dimension
  const size_t tid1 = get_global_id(0);
  const size_t tid2 = get_global_id(1);
  const size_t stride1 = get_global_size(0);
  const size_t stride2 = get_global_size(1);
  // Per-work-item storage for X, Y and their transposed counterparts
  real xvalues[WPT];
  real yvalues[WPT];
  real xtvalues[WPT];
  real ytvalues[WPT];
  // X-vector: indexed by the second dimension
  #pragma unroll
  for (int wx=0; wx<WPT; ++wx) {
    const int id2 = wx*stride2 + tid2;
    xvalues[wx] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
  }
  // X-transposed-vector: indexed by the first dimension
  #pragma unroll
  for (int wxt=0; wxt<WPT; ++wxt) {
    const int id1 = wxt*stride1 + tid1;
    xtvalues[wxt] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
  }
  // Y-vector: indexed by the first dimension
  #pragma unroll
  for (int wy=0; wy<WPT; ++wy) {
    const int id1 = wy*stride1 + tid1;
    yvalues[wy] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
  }
  // Y-transposed-vector: indexed by the second dimension
  #pragma unroll
  for (int wyt=0; wyt<WPT; ++wyt) {
    const int id2 = wyt*stride2 + tid2;
    ytvalues[wyt] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
  }
  // Two copies of alpha, of which one is conjugated when the routine requires it
  real alpha1 = alpha;
  real alpha2 = alpha;
  #if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
    if (is_rowmajor) {
      COMPLEX_CONJUGATE(alpha1);
    }
    else {
      COMPLEX_CONJUGATE(alpha2);
    }
  #endif
  // Visits all WPT x WPT elements owned by this work-item
  #pragma unroll
  for (int w1=0; w1<WPT; ++w1) {
    const int id1 = w1*stride1 + tid1;
    #pragma unroll
    for (int w2=0; w2<WPT; ++w2) {
      const int id2 = w2*stride2 + tid2;
      // Only elements inside the requested triangle are updated, all others are left untouched
      const bool in_triangle = is_upper ? (id1 <= id2) : (id2 <= id1);
      if (in_triangle) {
        // Loads A, performs the operation, and stores the result into A
        MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
                      alpha1, xvalues[w2], yvalues[w1],
                      alpha2, xtvalues[w1], ytvalues[w2], is_upper);
      }
    }
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@ -0,0 +1,329 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
 // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
 // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
 // supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
 //
 // Matrices are accessed as follows:
 // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
 // B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
 // C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
 //
 // Or as an image (assuming column-major)
 //       K                      
 //    o-------o                 
 //    |       |                 
 //  N | [B^T] |                 
 //    |       |                 
 //    o-------o                 
 //        K               N     
 //    o-------o        o-----o  
 //  M |  [A]  |      M | [C] |  
 //    |       |        |     |  
 //    o-------o        o-----o  
 //                              
 //
 // This kernel is separated into two files. This is part 1 out of 2.
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // =================================================================================================
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library. Each default below is only applied when
 // the corresponding macro has not been defined earlier.
 #ifndef MWG
  #define MWG 8      // Tile-size in dimension M (e.g. 64, 128)
 #endif
 #ifndef NWG
  #define NWG 8      // Tile-size in dimension N (e.g. 64, 128)
 #endif
 #ifndef KWG
  #define KWG 8      // Tile-size in dimension K (e.g. 8, 16)
 #endif
 #ifndef MDIMC
  #define MDIMC 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
 #endif
 #ifndef NDIMC
  #define NDIMC 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
 #endif
 #ifndef MDIMA
  #define MDIMA 8    // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
 #endif
 #ifndef NDIMB
  #define NDIMB 8    // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
 #endif
 #ifndef KWI
  #define KWI 1      // Unroll factor of the KWG loop (smaller than or equal to KWG)
 #endif
 #ifndef VWM
  #define VWM 1      // Vector width of matrices A and C (1, 2, 4, 8, or 16)
 #endif
 #ifndef VWN
  #define VWN 1      // Vector width of matrix B (1, 2, 4, 8, or 16)
 #endif
 #ifndef STRM
  #define STRM 0     // Use strided access within a thread in the M-dimension (1) or not (0)
 #endif
 #ifndef STRN
  #define STRN 0     // Use strided access within a thread in the N-dimension (1) or not (0)
 #endif
 #ifndef SA
  #define SA 0       // Use local/shared memory to cache matrix A (1) or not (0)
 #endif
 #ifndef SB
  #define SB 0       // Use local/shared memory to cache matrix B (1) or not (0)
 #endif
 // Helper parameters based on the above tuning parameters
 // NOTE(review): with integer parameters these are integer divisions; presumably every numerator
 // has to be a multiple of its denominator - confirm against the constraints set by the tuner.
 #define MWI (MWG/MDIMC)               // Work per work-item (M-dimension)
 #define NWI (NWG/NDIMC)               // Work per work-item (N-dimension)
 #define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
 #define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
 #define MWA (MWG/MDIMA)               // Amount of loads-per-thread for matrix A (M-dimension)
 #define KWA (KWG/KDIMA)               // Amount of loads-per-thread for matrix A (K-dimension)
 #define KWB (KWG/KDIMB)               // Amount of loads-per-thread for matrix B (K-dimension)
 #define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
 // Settings
 #define USE_VECTOR_MAD 0              // Unroll the vector MAD manually (0) or don't unroll it (1)
 // =================================================================================================
 // Data-widths in dimension M: 'realM' is the VWM-wide variant of the 'real' data-type (that is
 // 'real', 'real2', 'real4', 'real8', or 'real16'). Any other vector width is rejected here with a
 // clear message, rather than leaving 'realM' undefined and failing at its first use.
 #if VWM == 1
    typedef real realM;
 #elif VWM == 2
    typedef real2 realM;
 #elif VWM == 4
    typedef real4 realM;
 #elif VWM == 8
    typedef real8 realM;
 #elif VWM == 16
    typedef real16 realM;
 #else
    #error Unsupported value for VWM, expected 1, 2, 4, 8, or 16
 #endif
 // Data-widths in dimension N: 'realN' is the VWN-wide variant of the 'real' data-type (that is
 // 'real', 'real2', 'real4', 'real8', or 'real16'). Any other vector width is rejected here with a
 // clear message, rather than leaving 'realN' undefined and failing at its first use.
 #if VWN == 1
    typedef real realN;
 #elif VWN == 2
    typedef real2 realN;
 #elif VWN == 4
    typedef real4 realN;
 #elif VWN == 8
    typedef real8 realN;
 #elif VWN == 16
    typedef real16 realN;
 #else
    #error Unsupported value for VWN, expected 1, 2, 4, 8, or 16
 #endif
 // =================================================================================================
 // Clears the accumulation registers: every scalar component of each of the NWI * (MWI/VWM)
 // vectors in 'cpm' is passed to SetToZero individually.
 inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int n_id=0; n_id<NWI; ++n_id) {
    #pragma unroll
    for (int m_id=0; m_id<MWI/VWM; ++m_id) {
      // Scalar case: the register holds a single value
      #if VWM == 1
        SetToZero(cpm[n_id][m_id]);
      #endif
      // Vectors of width 2 and 4 share their first two components
      #if VWM == 2 || VWM == 4
        SetToZero(cpm[n_id][m_id].x);
        SetToZero(cpm[n_id][m_id].y);
      #endif
      #if VWM == 4
        SetToZero(cpm[n_id][m_id].z);
        SetToZero(cpm[n_id][m_id].w);
      #endif
      // Vectors of width 8 and 16 share their first eight components
      #if VWM == 8 || VWM == 16
        SetToZero(cpm[n_id][m_id].s0);
        SetToZero(cpm[n_id][m_id].s1);
        SetToZero(cpm[n_id][m_id].s2);
        SetToZero(cpm[n_id][m_id].s3);
        SetToZero(cpm[n_id][m_id].s4);
        SetToZero(cpm[n_id][m_id].s5);
        SetToZero(cpm[n_id][m_id].s6);
        SetToZero(cpm[n_id][m_id].s7);
      #endif
      #if VWM == 16
        SetToZero(cpm[n_id][m_id].s8);
        SetToZero(cpm[n_id][m_id].s9);
        SetToZero(cpm[n_id][m_id].sA);
        SetToZero(cpm[n_id][m_id].sB);
        SetToZero(cpm[n_id][m_id].sC);
        SetToZero(cpm[n_id][m_id].sD);
        SetToZero(cpm[n_id][m_id].sE);
        SetToZero(cpm[n_id][m_id].sF);
      #endif
    }
  }
 }
 // =================================================================================================
 // Copies a tile of the A input matrix from global (off-chip) memory into local (shared) on-chip
 // memory. Each caller, identified by 'tid', loads (MWA/VWM) * KWA vectors; 'kwg' is added to the
 // K-index used for the global read. Only compiled when A is cached in local memory (SA == 1).
 #if SA == 1
 inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {
  // Splits 'tid' into a part for the M-dimension and a part for the K-dimension
  const int m_thread = tid % MDIMA;
  const int k_thread = tid / MDIMA;
  #pragma unroll
  for (int m_load=0; m_load<MWA/VWM; ++m_load) {
    // Index within the tile in the M-dimension, based on strided/non-strided access
    #if STRM == 0
      const int m_tile = m_load + m_thread*(MWA/VWM);
    #elif STRM == 1
      const int m_tile = m_thread + m_load*MDIMA;
    #endif
    // The matching index in global memory: adds the offset of this work-group
    const int m_global = m_tile + get_group_id(0)*(MWG/VWM);
    #pragma unroll
    for (int k_load=0; k_load<KWA; ++k_load) {
      // Index within the tile in the K-dimension and its counterpart in global memory
      const int k_tile = k_load + k_thread*KWA;
      const int k_global = k_tile + kwg;
      // Loads the data from global memory (not transposed) into the local memory
      alm[k_tile*(MWG/VWM) + m_tile] = agm[k_global*(kSizeM/VWM) + m_global];
    }
  }
 }
 #endif
 // Copies a tile of the B input matrix from global (off-chip) memory into local (shared) on-chip
 // memory. Each caller, identified by 'tid', loads KWB * (NWB/VWN) vectors; 'kwg' is added to the
 // K-index used for the global read. Only compiled when B is cached in local memory (SB == 1).
 #if SB == 1
 inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
                           const int kSizeN, const int tid, const int kwg) {
  // Splits 'tid' into a part for the N-dimension and a part for the K-dimension
  const int n_thread = tid % NDIMB;
  const int k_thread = tid / NDIMB;
  #pragma unroll
  for (int k_load=0; k_load<KWB; ++k_load) {
    // Index within the tile in the K-dimension and its counterpart in global memory
    const int k_tile = k_load + k_thread*KWB;
    const int k_global = k_tile + kwg;
    #pragma unroll
    for (int n_load=0; n_load<NWB/VWN; ++n_load) {
      // Index within the tile in the N-dimension, based on strided/non-strided access
      #if STRN == 0
        const int n_tile = n_load + n_thread*(NWB/VWN);
      #elif STRN == 1
        const int n_tile = n_thread + n_load*NDIMB;
      #endif
      // The matching index in global memory: adds the offset of this work-group
      const int n_global = n_tile + get_group_id(1)*(NWG/VWN);
      // Loads the data from global memory (transposed) into the local memory
      blm[k_tile*(NWG/VWN) + n_tile] = bgm[k_global*(kSizeN/VWN) + n_global];
    }
  }
 }
 #endif
 // =================================================================================================
 // Loads MWI/VWM vectors of the A input matrix from global (off-chip) memory directly into the
 // per-thread private array 'apm', all taken at K-index 'idk'. Only compiled when A is not cached
 // in local memory (SA == 0). Note that the 'kwg' argument is not used by this function.
 #if SA == 0
 inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
                             const int kSizeM, const int idk, const int kwg) {
  // Offset (in vectors) of the requested K-index, shared by all loads below
  const int k_offset = idk*(kSizeM/VWM);
  #pragma unroll
  for (int m_id=0; m_id<MWI/VWM; ++m_id) {
    // Index within the tile in the M-dimension, based on strided/non-strided access
    #if STRM == 0
      const int m_tile = m_id + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      const int m_tile = get_local_id(0) + m_id*MDIMC;
    #endif
    // The matching index in global memory: adds the offset of this work-group
    const int m_global = m_tile + get_group_id(0)*(MWG/VWM);
    // Loads the data from global memory (not transposed) and stores into registers
    apm[m_id] = agm[k_offset + m_global];
  }
 }
 #endif
 // Loads NWI/VWN vectors of the B input matrix from global (off-chip) memory directly into the
 // per-thread private array 'bpm', all taken at K-index 'idk'. Only compiled when B is not cached
 // in local memory (SB == 0).
 #if SB == 0
 inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
                             const int kSizeN, const int idk) {
  // Offset (in vectors) of the requested K-index, shared by all loads below
  const int k_offset = idk*(kSizeN/VWN);
  #pragma unroll
  for (int n_id=0; n_id<NWI/VWN; ++n_id) {
    // Index within the tile in the N-dimension, based on strided/non-strided access
    #if STRN == 0
      const int n_tile = n_id + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      const int n_tile = get_local_id(1) + n_id*NDIMC;
    #endif
    // The matching index in global memory: adds the offset of this work-group
    const int n_global = n_tile + get_group_id(1)*(NWG/VWN);
    // Loads the data from global memory (transposed) and stores into registers
    bpm[n_id] = bgm[k_offset + n_global];
  }
 }
 #endif
 // =================================================================================================
 // Copies MWI/VWM vectors of the A tile from local (shared) memory into the per-thread private
 // array 'apm'. All vectors are read at K-index 'kg' of the tile. Only compiled when A is cached
 // in local memory (SA == 1).
 #if SA == 1
 inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
  // Offset (in vectors) of the requested K-index within the local tile
  const int k_offset = kg*(MWG/VWM);
  #pragma unroll
  for (int m_id=0; m_id<MWI/VWM; ++m_id) {
    // Index within the tile in the M-dimension, based on strided/non-strided access
    #if STRM == 0
      const int m_tile = m_id + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      const int m_tile = get_local_id(0) + m_id*MDIMC;
    #endif
    apm[m_id] = alm[k_offset + m_tile];
  }
 }
 #endif
 // Copies NWI/VWN vectors of the B tile from local (shared) memory into the per-thread private
 // array 'bpm'. All vectors are read at K-index 'kg' of the tile. Only compiled when B is cached
 // in local memory (SB == 1).
 #if SB == 1
 inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
  // Offset (in vectors) of the requested K-index within the local tile
  const int k_offset = kg*(NWG/VWN);
  #pragma unroll
  for (int n_id=0; n_id<NWI/VWN; ++n_id) {
    // Index within the tile in the N-dimension, based on strided/non-strided access
    #if STRN == 0
      const int n_tile = n_id + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      const int n_tile = get_local_id(1) + n_id*NDIMC;
    #endif
    bpm[n_id] = blm[k_offset + n_tile];
  }
 }
 #endif
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@ -7,29 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
 // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
 // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
 // supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
 //
 // Matrices are accessed as follows:
 // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
 // B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
 // C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
 //
 // Or as an image (assuming column-major)
 //       K                      
 //    o-------o                 
 //    |       |                 
 //  N | [B^T] |                 
 //    |       |                 
 //    o-------o                 
 //        K               N     
 //    o-------o        o-----o  
 //  M |  [A]  |      M | [C] |  
 //    |       |        |     |  
 //    o-------o        o-----o  
 //                              
 //
 // =================================================================================================
@ -39,288 +17,6 @@ R"(
 // =================================================================================================
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef MWG
  #define MWG 8      // Tile-size in dimension M (e.g. 64, 128)
 #endif
 #ifndef NWG
  #define NWG 8      // Tile-size in dimension N (e.g. 64, 128)
 #endif
 #ifndef KWG
  #define KWG 8      // Tile-size in dimension K (e.g. 8, 16)
 #endif
 #ifndef MDIMC
  #define MDIMC 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
 #endif
 #ifndef NDIMC
  #define NDIMC 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
 #endif
 #ifndef MDIMA
  #define MDIMA 8    // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
 #endif
 #ifndef NDIMB
  #define NDIMB 8    // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
 #endif
 #ifndef KWI
  #define KWI 1      // Unroll factor of the KWG loop (smaller or equal than KWG)
 #endif
 #ifndef VWM
  #define VWM 1      // Vector width of matrices A and C 
 #endif
 #ifndef VWN
  #define VWN 1      // Vector width of matrix B
 #endif
 #ifndef STRM
  #define STRM 0     // Use strided access within a thread in the M-dimension (1) or not (0)
 #endif
 #ifndef STRN
  #define STRN 0     // Use strided access within a thread in the N-dimension (1) or not (0)
 #endif
 #ifndef SA
  #define SA 0       // Use local/shared memory to cache matrix A (1) or not (0)
 #endif
 #ifndef SB
  #define SB 0       // Use local/shared memory to cache matrix B (1) or not (0)
 #endif
 // Helper parameters based on the above tuning parameters
 #define MWI (MWG/MDIMC)               // Work per work-item (M-dimension)
 #define NWI (NWG/NDIMC)               // Work per work-item (N-dimension)
 #define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
 #define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
 #define MWA (MWG/MDIMA)               // Amount of loads-per-thread for matrix A (M-dimension)
 #define KWA (KWG/KDIMA)               // Amount of loads-per-thread for matrix A (K-dimension)
 #define KWB (KWG/KDIMB)               // Amount of loads-per-thread for matrix B (K-dimension)
 #define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
 // Settings
 #define USE_VECTOR_MAD 0              // Unroll (0) or don't (1) unroll the vector MAD manually
 // =================================================================================================
 // Data-widths in dimension M
 #if VWM == 1
    typedef real realM;
 #elif VWM == 2
    typedef real2 realM;
 #elif VWM == 4
    typedef real4 realM;
 #elif VWM == 8
    typedef real8 realM;
 #elif VWM == 16
    typedef real16 realM;
 #endif
 // Data-widths in dimension N
 #if VWN == 1
    typedef real realN;
 #elif VWN == 2
    typedef real2 realN;
 #elif VWN == 4
    typedef real4 realN;
 #elif VWN == 8
    typedef real8 realN;
 #elif VWN == 16
    typedef real16 realN;
 #endif
 // =================================================================================================
 // Initializes the accumulation registers to zero
 inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #pragma unroll
    for (int ni=0; ni<NWI; ++ni) {
      #if VWM == 1
        SetToZero(cpm[ni][mi]);
      #elif VWM == 2
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
      #elif VWM == 4
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
        SetToZero(cpm[ni][mi].z);
        SetToZero(cpm[ni][mi].w);
      #elif VWM == 8
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
      #elif VWM == 16
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
        SetToZero(cpm[ni][mi].s8);
        SetToZero(cpm[ni][mi].s9);
        SetToZero(cpm[ni][mi].sA);
        SetToZero(cpm[ni][mi].sB);
        SetToZero(cpm[ni][mi].sC);
        SetToZero(cpm[ni][mi].sD);
        SetToZero(cpm[ni][mi].sE);
        SetToZero(cpm[ni][mi].sF);
      #endif
    }
  }
 }
 // =================================================================================================
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
 inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {
  const int la0 = tid % MDIMA;
  const int la1 = tid / MDIMA;
  #pragma unroll
  for (int mia=0; mia<MWA/VWM; ++mia) {
    #pragma unroll
    for (int kia=0; kia<KWA; ++kia) {
      // Computes the indices based on strided/non-strided access
      #if STRM == 0
        int mg = mia + la0*(MWA/VWM);
      #elif STRM == 1
        int mg = la0 + mia*MDIMA;
      #endif
      // Computes the indices for the global memory
      int kg = kia + la1*KWA;
      int idm = mg + get_group_id(0)*(MWG/VWM);
      int idk = kg + kwg;
      // Loads the data from global memory (not transposed) into the local memory
      alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
    }
  }
 }
 #endif
 // Same as above, but now for the B input matrix
 #if SB == 1
 inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
                           const int kSizeN, const int tid, const int kwg) {
  const int lb0 = tid % NDIMB;
  const int lb1 = tid / NDIMB;
  #pragma unroll
  for (int kib=0; kib<KWB; ++kib) {
    #pragma unroll
    for (int nib=0; nib<NWB/VWN; ++nib) {
      // Computes the indices based on strided/non-strided access
      #if STRN == 0
        int ng = nib + lb0*(NWB/VWN);
      #elif STRN == 1
        int ng = lb0 + nib*NDIMB;
      #endif
      // Computes the indices for the global memory
      int kg = kib + lb1*KWB;
      int idn = ng + get_group_id(1)*(NWG/VWN);
      int idk = kg + kwg;
      // Loads the data from global memory (transposed) into the local memory
      blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
    }
  }
 }
 #endif
 // =================================================================================================
 // Caches global off-chip memory directly into per-thread private memory (registers). This function
 // is specific for caching the A input matrix.
 #if SA == 0
 inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
                             const int kSizeM, const int idk, const int kwg) {
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    // Computes the indices based on strided/non-strided access
    #if STRM == 0
      int mg = mi + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int mg = get_local_id(0) + mi*MDIMC;
    #endif
    // Computes the indices for the global memory
    int idm = mg + get_group_id(0)*(MWG/VWM);
    // Loads the data from global memory (not transposed) and stores into registers
    apm[mi] = agm[idk*(kSizeM/VWM) + idm];
  }
 }
 #endif
 // Same as above, but now for the B input matrix
 #if SB == 0
 inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
                             const int kSizeN, const int idk) {
  #pragma unroll
  for (int ni=0; ni<NWI/VWN; ++ni) {
    // Computes the indices based on strided/non-strided access
    #if STRN == 0
      int ng = ni + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int ng = get_local_id(1) + ni*NDIMC;
    #endif
    // Computes the indices for the global memory
    int idn = ng + get_group_id(1)*(NWG/VWN);
    // Loads the data from global memory (transposed) and stores into registers
    bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
  }
 }
 #endif
 // =================================================================================================
 // Caches on-chip local memory into per-thread private memory (registers). This function is specific
 // for caching the A input matrix.
 #if SA == 1
 inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #if STRM == 0
      int mg = mi + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int mg = get_local_id(0) + mi*MDIMC;
    #endif
    apm[mi] = alm[kg*(MWG/VWM) + mg];
  }
 }
 #endif
 // Same as above, but now for the B input matrix
 #if SB == 1
 inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
  #pragma unroll
  for (int ni=0; ni<NWI/VWN; ++ni) {
    #if STRN == 0
      int ng = ni + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int ng = get_local_id(1) + ni*NDIMC;
    #endif
    bpm[ni] = blm[kg*(NWG/VWN) + ng];
  }
 }
 #endif
 // =================================================================================================
 // The vectorised multiply-add function
 inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
  #if USE_VECTOR_MAD == 1
--- a/src/routines/level1/xdotu.cc
+++ b/src/routines/level1/xdotu.cc
@ -14,7 +14,6 @@
 #include "internal/routines/level1/xdotu.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@ -33,6 +33,7 @@ Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
    Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
  source_string_ =
    #include "../../kernels/level2/xgemv.opencl"
    #include "../../kernels/level2/xgemv_fast.opencl"
  ;
 }
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@ -0,0 +1,112 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xger class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xger.h"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Defines the static 'precision_' member for each supported element type 'T': float and double
 // map onto the real-valued precisions, float2 and double2 onto the complex-valued ones. The value
 // is handed to the Routine base-class by the constructor of this class.
 template <> const Precision Xger<float>::precision_ = Precision::kSingle;
 template <> const Precision Xger<double>::precision_ = Precision::kDouble;
 template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards the queue, event and name to the Routine base-class constructor, together
 // with the list {"Xger"} and the precision belonging to 'T'. Afterwards the kernel source is set:
 // the contents of the two included kernel files are joined into 'source_string_' (the files are
 // expected to each hold a C++11 raw string literal, so the adjacent literals are concatenated).
 // NOTE(review): {"Xger"} presumably names the tuning-parameter entries to load - confirm in the
 // Routine class.
 template <typename T>
 Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
    Routine<T>(queue, event, name, {"Xger"}, precision_) {
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xger.opencl"
  ;
 }
 // =================================================================================================
 // The main routine: validates the dimensions, the matrix and both vectors, then configures and
 // launches the "Xger" kernel and waits for it to finish. Returns the first error encountered,
 // kInvalidKernel if anything throws while setting up or running the kernel, or kSuccess otherwise.
 template <typename T>
 StatusCode Xger<T>::DoGer(const Layout layout,
                          const size_t m, const size_t n,
                          const T alpha,
                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                          const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
  // Rejects empty problems: both dimensions have to be non-zero
  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
  // Determines the two matrix dimensions as seen by the kernel: they are (m, n) by default and
  // are swapped to (n, m) in case of a row-major layout.
  const auto is_rowmajor = (layout == Layout::kRowMajor);
  auto dim_one = m;
  auto dim_two = n;
  if (is_rowmajor) {
    dim_one = n;
    dim_two = m;
  }
  // Validates the matrix first, followed by the X and the Y vectors
  auto status = TestMatrixA(dim_one, dim_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  try {
    // Retrieves the Xger kernel from the compiled binary
    auto& program = GetProgramFromCache();
    auto kernel = Kernel(program, "Xger");
    // Kernel arguments: the matrix dimensions and the scalar
    kernel.SetArgument(0, static_cast<int>(dim_one));
    kernel.SetArgument(1, static_cast<int>(dim_two));
    kernel.SetArgument(2, alpha);
    // Kernel arguments: the X vector
    kernel.SetArgument(3, x_buffer());
    kernel.SetArgument(4, static_cast<int>(x_offset));
    kernel.SetArgument(5, static_cast<int>(x_inc));
    // Kernel arguments: the Y vector
    kernel.SetArgument(6, y_buffer());
    kernel.SetArgument(7, static_cast<int>(y_offset));
    kernel.SetArgument(8, static_cast<int>(y_inc));
    // Kernel arguments: the A matrix and its layout
    kernel.SetArgument(9, a_buffer());
    kernel.SetArgument(10, static_cast<int>(a_offset));
    kernel.SetArgument(11, static_cast<int>(a_ld));
    kernel.SetArgument(12, static_cast<int>(is_rowmajor));
    // Computes the thread configuration: per dimension the size divided (rounded up) by the
    // work-per-thread, rounded up to a multiple of the work-group size.
    auto dim_one_ceiled = Ceil(CeilDiv(dim_one, db_["WPT"]), db_["WGS1"]);
    auto dim_two_ceiled = Ceil(CeilDiv(dim_two, db_["WPT"]), db_["WGS2"]);
    auto global_range = std::vector<size_t>{dim_one_ceiled, dim_two_ceiled};
    auto local_range = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
    // Launches the kernel
    status = RunKernel(kernel, global_range, local_range);
    if (ErrorIn(status)) { return status; }
    // Waits for all kernels to finish
    queue_.Finish();
    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the two real-valued and the two
 // complex-valued element types.
 template class Xger<float>;
 template class Xger<double>;
 template class Xger<float2>;
 template class Xger<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cc
@ -0,0 +1,53 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xgerc class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xgerc.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: forwards all arguments unchanged to the constructor of the Xger base class. No
 // additional set-up is done here.
 template <typename T>
 Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
    Xger<T>(queue, event, name) {
 }
 // =================================================================================================
 // The main routine: GERC shares its host code with GER, so all arguments are passed on unchanged
 // to DoGer of the base class. The conjugation that sets GERC apart is performed inside the
 // kernel, guarded by the ROUTINE_GERC guard.
 template <typename T>
 StatusCode Xgerc<T>::DoGerc(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
  return this->DoGer(layout, m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc,
                     a_buffer, a_offset, a_ld);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the two complex-valued element types
 // only.
 template class Xgerc<float2>;
 template class Xgerc<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cc
@ -0,0 +1,52 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xgeru class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xgeru.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: forwards all arguments unchanged to the constructor of the Xger base class. No
 // additional set-up is done here.
 template <typename T>
 Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
    Xger<T>(queue, event, name) {
 }
 // =================================================================================================
 // The main routine: GERU is the regular GER operation on complex data, so all arguments are
 // passed on unchanged to DoGer of the base class.
 template <typename T>
 StatusCode Xgeru<T>::DoGeru(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
  return this->DoGer(layout, m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc,
                     a_buffer, a_offset, a_ld);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the two complex-valued element types
 // only.
 template class Xgeru<float2>;
 template class Xgeru<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@ -0,0 +1,122 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xher.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Specific implementations to get the memory-type based on a template argument: defines the static
 // 'precision_' member for each of the four <T,U> combinations. The constructor passes this value
 // on to the Routine base class.
 template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
 template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
 template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards to base class constructor, adding the list {"Xger"} and the precision of
 // this instantiation, and afterwards stores the OpenCL sources of this routine in 'source_string_'.
 // NOTE(review): {"Xger"} presumably selects the set of tuning parameters to use (the main routine
 // reads "WPT", "WGS1" and "WGS2" from 'db_') - confirm against the Routine class.
 template <typename T, typename U>
 Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name):
     Routine<T>(queue, event, name, {"Xger"}, precision_) {
   // The two files are pulled in by the preprocessor in the middle of the assignment, so their
   // contents have to form a valid right-hand side (presumably string literals - verify against the
   // .opencl files). The order matters: level2.opencl comes before xher.opencl.
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher.opencl"
   ;
 }
 // =================================================================================================
 // Specializations to compute alpha of type 'T' from the alpha of type 'U' given to the routine.
 // For float2 and double2 the given value becomes the first component and the second component is
 // set to zero (presumably the real and the imaginary part - confirm against the definition of
 // these types). For float and double the value is returned unchanged.
 template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; }
 template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
 template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
 template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
 // =================================================================================================
 // The main routine: validates the arguments, sets up the "Xher" kernel and runs it. The matrix is
 // tested either as a regular matrix with leading dimension 'a_ld' or, if 'packed' is set, as a
 // packed matrix. Returns kInvalidDimension if 'n' is zero, the status of the first failing buffer
 // test or of a failing kernel launch, kInvalidKernel if anything throws while the kernel is set up
 // or run, and kSuccess otherwise (also when alpha is zero, in which case no kernel is launched).
 template <typename T, typename U>
 StatusCode Xher<T,U>::DoHer(
     const Layout layout, const Triangle triangle, const size_t n, const U alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
     const bool packed) {
   // An empty problem is refused
   if (n == 0) { return StatusCode::kInvalidDimension; }
   // Checks the matrix first (the test depends on the storage format), followed by the vector
   auto status = StatusCode::kSuccess;
   if (!packed) { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
   else { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
   if (ErrorIn(status)) { return status; }
   status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
   if (ErrorIn(status)) { return status; }
   // With an alpha of zero there is nothing to update, so no kernel has to be launched
   if (alpha == U{0}) { return StatusCode::kSuccess; }
   // For row-major data the meaning of the two triangles is swapped
   const auto is_rowmajor = (layout == Layout::kRowMajor);
   const auto is_upper = (is_rowmajor) ? (triangle == Triangle::kLower) :
                                         (triangle == Triangle::kUpper);
   // Creates a version of alpha matching the data-type of the buffers
   const auto matching_alpha = GetAlpha(alpha);
   // Retrieves the Xher kernel from the compiled binary. Everything thrown from here on is reported
   // as an invalid kernel.
   try {
     auto& program = GetProgramFromCache();
     auto kernel = Kernel(program, "Xher");
     // Sets the kernel arguments: sizes, offsets, increments and flags are passed as integers
     kernel.SetArgument(0, static_cast<int>(n));
     kernel.SetArgument(1, matching_alpha);
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
     kernel.SetArgument(5, a_buffer());
     kernel.SetArgument(6, static_cast<int>(a_offset));
     kernel.SetArgument(7, static_cast<int>(a_ld));
     kernel.SetArgument(8, static_cast<int>(is_upper));
     kernel.SetArgument(9, static_cast<int>(is_rowmajor));
     // Computes the two-dimensional thread configuration from the tuning parameters: both
     // dimensions start from 'n' and "WPT" and differ only in the work-group size applied
     const auto work_per_thread = db_["WPT"];
     const auto wgs_one = db_["WGS1"];
     const auto wgs_two = db_["WGS2"];
     auto threads_one = Ceil(CeilDiv(n, work_per_thread), wgs_one);
     auto threads_two = Ceil(CeilDiv(n, work_per_thread), wgs_two);
     auto global = std::vector<size_t>{threads_one, threads_two};
     auto local = std::vector<size_t>{wgs_one, wgs_two};
     // Launches the kernel
     status = RunKernel(kernel, global, local);
     if (ErrorIn(status)) { return status; }
     // Waits for all kernels to finish
     queue_.Finish();
     // Successfully finished the computation
     return StatusCode::kSuccess;
   } catch (...) { return StatusCode::kInvalidKernel; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the same four <T,U> combinations for
 // which 'precision_' and GetAlpha are specialized earlier in this file
 template class Xher<float, float>;
 template class Xher<double, double>;
 template class Xher<float2, float>;
 template class Xher<double2, double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@ -0,0 +1,114 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xher2 class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xher2.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Specific implementations to get the memory-type based on a template argument: defines the static
 // 'precision_' member for each of the four data-types. The constructor passes this value on to the
 // Routine base class.
 template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
 template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
 template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
 template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
 // =================================================================================================
 // Constructor: forwards to base class constructor, adding the list {"Xger"} and the precision of
 // this instantiation, and afterwards stores the OpenCL sources of this routine in 'source_string_'.
 // NOTE(review): {"Xger"} presumably selects the set of tuning parameters to use (the main routine
 // reads "WPT", "WGS1" and "WGS2" from 'db_') - confirm against the Routine class.
 template <typename T>
 Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name):
     Routine<T>(queue, event, name, {"Xger"}, precision_) {
   // The two files are pulled in by the preprocessor in the middle of the assignment, so their
   // contents have to form a valid right-hand side (presumably string literals - verify against the
   // .opencl files). The order matters: level2.opencl comes before xher2.opencl.
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher2.opencl"
   ;
 }
 // =================================================================================================
 // The main routine: validates the arguments, sets up the "Xher2" kernel and runs it. The matrix is
 // tested either as a regular matrix with leading dimension 'a_ld' or, if 'packed' is set, as a
 // packed matrix. Returns kInvalidDimension if 'n' is zero, the status of the first failing buffer
 // test or of a failing kernel launch, kInvalidKernel if anything throws while the kernel is set up
 // or run, and kSuccess otherwise.
 template <typename T>
 StatusCode Xher2<T>::DoHer2(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
     const bool packed) {
   // An empty problem is refused
   if (n == 0) { return StatusCode::kInvalidDimension; }
   // Checks the matrix first (the test depends on the storage format), followed by both vectors
   auto status = StatusCode::kSuccess;
   if (!packed) { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
   else { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
   if (ErrorIn(status)) { return status; }
   status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
   if (ErrorIn(status)) { return status; }
   status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
   if (ErrorIn(status)) { return status; }
   // For row-major data the meaning of the two triangles is swapped
   const auto is_rowmajor = (layout == Layout::kRowMajor);
   const auto is_upper = (is_rowmajor) ? (triangle == Triangle::kLower) :
                                         (triangle == Triangle::kUpper);
   // Retrieves the Xher2 kernel from the compiled binary. Everything thrown from here on is
   // reported as an invalid kernel.
   try {
     auto& program = GetProgramFromCache();
     auto kernel = Kernel(program, "Xher2");
     // Sets the kernel arguments: sizes, offsets, increments and flags are passed as integers
     kernel.SetArgument(0, static_cast<int>(n));
     kernel.SetArgument(1, alpha);
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
     kernel.SetArgument(5, y_buffer());
     kernel.SetArgument(6, static_cast<int>(y_offset));
     kernel.SetArgument(7, static_cast<int>(y_inc));
     kernel.SetArgument(8, a_buffer());
     kernel.SetArgument(9, static_cast<int>(a_offset));
     kernel.SetArgument(10, static_cast<int>(a_ld));
     kernel.SetArgument(11, static_cast<int>(is_upper));
     kernel.SetArgument(12, static_cast<int>(is_rowmajor));
     // Computes the two-dimensional thread configuration from the tuning parameters: both
     // dimensions start from 'n' and "WPT" and differ only in the work-group size applied
     const auto work_per_thread = db_["WPT"];
     const auto wgs_one = db_["WGS1"];
     const auto wgs_two = db_["WGS2"];
     auto threads_one = Ceil(CeilDiv(n, work_per_thread), wgs_one);
     auto threads_two = Ceil(CeilDiv(n, work_per_thread), wgs_two);
     auto global = std::vector<size_t>{threads_one, threads_two};
     auto local = std::vector<size_t>{wgs_one, wgs_two};
     // Launches the kernel
     status = RunKernel(kernel, global, local);
     if (ErrorIn(status)) { return status; }
     // Waits for all kernels to finish
     queue_.Finish();
     // Successfully finished the computation
     return StatusCode::kSuccess;
   } catch (...) { return StatusCode::kInvalidKernel; }
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the same four data-types for which
 // 'precision_' is specialized earlier in this file
 template class Xher2<float>;
 template class Xher2<double>;
 template class Xher2<float2>;
 template class Xher2<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cc
@ -0,0 +1,51 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhpr class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xhpr.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher base class
 template <typename T, typename U>
 Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name)
     : Xher<T,U>(queue, event, name) {}
 // =================================================================================================
 // The main routine: runs DoHer on the packed matrix 'ap_buffer' and returns the status it reports
 template <typename T, typename U>
 StatusCode Xhpr<T,U>::DoHpr(
     const Layout layout, const Triangle triangle, const size_t n, const U alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &ap_buffer, const size_t ap_offset) {
   // There is no leading dimension argument for the packed matrix: 'n' is passed in its place
   const auto ap_ld = n;
   const auto is_packed = true;
   // Specific Xhpr functionality is implemented in the kernel using defines
   return DoHer(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                ap_buffer, ap_offset, ap_ld, is_packed);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for the <float2,float> and
 // <double2,double> combinations, the only ones for which Xhpr is generated by this file
 template class Xhpr<float2, float>;
 template class Xhpr<double2, double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cc
@ -0,0 +1,53 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhpr2 class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xhpr2.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher2 base class
 template <typename T>
 Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name)
     : Xher2<T>(queue, event, name) {}
 // =================================================================================================
 // The main routine: runs DoHer2 on the packed matrix 'ap_buffer' and returns the status it reports
 template <typename T>
 StatusCode Xhpr2<T>::DoHpr2(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
     const Buffer<T> &ap_buffer, const size_t ap_offset) {
   // There is no leading dimension argument for the packed matrix: 'n' is passed in its place
   const auto ap_ld = n;
   const auto is_packed = true;
   // Specific Xhpr2 functionality is implemented in the kernel using defines
   return DoHer2(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                 y_buffer, y_offset, y_inc, ap_buffer, ap_offset, ap_ld, is_packed);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for float2 and double2, the only types
 // for which Xhpr2 is generated by this file
 template class Xhpr2<float2>;
 template class Xhpr2<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cc
@ -0,0 +1,51 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xspr class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xspr.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher<T,T> base class
 template <typename T>
 Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name)
     : Xher<T,T>(queue, event, name) {}
 // =================================================================================================
 // The main routine: runs DoHer on the packed matrix 'ap_buffer' and returns the status it reports
 template <typename T>
 StatusCode Xspr<T>::DoSpr(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &ap_buffer, const size_t ap_offset) {
   // There is no leading dimension argument for the packed matrix: 'n' is passed in its place
   const auto ap_ld = n;
   const auto is_packed = true;
   // Specific Xspr functionality is implemented in the kernel using defines
   return DoHer(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                ap_buffer, ap_offset, ap_ld, is_packed);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for float and double, the only types for
 // which Xspr is generated by this file
 template class Xspr<float>;
 template class Xspr<double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cc
@ -0,0 +1,53 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xspr2 class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xspr2.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher2 base class
 template <typename T>
 Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name)
     : Xher2<T>(queue, event, name) {}
 // =================================================================================================
 // The main routine: runs DoHer2 on the packed matrix 'ap_buffer' and returns the status it reports
 template <typename T>
 StatusCode Xspr2<T>::DoSpr2(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
     const Buffer<T> &ap_buffer, const size_t ap_offset) {
   // There is no leading dimension argument for the packed matrix: 'n' is passed in its place
   const auto ap_ld = n;
   const auto is_packed = true;
   // Specific Xspr2 functionality is implemented in the kernel using defines
   return DoHer2(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                 y_buffer, y_offset, y_inc, ap_buffer, ap_offset, ap_ld, is_packed);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for float and double, the only types for
 // which Xspr2 is generated by this file
 template class Xspr2<float>;
 template class Xspr2<double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cc
@ -0,0 +1,50 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xsyr.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher<T,T> base class
 template <typename T>
 Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name)
     : Xher<T,T>(queue, event, name) {}
 // =================================================================================================
 // The main routine: passes every argument on, unchanged and in the same order, to the DoHer
 // routine and returns whatever status that call reports.
 template <typename T>
 StatusCode Xsyr<T>::DoSyr(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
   // Specific Xsyr functionality is implemented in the kernel using defines. The final 'packed'
   // argument of DoHer is left out here, so the default from its declaration applies (presumably
   // 'not packed' - verify in the Xher header).
   const auto status = DoHer(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                             a_buffer, a_offset, a_ld);
   return status;
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for float and double, the only types for
 // which Xsyr is generated by this file
 template class Xsyr<float>;
 template class Xsyr<double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cc
@ -0,0 +1,52 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xsyr2 class (see the header for information about the class).
 //
 // =================================================================================================
 #include "internal/routines/level2/xsyr2.h"
 #include <string>
 namespace clblast {
 // =================================================================================================
 // Constructor: adds nothing of its own, all three arguments are handed to the Xher2 base class
 template <typename T>
 Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name)
     : Xher2<T>(queue, event, name) {}
 // =================================================================================================
 // The main routine: passes every argument on, unchanged and in the same order, to the DoHer2
 // routine and returns whatever status that call reports.
 template <typename T>
 StatusCode Xsyr2<T>::DoSyr2(
     const Layout layout, const Triangle triangle, const size_t n, const T alpha,
     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
   // Specific Xsyr2 functionality is implemented in the kernel using defines. The final 'packed'
   // argument of DoHer2 is left out here, so the default from its declaration applies (presumably
   // 'not packed' - verify in the Xher2 header).
   const auto status = DoHer2(layout, triangle, n, alpha, x_buffer, x_offset, x_inc,
                              y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld);
   return status;
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for float and double, the only types for
 // which Xsyr2 is generated by this file
 template class Xsyr2<float>;
 template class Xsyr2<double>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@ -30,13 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/level3/copy.opencl"
    #include "../../kernels/level3/pad.opencl"
    #include "../../kernels/level3/transpose.opencl"
    #include "../../kernels/level3/padtranspose.opencl"
-    #include "../../kernels/level3/xgemm.opencl"
+    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
  ;
 }
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@ -28,13 +28,14 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/level3/copy.opencl"
    #include "../../kernels/level3/pad.opencl"
    #include "../../kernels/level3/transpose.opencl"
    #include "../../kernels/level3/padtranspose.opencl"
-    #include "../../kernels/level3/xgemm.opencl"
+    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
  ;
 }
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@ -28,13 +28,14 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/level3/copy.opencl"
    #include "../../kernels/level3/pad.opencl"
    #include "../../kernels/level3/transpose.opencl"
    #include "../../kernels/level3/padtranspose.opencl"
-    #include "../../kernels/level3/xgemm.opencl"
+    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
  ;
 }
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@ -30,13 +30,14 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
 // Constructor: forwards to base class constructor
 template <typename T>
 Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/level3/copy.opencl"
    #include "../../kernels/level3/pad.opencl"
    #include "../../kernels/level3/transpose.opencl"
    #include "../../kernels/level3/padtranspose.opencl"
-    #include "../../kernels/level3/xgemm.opencl"
+    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
  ;
 }
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@ -30,13 +30,14 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
 // Constructor: forwards to base class constructor
 template <typename T>
 Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
  source_string_ =
    #include "../../kernels/level3/copy.opencl"
    #include "../../kernels/level3/pad.opencl"
    #include "../../kernels/level3/transpose.opencl"
    #include "../../kernels/level3/padtranspose.opencl"
-    #include "../../kernels/level3/xgemm.opencl"
+    #include "../../kernels/level3/xgemm_part1.opencl"
    #include "../../kernels/level3/xgemm_part2.opencl"
  ;
 }
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/xgemm.cc
@ -31,7 +31,8 @@ class TuneXgemm {
  static std::string GetSources() {
    return
      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/xgemm.opencl"
+      #include "../src/kernels/level3/xgemm_part1.opencl"
      #include "../src/kernels/level3/xgemm_part2.opencl"
    ;
  }
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@ -35,6 +35,7 @@ class TuneXgemv {
    return
      #include "../src/kernels/common.opencl"
      #include "../src/kernels/level2/xgemv.opencl"
      #include "../src/kernels/level2/xgemv_fast.opencl"
    ;
  }
@ -60,8 +61,8 @@ class TuneXgemv {
  // Sets the tuning parameters and their possible values
  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
+    tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
-    tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
+    tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
    if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
  }
@ -72,7 +73,10 @@ class TuneXgemv {
      tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
    }
  }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
    auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
  }
  // Sets the base thread configuration
  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; }
@ -108,6 +112,9 @@ class TuneXgemv {
    tuner.AddArgumentScalar(0);
    tuner.AddArgumentScalar(1);
    tuner.AddArgumentScalar(0); // Conjugate transpose
    tuner.AddArgumentScalar(0); // Additional parameter
    tuner.AddArgumentScalar(0); // Banded 'kl'
    tuner.AddArgumentScalar(0); // Banded 'ku'
  }
  // Describes how to compute the performance metrics
--- a/src/tuning/xger.cc
+++ b/src/tuning/xger.cc
@ -0,0 +1,129 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
 //
 // =================================================================================================
 #include <string>
 #include <vector>
 #include "internal/utilities.h"
 #include "internal/tuning.h"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 // Describes the Xger kernel to the tuner through static member functions only: kernel sources,
 // default problem sizes, buffer sizes, tuning parameters, thread configuration, kernel arguments
 // and the performance metric. The class has no data members. T is the element type of the host
 // vectors handed to SetArguments.
 // NOTE(review): the set of member names is presumably dictated by the clblast::Tuner template
 // that main() in this file instantiates with this class - confirm in "internal/tuning.h".
 template <typename T>
 class TuneXger {
 public:
  // The representative kernel and the source code
  static std::string KernelFamily() { return "xger"; }
  static std::string KernelName() { return "Xger"; }
  // Returns the contents of the three included .opencl files as one string.
  // NOTE(review): this only compiles if each included file expands to a string literal, so that
  // the adjacent literals concatenate - confirm in the kernel files.
  static std::string GetSources() {
    return
      #include "../src/kernels/common.opencl"
      #include "../src/kernels/level2/level2.opencl"
      #include "../src/kernels/level2/xger.opencl"
    ;
  }
  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() { return {kArgN, kArgM, kArgAlpha}; }
  // Tests for valid arguments (nothing is checked for this kernel: the body is empty)
  static void TestValidArguments(const Arguments<T> &) { }
  // Sets the default values for the arguments
  static size_t DefaultM() { return 1024; }
  static size_t DefaultN() { return 1024; }
  static size_t DefaultK() { return 1; } // N/A for this kernel
  static double DefaultFraction() { return 1.0; } // N/A for this kernel
  // Describes how to obtain the sizes of the buffers: m for x, n for y and m*n for A. The unused
  // buffers report a size of 1 rather than 0.
  // NOTE(review): sizes are presumably counted in elements of T, not bytes - confirm.
  static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
  static size_t GetSizeY(const Arguments<T> &args) { return args.n; }
  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
  static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel
  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
  // Sets the tuning parameters and their possible values
  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
    tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512});
    tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256});
    tuner.AddParameter(id, "WPT", {1, 2, 4});
  }
  // Sets the constraints and local memory size (both bodies are empty: nothing is registered)
  static void SetConstraints(cltune::Tuner &, const size_t) { }
  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
  // Sets the base thread configuration: a two-dimensional m by n global size and a 1 by 1 local
  // size for the tuned kernel. The "Ref" variants use the same global size with a fixed 8 by 8
  // local size.
  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
  static std::vector<size_t> LocalSize() { return {1, 1}; }
  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
  // Transforms the thread configuration based on the parameters
  // NOTE(review): presumably each inner list holds one parameter name per dimension, making the
  // local size WGS1 by WGS2 and dividing both global dimensions by WPT - confirm in the tuner.
  using TransformVector = std::vector<std::vector<std::string>>;
  static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; }
  static TransformVector DivLocal() { return {}; }
  static TransformVector MulGlobal() { return {}; }
  static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; }
  // Sets the kernel's arguments. Only the x, y and A vectors are used; the last three vector
  // parameters are unnamed and ignored. Offsets are 0, increments are 1 and the leading dimension
  // of A is set to m.
  // NOTE(review): the call order presumably has to match the parameter order of the Xger kernel -
  // confirm against xger.opencl before reordering anything here.
  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
                           std::vector<T> &x_vec, std::vector<T> &y_vec,
                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                           std::vector<T> &) {
    tuner.AddArgumentScalar(static_cast<int>(args.m));
    tuner.AddArgumentScalar(static_cast<int>(args.n));
    tuner.AddArgumentScalar(args.alpha);
    tuner.AddArgumentInput(x_vec);
    tuner.AddArgumentScalar(0); // x_offset
    tuner.AddArgumentScalar(1); // x_increment
    tuner.AddArgumentInput(y_vec);
    tuner.AddArgumentScalar(0); // y_offset
    tuner.AddArgumentScalar(1); // y_increment
    tuner.AddArgumentOutput(a_mat);
    tuner.AddArgumentScalar(0); // a_offset
    tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
    tuner.AddArgumentScalar(0); // a_is_rowmajor
  }
  // Describes how to compute the performance metrics: (2*m*n + m + n) multiplied by
  // GetBytes(args.precision), reported with the unit string "GB/s".
  // NOTE(review): presumably A counts twice (read and written) and x and y once each - confirm.
  static size_t GetMetric(const Arguments<T> &args) {
    return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision);
  }
  static std::string PerformanceUnit() { return "GB/s"; }
 };
 // =================================================================================================
 } // namespace clblast
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
 using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // Reads the requested precision from the command line and runs the Xger tuner for the matching
  // element type. Half precision is rejected with an exception.
  const auto precision = clblast::GetPrecision(argc, argv);
  switch(precision) {
    case clblast::Precision::kSingle:
      clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv);
      break;
    case clblast::Precision::kDouble:
      clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv);
      break;
    case clblast::Precision::kComplexSingle:
      clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv);
      break;
    case clblast::Precision::kComplexDouble:
      clblast::Tuner<clblast::TuneXger<double2>, double2>(argc, argv);
      break;
    case clblast::Precision::kHalf:
      throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
 }
 // =================================================================================================
--- a/src/utilities.cc
+++ b/src/utilities.cc
@ -103,7 +103,13 @@ std::string ToString(Precision value) {
 // both the real and imaginary parts.
 template <typename T>
 T ConvertArgument(const char* value) {
  // Generic case: after this change the text is parsed as an int by std::stoi and then cast to T.
  // std::stoi throws std::invalid_argument for non-numeric text and std::out_of_range for values
  // that do not fit in an int, so this path cannot yield values above INT_MAX even when T is a
  // wider type. Fractional input is truncated at the first non-digit character.
-  return static_cast<T>(std::stod(value));
+  return static_cast<T>(std::stoi(value));
 }
 // Floating-point types keep parsing through std::stod so that fractional values are preserved
 template <> float ConvertArgument(const char* value) {
  return static_cast<float>(std::stod(value));
 }
 template <> double ConvertArgument(const char* value) {
  return static_cast<double>(std::stod(value));
 }
 template <> float2 ConvertArgument(const char* value) {
  auto val = static_cast<float>(std::stod(value));
@ -139,7 +145,6 @@ T GetArgument(const int argc, char *argv[], std::string &help,
 }
 // Compiles the above function
 template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
 template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
 template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
 template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
@ -156,9 +161,9 @@ template Precision GetArgument<Precision>(const int, char **, std::string&, cons
 // =================================================================================================
 // Returns only the precision argument
-Precision GetPrecision(const int argc, char *argv[]) {
+Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) {
  // 'dummy' is passed where GetArgument expects its help string; it is never read afterwards.
  // The caller-supplied default_precision takes the place of the previously hard-coded
  // Precision::kSingle as the last argument to GetArgument.
  auto dummy = std::string{};
-  return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
+  return GetArgument(argc, argv, dummy, kArgPrecision, default_precision);
 }
 // =================================================================================================
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@ -35,7 +35,7 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
                        const Routine run_routine, const Routine run_reference,
                        const ResultGet get_result, const ResultIndex get_index,
                        const ResultIterator get_id1, const ResultIterator get_id2):
-    Tester<T,U>{argc, argv, silent, name, options},
+    Tester<T,U>(argc, argv, silent, name, options),
    run_routine_(run_routine),
    run_reference_(run_reference),
    get_result_(get_result),
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@ -80,11 +80,11 @@ template <typename T, typename U>
 Tester<T,U>::~Tester() {
  if (PrecisionSupported<T>(device_)) {
    fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
-    fprintf(stdout, "   %lu test(s) passed\n", tests_passed_);
+    fprintf(stdout, "   %zu test(s) passed\n", tests_passed_);
    if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
-    fprintf(stdout, "   %lu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
+    fprintf(stdout, "   %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
    if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
-    fprintf(stdout, "   %lu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
+    fprintf(stdout, "   %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
  }
  fprintf(stdout, "\n");
  clblasTeardown();
@ -129,29 +129,29 @@ void Tester<T,U>::TestEnd() {
      fprintf(stdout, "   Status code %d (expected %d): ", entry.status_found, entry.status_expect);
    }
    for (auto &o: options_) {
-      if (o == kArgM)        { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
+      if (o == kArgM)        { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); }
-      if (o == kArgN)        { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
+      if (o == kArgN)        { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); }
-      if (o == kArgK)        { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+      if (o == kArgK)        { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); }
-      if (o == kArgKU)       { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
+      if (o == kArgKU)       { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); }
-      if (o == kArgKL)       { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
+      if (o == kArgKL)       { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); }
      if (o == kArgLayout)   { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
      if (o == kArgATransp)  { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
      if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
      if (o == kArgSide)     { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
      if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
      if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
-      if (o == kArgXInc)     { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
+      if (o == kArgXInc)     { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);}
-      if (o == kArgYInc)     { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
+      if (o == kArgYInc)     { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);}
-      if (o == kArgXOffset)  { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
+      if (o == kArgXOffset)  { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);}
-      if (o == kArgYOffset)  { fprintf(stdout, "%s=%lu ", kArgYOffset, entry.args.y_offset);}
+      if (o == kArgYOffset)  { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);}
-      if (o == kArgALeadDim) { fprintf(stdout, "%s=%lu ", kArgALeadDim, entry.args.a_ld);}
+      if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);}
-      if (o == kArgBLeadDim) { fprintf(stdout, "%s=%lu ", kArgBLeadDim, entry.args.b_ld);}
+      if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);}
-      if (o == kArgCLeadDim) { fprintf(stdout, "%s=%lu ", kArgCLeadDim, entry.args.c_ld);}
+      if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);}
-      if (o == kArgAOffset)  { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
+      if (o == kArgAOffset)  { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);}
-      if (o == kArgBOffset)  { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
+      if (o == kArgBOffset)  { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);}
-      if (o == kArgCOffset)  { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
+      if (o == kArgCOffset)  { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);}
-      if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
+      if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);}
-      if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
+      if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);}
    }
    fprintf(stdout, "\n");
  }
@ -159,18 +159,18 @@ void Tester<T,U>::TestEnd() {
  // Prints a test summary
  auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
  fprintf(stdout, "   Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
-  fprintf(stdout, " %lu passed /", num_passed_);
+  fprintf(stdout, " %zu passed /", num_passed_);
  if (num_skipped_ != 0) {
-    fprintf(stdout, " %s%lu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
+    fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
  }
  else {
-    fprintf(stdout, " %lu skipped /", num_skipped_);
+    fprintf(stdout, " %zu skipped /", num_skipped_);
  }
  if (num_failed_ != 0) {
-    fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
+    fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
  }
  else {
-    fprintf(stdout, " %lu failed\n", num_failed_);
+    fprintf(stdout, " %zu failed\n", num_failed_);
  }
 }
@ -280,21 +280,21 @@ bool TestSimilarity(const T val1, const T val2) {
  const auto difference = std::fabs(val1 - val2);
  // Set the allowed error margin for floating-point comparisons
-  constexpr auto kErrorMarginRelative = 1.0e-2;
+  constexpr auto kErrorMarginRelative = T{0.025};
-  constexpr auto kErrorMarginAbsolute = 1.0e-10;
+  constexpr auto kErrorMarginAbsolute = T{1.0e-6};
  // Shortcut, handles infinities
  if (val1 == val2) {
    return true;
  }
  // The values are zero or very small: the relative error is less meaningful
-  else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
+  else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
-    return (difference < static_cast<T>(kErrorMarginAbsolute));
+    return (difference < kErrorMarginAbsolute);
  }
  // Use relative error
  else {
    const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
-    return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
+    return (difference / absolute_sum) < kErrorMarginRelative;
  }
 }
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@ -15,6 +15,7 @@
 #include <string>
 #include <vector>
 #include <utility>
 #include <algorithm>
 #include <chrono>
@ -48,11 +49,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
  for (auto &o: options_) {
    // Data-sizes
-    if (o == kArgM)  { args.m   = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgM)  { args.m   = GetArgument(argc, argv, help, kArgM, size_t{512}); }
-    if (o == kArgN)  { args.n   = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgN)  { args.n   = GetArgument(argc, argv, help, kArgN, size_t{512}); }
-    if (o == kArgK)  { args.k   = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgK)  { args.k   = GetArgument(argc, argv, help, kArgK, size_t{512}); }
-    if (o == kArgKU) { args.ku  = GetArgument(argc, argv, help, kArgKU, 128UL); }
+    if (o == kArgKU) { args.ku  = GetArgument(argc, argv, help, kArgKU, size_t{128}); }
-    if (o == kArgKL) { args.kl  = GetArgument(argc, argv, help, kArgKL, 128UL); }
+    if (o == kArgKL) { args.kl  = GetArgument(argc, argv, help, kArgKL, size_t{128}); }
    // Data-layouts
    if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -89,7 +90,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
  args.platform_id    = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
  args.device_id      = GetArgument(argc, argv, help, kArgDevice, size_t{0});
  args.precision      = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
-  args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, true);
+  args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
  args.step           = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
  args.num_steps      = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
  args.num_runs       = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@ -112,7 +113,7 @@ template <typename T, typename U>
 void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
  // Prints the header of the output table
-  PrintTableHeader(args.silent, options_);
+  PrintTableHeader(args);
  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
@ -162,11 +163,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
    // Runs the routines and collects the timings
    auto timings = std::vector<std::pair<std::string, double>>();
    auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
-    auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
+    timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
    if (args.compare_clblas) {
      auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
      timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
    }
-    // Prints the performance of both libraries
+    // Prints the performance of the tested libraries
-    PrintTableRow(args, ms_clblast, ms_clblas);
+    PrintTableRow(args, timings);
    // Makes the jump to the next step
    ++s;
@ -213,20 +219,27 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
 // Prints the header of the performance table
 // After this change the header is driven by 'args' and the options_ member instead of a
 // 'silent' flag plus an explicit list of names. Unless args.silent is set, a banner line is
 // printed first: one blank 9-wide column per entry in options_, then a "CLBlast" label and, only
 // when args.compare_clblas is set, a "clBLAS" label. The second line is always printed: the
 // option names separated by ';', followed by the ms/GFLOPS/GBs titles for library 1 and, only
 // when args.compare_clblas is set, the same three titles for library 2.
 template <typename T, typename U>
-void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
+void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
-  if (!silent) {
+
-    for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
+  // First line (optional)
-    fprintf(stdout, " | <--       CLBlast       --> | <--      clBLAS      --> |\n");
+  if (!args.silent) {
    for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
    fprintf(stdout, " | <--       CLBlast       -->");
    if (args.compare_clblas) { fprintf(stdout, " | <--       clBLAS        -->"); }
    fprintf(stdout, " |\n");
  }
-  for (auto &argument: args) { fprintf(stdout, "%9s;", argument.c_str()); }
+
-  fprintf(stdout, "%9s;%9s;%9s;%9s;%9s;%9s\n",
+  // Second line
-          "ms_1", "GFLOPS_1", "GBs_1", "ms_2", "GFLOPS_2", "GBs_2");
+  for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
  fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
  if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
  fprintf(stdout, "\n");
 }
 // Print a performance-result row
 template <typename T, typename U>
-void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
+void Client<T,U>::PrintTableRow(const Arguments<U>& args,
-                                const double ms_clblas) {
+                                const std::vector<std::pair<std::string, double>>& timings) {
  // Creates a vector of relevant variables
  auto integers = std::vector<size_t>{};
@ -261,34 +274,36 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
    else if (o == kArgBeta) {     strings.push_back(ToString(args.beta)); }
  }
  // Computes the GFLOPS and GB/s metrics
  auto flops = get_flops_(args);
  auto bytes = get_bytes_(args);
  auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
  auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
  auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
  auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
  // Outputs the argument values
  for (auto &argument: integers) {
    if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
-      fprintf(stdout, "%8luM;", argument/(1024*1024));
+      fprintf(stdout, "%8zuM;", argument/(1024*1024));
    }
    else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
-      fprintf(stdout, "%8luK;", argument/1024);
+      fprintf(stdout, "%8zuK;", argument/1024);
    }
    else {
-      fprintf(stdout, "%9lu;", argument);
+      fprintf(stdout, "%9zu;", argument);
    }
  }
  for (auto &argument: strings) {
    fprintf(stdout, "%9s;", argument.c_str());
  }
-  // Outputs the performance numbers
+  // Loops over all tested libraries
-  fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf;%9.2lf;%9.1lf;%9.1lf\n",
+  for (const auto& timing : timings) {
-          ms_clblast, gflops_clblast, gbs_clblast,
+
-          ms_clblas, gflops_clblas, gbs_clblas);
+    // Computes the GFLOPS and GB/s metrics
    auto flops = get_flops_(args);
    auto bytes = get_bytes_(args);
    auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
    auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
    // Outputs the performance numbers
    if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
    fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
  }
  fprintf(stdout, "\n");
 }
 // =================================================================================================
--- a/test/performance/client.h
+++ b/test/performance/client.h
@ -23,6 +23,7 @@
 #include <string>
 #include <vector>
 #include <utility>
 // The libraries to test
 #include <clBLAS.h>
@ -64,10 +65,11 @@ class Client {
                        Queue &queue, Routine run_blas, const std::string &library_name);
  // Prints the header of a performance-data table
-  void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
+  void PrintTableHeader(const Arguments<U>& args);
  // Prints a row of performance data, including results of two libraries
-  void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
+  void PrintTableRow(const Arguments<U>& args,
                     const std::vector<std::pair<std::string, double>>& timings);
  // The routine-specific functions passed to the tester
  const Routine run_routine_;
--- a/test/performance/graphs/common.r
+++ b/test/performance/graphs/common.r
@ -63,7 +63,7 @@ main <- function(routine_name, precision, test_names, test_values,
  if (precision == 64) { display_name <- gsub("^X","D",display_name); }
  if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
  if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
-  executable <- paste("./client_", routine_name, sep="")
+  executable <- paste("./clblast_client_", routine_name, sep="")
  # Configures the outputfile
  pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
--- a/test/performance/routines/level1/xaxpy.cc
+++ b/test/performance/routines/level1/xaxpy.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level1/xcopy.cc
+++ b/test/performance/routines/level1/xcopy.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level1/xdot.cc
+++ b/test/performance/routines/level1/xdot.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level1/xdotc.cc
+++ b/test/performance/routines/level1/xdotc.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level1/xdotu.cc
+++ b/test/performance/routines/level1/xdotu.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level1/xscal.cc
+++ b/test/performance/routines/level1/xscal.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level1/xswap.cc
+++ b/test/performance/routines/level1/xswap.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level2/xgbmv.cc
+++ b/test/performance/routines/level2/xgbmv.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level2/xgemv.cc
+++ b/test/performance/routines/level2/xgemv.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level2/xger.cc
+++ b/test/performance/routines/level2/xger.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;
--- a/test/performance/routines/level2/xgerc.cc
+++ b/test/performance/routines/level2/xgerc.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level2/xgeru.cc
+++ b/test/performance/routines/level2/xgeru.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level2/xhbmv.cc
+++ b/test/performance/routines/level2/xhbmv.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level2/xhemv.cc
+++ b/test/performance/routines/level2/xhemv.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level2/xher.cc
+++ b/test/performance/routines/level2/xher.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/test/performance/routines/level2/xher2.cc
+++ b/test/performance/routines/level2/xher2.cc
@ -18,7 +18,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
--- a/Show more
+++ b/Show more