Merge pull request #31 from CNugteren/development

Update to version 0.6.0
This commit is contained in:
Cedric Nugteren 2016-03-13 11:05:51 +01:00
commit d190becd89
137 changed files with 6198 additions and 1463 deletions

3
.gitignore vendored
View file

@ -1,4 +1,5 @@
build
stash
.*
*.pyc
*.pyc
*.db

View file

@ -1,29 +1,69 @@
language: cpp
sudo: required
dist: trusty
compiler:
- gcc
- clang
addons:
apt:
sources:
# kubuntu-backports contains newer versions of cmake to install
- kubuntu-backports
packages:
- cmake
env:
global:
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo add-apt-repository -y ppa:kalakris/cmake
- sudo apt-get update -qq
- sudo apt-get install -qq gcc-4.8 g++-4.8 clang
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers
- sudo apt-get install -qq cmake
- cmake --version;
- ${CC} --version;
- ${CXX} --version;
install:
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
# The following linux logic is necessary because of Travis's move to the GCE platform, which does not
# currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
# We build our own linkable .so file
- if [ ${TRAVIS_OS_NAME} == "linux" ]; then
mkdir -p ${OPENCL_ROOT};
pushd ${OPENCL_ROOT};
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
mv ./OpenCL-ICD-Loader/* .;
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
pushd inc/CL;
travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
popd;
mkdir -p lib;
pushd lib;
cmake -G "Unix Makefiles" ..;
make;
cp ./bin/libOpenCL.so .;
popd;
pushd inc/CL;
travis_retry git fetch origin opencl12:opencl12;
git checkout opencl12;
popd;
mv inc/ include/;
popd;
fi
before_script:
- mkdir install
- export PATH=`pwd`/install/bin:${PATH}
- export LD_LIBRARY_PATH=`pwd`/install/lib64:`pwd`/install/lib:${LD_LIBRARY_PATH}
- mkdir build
- cd build
- cmake -DCMAKE_INSTALL_PREFIX:PATH=../install ..
- mkdir -p ${CLBLAST_ROOT}
- pushd ${CLBLAST_ROOT}
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
script:
- make
- make install
branches:
only:
- master
- development
notifications:
email: false

View file

@ -1,4 +1,21 @@
Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
- Added tuned parameters for various devices (see README)
- Now automatically generates C++ code from JSON tuning results
- Added level-2 routines:
* SGER/DGER
* CGERU/ZGERU
* CGERC/ZGERC
* CHER/ZHER
* CHPR/ZHPR
* CHER2/ZHER2
* CHPR2/ZHPR2
* CSYR/ZSYR
* CSPR/ZSPR
* CSYR2/ZSYR2
* CSPR2/ZSPR2
Version 0.5.0
- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
- Reduced compilation time of level-3 OpenCL kernels

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 5)
set(clblast_VERSION_MINOR 6)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -55,16 +55,21 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()
# C++ compiler settings
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(FLAGS "/Ox")
set(FLAGS "${FLAGS} /wd4715")
else ()
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
@ -102,14 +107,15 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)
set(PRECISIONS 32 64 3232 6464)
# ==================================================================================================

View file

@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and might not be mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details (and how to tune yourself).
Why CLBlast and not clBLAS or cuBLAS?
@ -17,6 +17,9 @@ Use CLBlast instead of clBLAS:
* When you care about achieving maximum performance.
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices which you need to tune yourself.
* When you are still running on OpenCL 1.1 hardware.
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices
Use CLBlast instead of cuBLAS:
@ -41,10 +44,13 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK
- AMD APP SDK
- Intel OpenCL
- Beignet
An example of an out-of-source build (starting from the root of the CLBlast folder):
@ -79,13 +85,27 @@ Using the tuners (optional)
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
* NVIDIA GPUs:
- GeForce GTX480
- GeForce GTX 480
- GeForce GTX 680
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX Titan
- GeForce GTX Titan X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
- Tahiti
- R9 M370X
* Intel GPUs:
- Iris
- Iris Pro
* Intel CPUs:
- Core i5-6200U
- Core i7-3770K
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
- Intel MIC
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
@ -93,9 +113,19 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the JSON output in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuners output a JSON file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
mkdir build
cd build
cmake -DTUNERS=ON ..
make
make alltuners
python ../scripts/database/database.py . ..
make
Compiling the tests (optional)
@ -127,10 +157,11 @@ These graphs can be generated automatically on your own device. First, compile C
Rscript path/to/test/performance/graphs/xgemm.r 0 1
Supported routines
-------------
CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -149,7 +180,6 @@ CLBlast is in active development but already supports the majority of BLAS routi
| xASUM | | | - | - | +SC +DZ |
| IxAMAX | | | | | |
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
@ -166,17 +196,17 @@ CLBlast is in active development but already supports the majority of BLAS routi
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
| xGER | | | - | - | |
| xGERU | - | - | | | |
| xGERC | - | - | | | |
| xHER | - | - | | | |
| xHPR | - | - | | | |
| xHER2 | - | - | | | |
| xHPR2 | - | - | | | |
| xSYR | | | - | - | |
| xSPR | | | - | - | |
| xSYR2 | | | - | - | |
| xSPR2 | | | - | - | |
| xGER | ✔ | ✔ | - | - | |
| xGERU | - | - | ✔ | ✔ | |
| xGERC | - | - | ✔ | ✔ | |
| xHER | - | - | ✔ | ✔ | |
| xHPR | - | - | ✔ | ✔ | |
| xHER2 | - | - | ✔ | ✔ | |
| xHPR2 | - | - | ✔ | ✔ | |
| xSYR | ✔ | ✔ | - | - | |
| xSPR | ✔ | ✔ | - | - | |
| xSYR2 | ✔ | ✔ | - | - | |
| xSPR2 | ✔ | ✔ | - | - | |
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -200,6 +230,12 @@ The contributing authors so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl)
Tuning and testing on a variety of OpenCL devices was made possible by:
* [TU/e ES research group](http://www.es.ele.tue.nl/)
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [Dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
Support us
-------------
@ -210,20 +246,8 @@ This project started in March 2015 as an evenings and weekends free-time project
To-do list before release of version 1.0
-------------
- Increase the functionality:
* Support all routines supported by clBLAS
* Allow the user control over events and synchronization
* Add half-precision routines (e.g. HGEMM)
- Improve host performance:
* Allow initialization to pre-compile kernels and store to disk
- Improve device performance:
* Tune for a wider range of devices
* Allow users to define custom tuned parameters
- Improve the tuning
* Make the tuners upload their data to a central server
- Improve the performance comparisons:
* Enable comparison against optionally: ViennaCL, cuBLAS, MAGMA OpenCL
- Further reduce the likelihood of crashes:
* Add checks for proper command-line arguments in the tuner, tester and client
* Add checks for valid database parameters
* Test in multi-threaded environments
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
- Enable correctness and performance testing against a CPU-based BLAS library
- Test in multi-threaded environments

View file

@ -34,6 +34,7 @@ set(OPENCL_HINTS
set(OPENCL_PATHS
/usr/local/cuda
/opt/cuda
/opt/intel/opencl
/usr
/usr/local
)
@ -52,7 +53,7 @@ mark_as_advanced(OPENCL_INCLUDE_DIRS)
find_library(OPENCL_LIBRARIES
NAMES OpenCL
HINTS ${OPENCL_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
PATHS ${OPENCL_PATHS}
DOC "OpenCL library"
)

View file

@ -45,7 +45,7 @@ mark_as_advanced(CLBLAS_INCLUDE_DIRS)
find_library(CLBLAS_LIBRARIES
NAMES clBLAS
HINTS ${CLBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CLBLAS_PATHS}
DOC "clBLAS library"
)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -76,7 +76,7 @@ class Event {
explicit Event(const cl_event event): event_(event) { }
// Regular constructor
explicit Event() { }
explicit Event(): event_(nullptr) { }
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
@ -119,6 +119,13 @@ class Platform {
platform_ = platforms[platform_id];
}
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = cl_uint{0};
CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
return static_cast<size_t>(result);
}
// Accessor to the private data-member
const cl_platform_id& operator()() const { return platform_; }
private:
@ -136,11 +143,11 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = cl_uint{0};
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices));
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr));
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -172,6 +179,7 @@ class Device {
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
@ -225,7 +233,7 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
return std::string{result.c_str()};
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
}
};
@ -342,7 +350,12 @@ class Queue {
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#ifdef CL_VERSION_2_0
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
CheckError(status);
}
@ -408,7 +421,7 @@ class BufferHost {
// =================================================================================================
// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
// C++11 version of 'cl_mem'
template <typename T>
@ -418,13 +431,17 @@ class Buffer {
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
explicit Buffer(const cl_mem buffer):
buffer_(new cl_mem),
access_(BufferAccess::kReadWrite) {
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}
// Regular constructor with memory management
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }),
buffer_(new cl_mem, [access](cl_mem* m) {
if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
delete m;
}),
access_(access) {
auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
@ -439,57 +456,74 @@ class Buffer {
Buffer<T>(context, BufferAccess::kReadWrite, size) {
}
// Constructs a new buffer based on an existing host-container
template <typename Iterator>
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
auto size = static_cast<size_t>(end - start);
auto pointer = &*start;
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
nullptr, nullptr));
queue.Finish();
}
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host) {
ReadAsync(queue, size, host);
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host) {
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer
void Write(const Queue &queue, const size_t size, const T* host) {
WriteAsync(queue, size, host);
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
WriteAsync(queue, size, host, offset);
queue.Finish();
}
void Write(const Queue &queue, const size_t size, const std::vector<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
// Copies the contents of this buffer into another device buffer
@ -573,6 +607,13 @@ class Kernel {
0, nullptr, &(event())));
}
// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
}
// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:

View file

@ -56,24 +56,26 @@ class Database {
static constexpr auto kDeviceTypeAll = "default";
// The OpenCL device vendors
static constexpr auto kDeviceVendorNVIDIA = "NVIDIA Corporation";
static constexpr auto kDeviceVendorAMD = "Advanced Micro Devices, Inc.";
static constexpr auto kDeviceVendorIntel = "Intel";
static constexpr auto kDeviceVendorAll = "default";
// The OpenCL device names
static constexpr auto kDefaultDevice = "default";
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string,std::string> kVendorNames {
{"Intel(R) Corporation", "Intel"},
{"GenuineIntel", "Intel"},
{"Advanced Micro Devices, Inc.", "AMD"},
{"NVIDIA Corporation", "NVIDIA"},
};
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
// The constructor

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Copy kernels.
// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
//
// =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::CopySingle = {
"Copy", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
"Copy", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
@ -73,26 +75,100 @@ const Database::DatabaseEntry Database::CopyDouble = {
const Database::DatabaseEntry Database::CopyComplexSingle = {
"Copy", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
"Copy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
@ -102,25 +178,49 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
const Database::DatabaseEntry Database::CopyComplexDouble = {
"Copy", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Pad kernels.
// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
//
// =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::PadSingle = {
"Pad", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
"Pad", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -73,26 +75,108 @@ const Database::DatabaseEntry Database::PadDouble = {
const Database::DatabaseEntry Database::PadComplexSingle = {
"Pad", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
"Pad", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -102,25 +186,49 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
const Database::DatabaseEntry Database::PadComplexDouble = {
"Pad", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}

View file

@ -5,37 +5,67 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraSingle = {
"PadTranspose", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
const Database::DatabaseEntry Database::PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -43,27 +73,58 @@ const Database::DatabaseEntry Database::PadTraSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraDouble = {
"PadTranspose", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
"Padtranspose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -71,28 +132,51 @@ const Database::DatabaseEntry Database::PadTraDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraComplexSingle = {
"PadTranspose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
}
},
const Database::DatabaseEntry Database::PadtransposeDouble = {
"Padtranspose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -100,27 +184,51 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraComplexDouble = {
"PadTranspose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
}
},
const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
"Padtranspose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}

View file

@ -5,37 +5,67 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Transpose kernels.
// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::TraSingle = {
const Database::DatabaseEntry Database::TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -43,56 +73,52 @@ const Database::DatabaseEntry Database::TraSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TraDouble = {
"Transpose", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::TraComplexSingle = {
const Database::DatabaseEntry Database::TransposeComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -100,27 +126,97 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TraComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
const Database::DatabaseEntry Database::TransposeDouble = {
"Transpose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
//
// =================================================================================================
@ -16,26 +16,115 @@ namespace clblast {
const Database::DatabaseEntry Database::XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } },
{ "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } },
{ "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -45,53 +134,49 @@ const Database::DatabaseEntry Database::XaxpySingle = {
const Database::DatabaseEntry Database::XaxpyDouble = {
"Xaxpy", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } },
{ "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
{ "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
}
@ -101,25 +186,49 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
"Xaxpy", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
{ "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xdot kernels.
// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
//
// =================================================================================================
@ -16,22 +16,115 @@ namespace clblast {
const Database::DatabaseEntry Database::XdotSingle = {
"Xdot", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
}
@ -41,45 +134,49 @@ const Database::DatabaseEntry Database::XdotSingle = {
const Database::DatabaseEntry Database::XdotDouble = {
"Xdot", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
}
},
}
@ -89,21 +186,49 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
const Database::DatabaseEntry Database::XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xgemm kernels.
// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
//
// =================================================================================================
@ -16,56 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
}
@ -75,27 +75,108 @@ const Database::DatabaseEntry Database::XgemmDouble = {
const Database::DatabaseEntry Database::XgemmComplexSingle = {
"Xgemm", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
{ "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -105,29 +186,52 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
const Database::DatabaseEntry Database::XgemmComplexDouble = {
"Xgemm", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
{ "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
{ "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xgemv kernels.
// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
//
// =================================================================================================
@ -16,26 +16,97 @@ namespace clblast {
const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
@ -45,53 +116,42 @@ const Database::DatabaseEntry Database::XgemvSingle = {
const Database::DatabaseEntry Database::XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
@ -101,25 +161,35 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
const Database::DatabaseEntry Database::XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}

View file

@ -0,0 +1,188 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xger' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in single precision.
// Per-device parameters are WGS1, WGS2 and WPT (presumably the 2D work-group
// sizes and the work-per-thread factor — confirm against the Xger OpenCL
// kernel and tuner). Entries are grouped by (device type, vendor); each vendor
// section ends with a "default" fallback, and the final kDeviceTypeAll/"default"
// section is the global fallback for unknown devices.
const Database::DatabaseEntry Database::XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in complex single
// precision. Same parameter set as the single-precision table (WGS1/WGS2/WPT);
// each vendor section ends with a "default" fallback and the final
// kDeviceTypeAll/"default" section covers unknown devices.
const Database::DatabaseEntry Database::XgerComplexSingle = {
"Xger", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in double precision.
// Same parameter set as the single-precision table (WGS1/WGS2/WPT). Note there
// is no Intel-GPU section here (no double-precision results for those devices);
// such devices fall through to the kDeviceTypeAll/"default" section.
const Database::DatabaseEntry Database::XgerDouble = {
"Xger", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in complex double
// precision. Same parameter set as the other tables (WGS1/WGS2/WPT); as in the
// double-precision table there is no Intel-GPU section, so those devices use
// the kDeviceTypeAll/"default" fallback.
const Database::DatabaseEntry Database::XgerComplexDouble = {
"Xger", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGER_H_
#define CLBLAST_ROUTINES_XGER_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xger implements the BLAS 'GER' routine: a rank-1 update of the m-by-n matrix
// A using vectors x and y. The element type is the template parameter T.
template <typename T>
class Xger: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, the kernel source string, the command queue, program
// caching, argument checks for the two vectors and the matrix, kernel
// launching, and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it
// (e.g. Xgerc passes "GERC", Xgeru passes "GERU").
Xger(Queue &queue, Event &event, const std::string &name = "GER");
// Templated-precision implementation of the routine. Vectors are described by
// (buffer, offset, increment); matrix A by (buffer, offset, leading dimension).
StatusCode DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGER_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGERC_H_
#define CLBLAST_ROUTINES_XGERC_H_
#include "internal/routines/level2/xger.h"
namespace clblast {
// =================================================================================================
// Xgerc implements the complex rank-1 update 'GERC'. It derives from Xger and
// reuses its DoGer implementation, constructing the base with name "GERC".
template <typename T>
class Xgerc: public Xger<T> {
public:
// Uses the regular Xger routine's implementation
using Xger<T>::DoGer;
// Constructor: forwards to Xger with the routine name "GERC"
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
// Templated-precision implementation of the routine; same argument conventions
// as Xger::DoGer (vectors as buffer/offset/increment, matrix A as
// buffer/offset/leading dimension).
StatusCode DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGERC_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGERU_H_
#define CLBLAST_ROUTINES_XGERU_H_
#include "internal/routines/level2/xger.h"
namespace clblast {
// =================================================================================================
// Xgeru implements the complex rank-1 update 'GERU'. It derives from Xger and
// reuses its DoGer implementation, constructing the base with name "GERU".
template <typename T>
class Xgeru: public Xger<T> {
public:
// Uses the regular Xger routine's implementation
using Xger<T>::DoGer;
// Constructor: forwards to Xger with the routine name "GERU"
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
// Templated-precision implementation of the routine; same argument conventions
// as Xger::DoGer (vectors as buffer/offset/increment, matrix A as
// buffer/offset/leading dimension).
StatusCode DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGERU_H_
#endif

View file

@ -0,0 +1,61 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER_H_
#define CLBLAST_ROUTINES_XHER_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xher implements the Hermitian/symmetric rank-1 update 'HER'. T is the matrix
// element type; U is the type of the (real-valued for complex HER) scalar
// alpha, translated to T via GetAlpha. The 'packed' flag of DoHer supports the
// packed-storage variants (used by derived classes such as Xhpr).
template <typename T, typename U>
class Xher: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, kernel source, command queue, program caching, argument
// checks for the vector and the (regular or packed) matrix, kernel launching,
// and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it.
Xher(Queue &queue, Event &event, const std::string &name = "HER");
// Translates alpha of type 'U' into type 'T'
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine. When 'packed' is true,
// matrix A is expected in packed storage (a_ld then presumably unused —
// confirm against the implementation).
StatusCode DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER_H_
#endif

View file

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER2_H_
#define CLBLAST_ROUTINES_XHER2_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xher2 implements the Hermitian/symmetric rank-2 update 'HER2' using vectors
// x and y. The element type is the template parameter T. The 'packed' flag of
// DoHer2 supports the packed-storage variants (used by derived classes such as
// Xhpr2).
template <typename T>
class Xher2: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, kernel source, command queue, program caching, argument
// checks for the two vectors and the (regular or packed) matrix, kernel
// launching, and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it.
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
// Templated-precision implementation of the routine. When 'packed' is true,
// matrix A is expected in packed storage (a_ld then presumably unused —
// confirm against the implementation).
StatusCode DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHPR_H_
#define CLBLAST_ROUTINES_XHPR_H_
#include "internal/routines/level2/xher.h"
namespace clblast {
// =================================================================================================
// Xhpr implements the packed Hermitian/symmetric rank-1 update 'HPR'. It
// derives from Xher and reuses its DoHer implementation (presumably with the
// 'packed' flag enabled — confirm in the .cc file); the packed matrix AP has
// no leading dimension, only a buffer and an offset.
template <typename T, typename U>
class Xhpr: public Xher<T,U> {
public:
// Uses the regular Xher routine's implementation
using Xher<T,U>::DoHer;
// Constructor: forwards to Xher with the routine name "HPR"
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHPR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHPR2_H_
#define CLBLAST_ROUTINES_XHPR2_H_
#include "internal/routines/level2/xher2.h"
namespace clblast {
// =================================================================================================
// Xhpr2 implements the packed Hermitian/symmetric rank-2 update 'HPR2'. It
// derives from Xher2 and reuses its DoHer2 implementation (presumably with the
// 'packed' flag enabled — confirm in the .cc file); the packed matrix AP has
// no leading dimension, only a buffer and an offset.
template <typename T>
class Xhpr2: public Xher2<T> {
public:
// Uses the regular Xher2 routine's implementation
using Xher2<T>::DoHer2;
// Constructor: forwards to Xher2 with the routine name "HPR2"
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHPR2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr routine (symmetric packed rank-1 matrix update). The precision
// is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSPR_H_
#define CLBLAST_ROUTINES_XSPR_H_

#include "internal/routines/level2/xher.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class. Instantiates Xher with U == T,
// since in the symmetric (real-valued) case alpha has the same type as the matrix elements.
template <typename T>
class Xspr: public Xher<T,T> {
 public:

  // Uses the regular Xher routine (SPR is implemented on top of HER)
  using Xher<T,T>::DoHer;

  // Constructor: forwards to the base class with routine name "SPR"
  Xspr(Queue &queue, Event &event, const std::string &name = "SPR");

  // Templated-precision implementation of the routine, with the matrix stored in packed form
  // in 'ap_buffer'
  StatusCode DoSpr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 routine (symmetric packed rank-2 matrix update). The precision
// is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSPR2_H_
#define CLBLAST_ROUTINES_XSPR2_H_

#include "internal/routines/level2/xher2.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xspr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine (SPR2 is implemented on top of HER2)
  using Xher2<T>::DoHer2;

  // Constructor: forwards to the base class with routine name "SPR2"
  Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");

  // Templated-precision implementation of the routine, with the matrix stored in packed form
  // in 'ap_buffer'
  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr routine (symmetric rank-1 matrix update). The precision is
// implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR_H_
#define CLBLAST_ROUTINES_XSYR_H_

#include "internal/routines/level2/xher.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class. Instantiates Xher with U == T,
// since in the symmetric (real-valued) case alpha has the same type as the matrix elements.
template <typename T>
class Xsyr: public Xher<T,T> {
 public:

  // Uses the regular Xher routine (SYR is implemented on top of HER)
  using Xher<T,T>::DoHer;

  // Constructor: forwards to the base class with routine name "SYR"
  Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");

  // Templated-precision implementation of the routine. The matrix is stored in regular (non-
  // packed) form in 'a_buffer' with leading dimension 'a_ld'.
  StatusCode DoSyr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2 routine (symmetric rank-2 matrix update). The precision is
// implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2_H_
#define CLBLAST_ROUTINES_XSYR2_H_

#include "internal/routines/level2/xher2.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsyr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine (SYR2 is implemented on top of HER2)
  using Xher2<T>::DoHer2;

  // Constructor: forwards to the base class with routine name "SYR2"
  Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");

  // Templated-precision implementation of the routine. The matrix is stored in regular (non-
  // packed) form in 'a_buffer' with leading dimension 'a_ld'.
  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYR2_H_
#endif

View file

@ -127,9 +127,11 @@ void Tuner(int argc, char* argv[]) {
{"precision", precision_string}
};
for (auto &o: C::GetOptions()) {
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
}
tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
}

View file

@ -125,7 +125,7 @@ struct Arguments {
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
bool compare_clblas = 1;
int compare_clblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
@ -171,7 +171,8 @@ T GetArgument(const int argc, char *argv[], std::string &help,
const std::string &option, const T default_value);
// Returns the precision only
Precision GetPrecision(const int argc, char *argv[]);
Precision GetPrecision(const int argc, char *argv[],
const Precision default_precision = Precision::kSingle);
// As in "GetArgument", but now only checks whether an argument is given or not
bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);

View file

@ -15,12 +15,36 @@ import os.path
import glob
import re
import json
try:
from urllib.request import urlopen # Python 3
except ImportError:
from urllib2 import urlopen # Python 2
# Additional modules
import pandas as pd
# Server storing a copy of the database
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db"
# Constants
ATTRIBUTES = ["device", "type", "vendor", "precision", "kernel_family", "arg_m", "arg_n", "arg_k"]
VENDOR_DEFAULT = "default"
DEVICETYPE_DEFAULT = "All"
DEVICENAME_DEFAULT = "default"
# Attributes
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
# OpenCL vendor names and their short name
VENDOR_NAMES = { "device_vendor": {
"GenuineIntel": "Intel",
"Intel(R) Corporation": "Intel",
"Advanced Micro Devices, Inc.": "AMD",
"NVIDIA Corporation": "NVIDIA",
}}
# Pandas options
pd.set_option('display.width', 1000)
@ -29,6 +53,14 @@ pd.set_option('display.width', 1000)
# Database operations
# ==================================================================================================
# Downloads the database from the server and saves it to disk under the given filename
def DownloadDatabase(filename):
  print("## Downloading database from '"+DATABASE_SERVER_URL+"'...")
  df = urlopen(DATABASE_SERVER_URL)
  # Fix: write to the 'filename' argument instead of relying on the global 'file_db', and make
  # sure the file handle is closed even if the write fails
  output = open(filename, 'wb')
  try:
    output.write(df.read())
  finally:
    output.close()
# Loads the database from disk: the file is a pickled pandas DataFrame
def LoadDatabase(filename):
  return pd.read_pickle(filename)
@ -60,15 +92,58 @@ def ConcatenateData(df1, df2):
def RemoveDuplicates(df):
return df.drop_duplicates()
# Bests
# Drops every row of the database that belongs to the given device name
def RemoveEntriesByDevice(df, devicename):
  keep_mask = df["device"] != devicename
  return df[keep_mask]
# Selects only the rows of the database where column 'field' equals 'value'
def GetEntriesByField(df, field, value):
  selection = df[field] == value
  return df[selection]
# Fixes the problem that some vendors use multiple different names: maps every known spelling
# onto a single canonical short name using the VENDOR_NAMES table
def SanitizeVendorNames(df):
  return df.replace(VENDOR_NAMES)
# Retrieves the results with the lowest execution times
def GetBestResults(df):
dfbest = pd.DataFrame()
grouped = df.groupby(ATTRIBUTES+["kernel"])
for name, dfgroup in grouped:
bestcase = dfgroup.loc[[dfgroup["time"].idxmin()]]
dfbest = ConcatenateData(dfbest, bestcase)
besttime = dfgroup["time"].min()
bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0]
dfbest = dfbest.append(bestcase, ignore_index=True)
return dfbest
# Sets defaults for devices of the same type/vendor based on the smallest values of all known
# entries. The average might be better for performance but some parameters might not be supported
# on other devices.
def CalculateDefaults(df):
  dfdefault = pd.DataFrame()

  # Defaults per device type/vendor: group on everything except the device-identifying
  # attributes, then take the per-column minimum as the default parameter values
  groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
  for name, dfgroup in groups:
    default_values = dfgroup.min(axis=0)
    default_values["device"] = DEVICENAME_DEFAULT
    # Compute-units/clock/time are meaningless for a synthetic default entry, so zero them out
    default_values["device_compute_units"] = 0
    default_values["device_core_clock"] = 0
    default_values["time"] = 0.0
    dfdefault = dfdefault.append(default_values, ignore_index=True)

  # Defaults in general: group only on the kernel attributes so the resulting entry applies to
  # any vendor and any device type
  groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
  for name, dfgroup in groups:
    default_values = dfgroup.min(axis=0)
    default_values["device_vendor"] = VENDOR_DEFAULT
    default_values["device_type"] = DEVICETYPE_DEFAULT
    default_values["device"] = DEVICENAME_DEFAULT
    default_values["device_compute_units"] = 0
    default_values["device_core_clock"] = 0
    default_values["time"] = 0.0
    dfdefault = dfdefault.append(default_values, ignore_index=True)

  # Database with both types of defaults only
  return dfdefault
# ==================================================================================================
# C++ header generation
# ==================================================================================================
@ -110,27 +185,28 @@ def GetPrecision(family, precision):
# The C++ device type and vendor
def GetDeviceVendor(vendor, devtype):
return(" { // %s %ss\n kDeviceType%s, kDeviceVendor%s, {\n"
% (vendor, devtype, devtype, vendor))
if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT:
return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor))
return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor))
# Prints the data to a C++ database
def PrintData(df):
def PrintData(df, outputdir):
# Iterates over the kernel families: creates a new file per family
for family, dffamily in df.groupby(["kernel_family"]):
dffamily = dffamily.dropna(axis=1, how='all')
f = open(family+'.h', 'w+')
f = open(os.path.join(outputdir, family+'.h'), 'w+')
f.write(GetHeader(family))
# Loops over the different entries for this family and prints their headers
for precision, dfprecision in dffamily.groupby(["precision"]):
f.write(GetPrecision(family, precision))
for vendor, dfvendor in dfprecision.groupby(["vendor"]):
for devtype, dfdevtype in dfvendor.groupby(["type"]):
for vendor, dfvendor in dfprecision.groupby(["device_vendor"]):
for devtype, dfdevtype in dfvendor.groupby(["device_type"]):
f.write(GetDeviceVendor(vendor, devtype))
for device, dfdevice in dfdevtype.groupby(["device"]):
devicename = "\"%s\"," % device
f.write(" { %-20s { " % devicename)
f.write(" { %-50s { " % devicename)
# Collects the parameters for this case and prints them
parameters = []
@ -152,57 +228,70 @@ def PrintData(df):
# Checks for the number of command-line arguments
if len(sys.argv) != 3:
print "[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>"
print("[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>")
sys.exit()
# Parses the command-line arguments
path_json = sys.argv[1]
path_clblast = sys.argv[2]
file_db = path_clblast+"/src/database.db"
glob_json = path_json+"/*.json"
file_db = os.path.join(path_clblast, "scripts", "database", "database.db")
glob_json = os.path.join(path_json, "*.json")
# Checks whether the command-line arguments are valid; exits otherwise
clblast_h = path_clblast+"/include/clblast.h" # Not used but just for validation
clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation
if not os.path.isfile(clblast_h):
print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library"
print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
sys.exit()
if len(glob.glob(glob_json)) < 1:
print "[ERROR] The path '"+path_json+"' does not contain any JSON files"
sys.exit()
print("## The path '"+path_json+"' does not contain any JSON files")
# ==================================================================================================
# The main body of the script
# ==================================================================================================
# Loads the database if it exists. If not, a new database is initialized
# Downloads the database if a local copy is not present
db_exists = os.path.isfile(file_db)
database = LoadDatabase(file_db) if db_exists else pd.DataFrame()
if not db_exists:
DownloadDatabase(file_db)
# Loads the database from disk
print("## Loading the database from disk...")
database = LoadDatabase(file_db)
# Loops over all JSON files in the supplied folder
for file_json in glob.glob(glob_json):
# Loads the newly imported data
print "## Processing '"+file_json+"'",
sys.stdout.write("## Processing '"+file_json+"' ")
imported_data = ImportDataFromFile(file_json)
imported_data = SanitizeVendorNames(imported_data)
# Adds the new data to the database
old_size = len(database.index)
database = ConcatenateData(database, imported_data)
database = RemoveDuplicates(database)
new_size = len(database.index)
print "with "+str(new_size-old_size)+" new items"
print("with "+str(new_size-old_size)+" new items")
# Stores the new database back to disk
SaveDatabase(database, file_db)
# Stores the modified database back to disk
if len(glob.glob(glob_json)) >= 1:
print("## Storing the database to disk...")
SaveDatabase(database, file_db)
# Retrieves the best performing results
print("## Calculating the best results per device/kernel...")
bests = GetBestResults(database)
# TODO: Determines the defaults for other vendors and per vendor
#defaults = CalculateDefaults(bests)
#bests = ConcatenateData(bests, defaults)
# Determines the defaults for other vendors and per vendor
defaults = CalculateDefaults(bests)
bests = ConcatenateData(bests, defaults)
# Outputs the data as a C++ database
PrintData(bests)
path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
print("## Producing a C++ database in '"+path_cpp_database+"'...")
PrintData(bests, path_cpp_database)
print("## All done")
# ==================================================================================================

View file

@ -78,17 +78,17 @@ routines = [
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
# Level 2: matrix update
Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
],
[ # Level 3: matrix-matrix
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
@ -103,7 +103,17 @@ routines = [
]]
# ==================================================================================================
# Translates a single-letter precision code into the corresponding full CLBlast precision name
def PrecisionToFullName(x):
  full_names = {
    'H': "Half",
    'S': "Single",
    'D': "Double",
    'C': "ComplexSingle",
    'Z': "ComplexDouble",
  }
  return full_names[x]
# ==================================================================================================
# Separators for the BLAS levels
separators = ["""
// =================================================================================================
@ -237,7 +247,7 @@ files = [
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
]
header_lines = [84, 52, 80, 24, 22]
header_lines = [84, 63, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# Checks whether the command-line arguments are valid; exits otherwise
@ -315,16 +325,10 @@ for level in [1,2,3]:
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
body += " switch(clblast::GetPrecision(argc, argv)) {\n"
default = PrecisionToFullName(routine.flavours[0].name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
enum = {
'H': "Half",
'S': "Single",
'D': "Double",
'C': "ComplexSingle",
'Z': "ComplexDouble",
}[precision]
body += " case clblast::Precision::k"+enum+":"
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.name == precision:

View file

@ -38,6 +38,17 @@
#include "internal/routines/level2/xtrmv.h"
#include "internal/routines/level2/xtbmv.h"
#include "internal/routines/level2/xtpmv.h"
#include "internal/routines/level2/xger.h"
#include "internal/routines/level2/xgeru.h"
#include "internal/routines/level2/xgerc.h"
#include "internal/routines/level2/xher.h"
#include "internal/routines/level2/xhpr.h"
#include "internal/routines/level2/xher2.h"
#include "internal/routines/level2/xhpr2.h"
#include "internal/routines/level2/xsyr.h"
#include "internal/routines/level2/xspr.h"
#include "internal/routines/level2/xsyr2.h"
#include "internal/routines/level2/xspr2.h"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
@ -835,14 +846,24 @@ template StatusCode Tpsv<double2>(const Layout, const Triangle, const Transpose,
// General rank-1 matrix update: SGER/DGER
template <typename T>
StatusCode Ger(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xger<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGer(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Ger<float>(const Layout,
const size_t, const size_t,
@ -861,14 +882,24 @@ template StatusCode Ger<double>(const Layout,
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
StatusCode Geru(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Geru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgeru<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGeru(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Geru<float2>(const Layout,
const size_t, const size_t,
@ -887,14 +918,24 @@ template StatusCode Geru<double2>(const Layout,
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
StatusCode Gerc(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Gerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgerc<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGerc(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Gerc<float2>(const Layout,
const size_t, const size_t,
@ -913,13 +954,22 @@ template StatusCode Gerc<double2>(const Layout,
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Her(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer(layout, triangle,
n,
alpha,
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
}
template StatusCode Her<float>(const Layout, const Triangle,
const size_t,
@ -936,13 +986,22 @@ template StatusCode Her<double>(const Layout, const Triangle,
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Hpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr(layout, triangle,
n,
alpha,
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
Buffer<std::complex<T>>(ap_buffer), ap_offset);
}
template StatusCode Hpr<float>(const Layout, const Triangle,
const size_t,
@ -959,14 +1018,24 @@ template StatusCode Hpr<double>(const Layout, const Triangle,
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Her2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher2<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer2(layout, triangle,
n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Her2<float2>(const Layout, const Triangle,
const size_t,
@ -985,14 +1054,24 @@ template StatusCode Her2<double2>(const Layout, const Triangle,
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Hpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhpr2<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr2(layout, triangle,
n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Hpr2<float2>(const Layout, const Triangle,
const size_t,
@ -1011,13 +1090,22 @@ template StatusCode Hpr2<double2>(const Layout, const Triangle,
// Symmetric rank-1 matrix update: SSYR/DSYR
template <typename T>
StatusCode Syr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Syr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
               cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xsyr<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSyr(layout, triangle, n, alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Syr<float>(const Layout, const Triangle,
const size_t,
@ -1034,13 +1122,22 @@ template StatusCode Syr<double>(const Layout, const Triangle,
// Symmetric packed rank-1 matrix update: SSPR/DSPR
template <typename T>
StatusCode Spr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Spr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem ap_buffer, const size_t ap_offset,
               cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xspr<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSpr(layout, triangle, n, alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Spr<float>(const Layout, const Triangle,
const size_t,
@ -1057,14 +1154,24 @@ template StatusCode Spr<double>(const Layout, const Triangle,
// Symmetric rank-2 matrix update: SSYR2/DSYR2
template <typename T>
StatusCode Syr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Syr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xsyr2<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSyr2(layout, triangle, n, alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Syr2<float>(const Layout, const Triangle,
const size_t,
@ -1083,14 +1190,24 @@ template StatusCode Syr2<double>(const Layout, const Triangle,
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
template <typename T>
StatusCode Spr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Spr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xspr2<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSpr2(layout, triangle, n, alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Spr2<float>(const Layout, const Triangle,
const size_t,

View file

@ -15,6 +15,7 @@
#include "internal/database/xaxpy.h"
#include "internal/database/xdot.h"
#include "internal/database/xgemv.h"
#include "internal/database/xger.h"
#include "internal/database/xgemm.h"
#include "internal/database/copy.h"
#include "internal/database/pad.h"
@ -31,11 +32,12 @@ const std::vector<Database::DatabaseEntry> Database::database = {
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
};
// =================================================================================================
@ -77,19 +79,29 @@ Database::Parameters Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision) const {
// Set the short vendor name
auto this_short_vendor = this_vendor;
for (auto &combination : kVendorNames) {
if (this_vendor == combination.first) {
this_short_vendor = combination.second;
}
}
// Selects the right kernel
for (auto &db: database) {
if (db.kernel == this_kernel && db.precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.
for (auto &device: vendor.devices) {
if (device.name == this_device || device.name == kDefaultDevice) {
if (device.name == this_device || device.name == "default") {
// Sets the parameters accordingly
return device.parameters;

View file

@ -0,0 +1,158 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains common functions for matrix update kernels (Xger, Xher).
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 8 // The local work-group size in first dimension
#endif
#ifndef WGS2
#define WGS2 8 // The local work-group size in second dimension
#endif
#ifndef WPT
#define WPT 1 // The amount of work-per-thread in both dimensions
#endif
// =================================================================================================
// Returns an element from a vector
// Fetches element 'id' from a strided vector, returning zero when 'id' is out
// of range (id >= max). When 'do_conjugate' is set AND this kernel file is
// compiled for one of the conjugating routines (GERC/HER/HPR/HER2/HPR2), the
// loaded value is complex-conjugated before being returned.
inline real LoadVector(const int id, const int max,
                       __global real* gm, const int offset, const int inc,
                       const int do_conjugate) {
  real result;
  SetToZero(result);
  if (id < max) {
    result = gm[id*inc + offset];
    if (do_conjugate) {
      #if defined(ROUTINE_GERC) || defined(ROUTINE_HER) || defined(ROUTINE_HPR) || defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
        COMPLEX_CONJUGATE(result);
      #endif
    }
  }
  return result;
}
// Performs the rank-1 matrix update
// Performs the rank-1 update A[id1,id2] = alpha * xvalue * yvalue + A[id1,id2] in-place.
// Threads whose (id1, id2) fall outside (max1, max2) do nothing. For the packed
// routines (SPR/HPR) the packed-triangle index is computed instead of a dense index;
// note the packed formula uses 'a_ld' where the matrix order would normally appear --
// presumably the caller passes n as a_ld for packed storage; TODO confirm at call site.
inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
__global real* agm, const int a_offset, const int a_ld,
const real alpha, const real xvalue, const real yvalue,
const int is_upper) {
// Bounds check: only update elements inside the matrix
if (id1 < max1 && id2 < max2) {
// Packed storage (SPR/HPR): map (id1, id2) onto the packed triangular index
#if defined(ROUTINE_SPR) || defined(ROUTINE_HPR)
int a_index;
if (is_upper) {
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
}
else {
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
}
a_index += a_offset;
#else
// Dense storage: regular column-indexed offset
const int a_index = id2*a_ld + id1 + a_offset;
#endif
// Loads the current value of the A matrix
const real avalue = agm[a_index];
// Computes result = alpha * x[i] * y[j] + a[i][j]
#if PRECISION == 3232 || PRECISION == 6464
// Complex precisions: expand the complex multiplications component-wise
real ax;
ax.x = MulReal(alpha, xvalue);
ax.y = MulImag(alpha, xvalue);
real result;
result.x = MulReal(ax, yvalue) + avalue.x;
result.y = MulImag(ax, yvalue) + avalue.y;
#else
real result = alpha * xvalue * yvalue + avalue;
#endif
// For Hermitian matrices: force the diagonal to be real-valued
#if defined(ROUTINE_HER) || defined(ROUTINE_HPR)
if (id1 == id2) { result.y = ZERO; }
#endif
// Stores the final result
agm[a_index] = result;
}
}
// Performs the rank-2 matrix update
// Performs the rank-2 update A[id1,id2] += alpha1*xvalue*yvalue + alpha2*xtvalue*ytvalue
// in-place, with the same bounds-check and packed/dense indexing scheme as MatrixUpdate.
// NOTE(review): the complex branch pairs alpha2 with xvalue and alpha1 with xtvalue,
// while the real branch pairs alpha1 with xvalue -- for real data callers likely pass
// equal alphas so this is harmless, but for HER2/HPR2 the pairing matters; confirm
// against the host-side DoHer2/DoHpr2 argument order.
inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
__global real* agm, const int a_offset, const int a_ld,
const real alpha1, const real xvalue, const real yvalue,
const real alpha2, const real xtvalue, const real ytvalue,
const int is_upper) {
// Bounds check: only update elements inside the matrix
if (id1 < max1 && id2 < max2) {
// Packed storage (SPR2/HPR2): map (id1, id2) onto the packed triangular index
#if defined(ROUTINE_SPR2) || defined(ROUTINE_HPR2)
int a_index;
if (is_upper) {
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
}
else {
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
}
a_index += a_offset;
#else
// Dense storage: regular column-indexed offset
const int a_index = id2*a_ld + id1 + a_offset;
#endif
// Loads the current value of the A matrix
const real avalue = agm[a_index];
// Computes result = alpha * x[i] * y[j] + alpha * x[j] * y[i] + a[i][j]
#if PRECISION == 3232 || PRECISION == 6464
// Complex precisions: expand the two complex products component-wise
real ax;
ax.x = MulReal(alpha2, xvalue);
ax.y = MulImag(alpha2, xvalue);
real atx;
atx.x = MulReal(alpha1, xtvalue);
atx.y = MulImag(alpha1, xtvalue);
real result;
result.x = MulReal(ax, yvalue) + MulReal(atx, ytvalue) + avalue.x;
result.y = MulImag(ax, yvalue) + MulImag(atx, ytvalue) + avalue.y;
#else
real result = alpha1 * xvalue * yvalue + alpha2 * xtvalue * ytvalue + avalue;
#endif
// For Hermitian matrices: force the diagonal to be real-valued
#if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
if (id1 == id2) { result.y = ZERO; }
#endif
// Stores the final result
agm[a_index] = result;
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xgemv kernel for matrix-vector multiplication.
// This file contains the Xgemv kernel (generic version) for matrix-vector multiplication.
//
// =================================================================================================
@ -27,56 +27,11 @@ R"(
#ifndef WPT1
#define WPT1 1 // The amount of work-per-thread
#endif
// 2: For the fast version
#ifndef WGS2
#define WGS2 64 // The local work-group size
#endif
#ifndef WPT2
#define WPT2 1 // The amount of work-per-thread
#endif
#ifndef VW2
#define VW2 1 // Vector width of matrix A loads
#ifndef UNROLL1
#define UNROLL1 32 // Unroll factor (must be a divider of WGS1)
#endif
// 3: For the fast rotated version
#ifndef WGS3
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
#define WPT3 1 // The amount of work-per-thread
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
#endif
// =================================================================================================
// Data-widths for the 'fast' kernel
#if VW2 == 1
typedef real realVF;
#elif VW2 == 2
typedef real2 realVF;
#elif VW2 == 4
typedef real4 realVF;
#elif VW2 == 8
typedef real8 realVF;
#elif VW2 == 16
typedef real16 realVF;
#endif
// Data-widths for the 'fast' kernel with rotated matrix
#if VW3 == 1
typedef real realVFR;
#elif VW3 == 2
typedef real2 realVFR;
#elif VW3 == 4
typedef real4 realVFR;
#elif VW3 == 8
typedef real8 realVFR;
#elif VW3 == 16
typedef real16 realVFR;
#endif
// 2 and 3: For the fast versions, see 'xgemv_fast.opencl'
// =================================================================================================
@ -252,18 +207,6 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
return result;
}
// Loads a vector input value (1/2)
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// Loads a vector input value (2/2): as before, but different data-type
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// =================================================================================================
// Full version of the kernel
@ -301,28 +244,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
barrier(CLK_LOCAL_MEM_FENCE);
// Loops over the work per thread, and checks whether in bounds
#pragma unroll
for (int w=0; w<WPT1; ++w) {
const int gid = w*get_global_size(0) + get_global_id(0);
if (gid < m) {
// The multiply-add function for the main part (divisible by WGS1)
if (a_rotated == 0) { // Not rotated
#pragma unroll
for (int kloop=0; kloop<WGS1; ++kloop) {
const int k = kwg + kloop;
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop], value);
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
#pragma unroll
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
const int k = kwg + kloop + kunroll;
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
}
}
}
else { // Transposed
#pragma unroll
for (int kloop=0; kloop<WGS1; ++kloop) {
const int k = kwg + kloop;
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop], value);
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
#pragma unroll
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
const int k = kwg + kloop + kunroll;
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
}
}
}
}
@ -365,200 +311,6 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
// Initializes the accumulation register
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS2) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (not rotated)
#pragma unroll
for (int kl=0; kl<WGS2; ++kl) {
const int k = kwg + kl;
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
#elif VW2 == 4
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.z);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.w);
#elif VW2 == 8
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
#elif VW2 == 16
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
MultiplyAdd(acc[VW2*w+8], xlm[kl], avec.s8);
MultiplyAdd(acc[VW2*w+9], xlm[kl], avec.s9);
MultiplyAdd(acc[VW2*w+10], xlm[kl], avec.sA);
MultiplyAdd(acc[VW2*w+11], xlm[kl], avec.sB);
MultiplyAdd(acc[VW2*w+12], xlm[kl], avec.sC);
MultiplyAdd(acc[VW2*w+13], xlm[kl], avec.sD);
MultiplyAdd(acc[VW2*w+14], xlm[kl], avec.sE);
MultiplyAdd(acc[VW2*w+15], xlm[kl], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result
#pragma unroll
for (int w=0; w<WPT2; ++w) {
const int gid = WPT2*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
// Initializes the accumulation register
real acc[WPT3];
#pragma unroll
for (int w=0; w<WPT3; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS3) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
#pragma unroll
for (int kl=0; kl<WGS3/VW3; ++kl) {
const int k = (kwg/VW3) + kl;
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
#if VW3 == 1
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
#elif VW3 == 2
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
#elif VW3 == 4
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
#elif VW3 == 8
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
#elif VW3 == 16
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"

View file

@ -0,0 +1,288 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xgemv kernel (fast versions) for matrix-vector multiplication.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
// 1: For the full version, see 'xgemv.opencl'
// 2: For the fast version
#ifndef WGS2
#define WGS2 64 // The local work-group size
#endif
#ifndef WPT2
#define WPT2 1 // The amount of work-per-thread
#endif
#ifndef VW2
#define VW2 1 // Vector width of matrix A loads
#endif
// 3: For the fast rotated version
#ifndef WGS3
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
#define WPT3 1 // The amount of work-per-thread
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
#endif
// =================================================================================================
// Data-widths for the 'fast' kernel
#if VW2 == 1
typedef real realVF;
#elif VW2 == 2
typedef real2 realVF;
#elif VW2 == 4
typedef real4 realVF;
#elif VW2 == 8
typedef real8 realVF;
#elif VW2 == 16
typedef real16 realVF;
#endif
// Data-widths for the 'fast' kernel with rotated matrix
#if VW3 == 1
typedef real realVFR;
#elif VW3 == 2
typedef real2 realVFR;
#elif VW3 == 4
typedef real4 realVFR;
#elif VW3 == 8
typedef real8 realVFR;
#elif VW3 == 16
typedef real16 realVFR;
#endif
// =================================================================================================
// Loads a vector input value (1/2)
// Loads one vector-width element of matrix A at coordinates (x, y), where
// 'a_ld' is the leading dimension expressed in realVF units.
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                            const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
// Loads a vector input value (2/2): as before, but different data-type
// Loads one vector-width element of matrix A at coordinates (x, y): same as
// LoadMatrixAVF, but for the rotated-kernel data type realVFR.
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                              const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
// Fast GEMV kernel for the non-rotated case: y = alpha * A * x + beta * y.
// Loads A through the vectorized realVF type (VW2 elements at a time) and stages
// x in local memory one work-group-sized tile at a time. The 'parameter', 'kl'
// and 'ku' arguments are not referenced in this kernel body -- presumably kept
// so all three Xgemv kernel variants share one host-side argument list; confirm.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
// Initializes the accumulation register
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS2) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (not rotated)
// NOTE(review): the loop variable 'kl' shadows the kernel argument 'kl'
#pragma unroll
for (int kl=0; kl<WGS2; ++kl) {
const int k = kwg + kl;
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
// Unrolled multiply-add over the VW2 components of the loaded vector
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
#elif VW2 == 4
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.z);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.w);
#elif VW2 == 8
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
#elif VW2 == 16
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
MultiplyAdd(acc[VW2*w+8], xlm[kl], avec.s8);
MultiplyAdd(acc[VW2*w+9], xlm[kl], avec.s9);
MultiplyAdd(acc[VW2*w+10], xlm[kl], avec.sA);
MultiplyAdd(acc[VW2*w+11], xlm[kl], avec.sB);
MultiplyAdd(acc[VW2*w+12], xlm[kl], avec.sC);
MultiplyAdd(acc[VW2*w+13], xlm[kl], avec.sD);
MultiplyAdd(acc[VW2*w+14], xlm[kl], avec.sE);
MultiplyAdd(acc[VW2*w+15], xlm[kl], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: y = alpha * acc + beta * y
#pragma unroll
for (int w=0; w<WPT2; ++w) {
const int gid = WPT2*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
// Fast GEMV kernel for the rotated (transposed-storage) case: loads A through the
// vectorized realVFR type so that VW3 consecutive elements along the reduction
// dimension come from one load. Stages x in local memory one tile at a time.
// The 'parameter', 'kl' and 'ku' arguments are not referenced in this kernel
// body -- presumably kept for a uniform Xgemv kernel signature; confirm.
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
// Initializes the accumulation register
real acc[WPT3];
#pragma unroll
for (int w=0; w<WPT3; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS3) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
// NOTE(review): the loop variable 'kl' shadows the kernel argument 'kl'
#pragma unroll
for (int kl=0; kl<WGS3/VW3; ++kl) {
const int k = (kwg/VW3) + kl;
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
// Unrolled multiply-add over the VW3 components of the loaded vector
#if VW3 == 1
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
#elif VW3 == 2
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
#elif VW3 == 4
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
#elif VW3 == 8
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
#elif VW3 == 16
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: y = alpha * acc + beta * y
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xger kernels for rank-1 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
// Rank-1 matrix update kernel (GER/GERU/GERC): A += alpha * x * y^T (or y^H).
// Each thread updates a WPT x WPT grid of elements. The row-major and col-major
// branches differ only in which global dimension indexes x versus y; in both,
// LoadVector is called with do_conjugate=true for the y-vector, which only
// takes effect when compiled for a conjugating routine (see LoadVector).
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xger(const int max1, const int max2, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
// Register storage for X and Y
real xvalues[WPT];
real yvalues[WPT];
// Row-major version: x runs along dimension 1, y along dimension 0
if (is_rowmajor) {
// Loads the X-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, max2, xgm, x_offset, x_inc, false);
}
// Loads the Y-vector (conjugated for the conjugating routines)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
yvalues[w] = LoadVector(id1, max1, ygm, y_offset, y_inc, true);
}
// Loops over the work per thread twice
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Loads A, performs the operation, and stores the result into A
MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
alpha, xvalues[w2], yvalues[w1], false);
}
}
}
// Col-major version: x runs along dimension 0, y along dimension 1
else {
// Loads the X-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xvalues[w] = LoadVector(id1, max1, xgm, x_offset, x_inc, false);
}
// Loads the Y-vector (conjugated for the conjugating routines)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
yvalues[w] = LoadVector(id2, max2, ygm, y_offset, y_inc, true);
}
// Loops over the work per thread twice
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Loads A, performs the operation, and stores the result into A
MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
alpha, xvalues[w1], yvalues[w2], false);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,73 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xher kernels for rank-1 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Symmetric/Hermitian version of the rank-1 matrix update kernel
// (HER, HPR, SYR, SPR): only the triangle selected by 'is_upper' is updated.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher(const int n, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
// Register storage for X and XT (the same vector loaded twice, with opposite
// conjugation flags, for the x * x^T / x * x^H product)
real xvalues[WPT];
real xtvalues[WPT];
// Loads the X-vector (the last LoadVector argument presumably toggles complex
// conjugation for the Hermitian variants -- confirm against level2.opencl)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector (opposite conjugation flag of the above)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loops over the work per thread twice: each work-item handles a WPT x WPT
// grid of matrix elements
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
// Do nothing: this element lies in the untouched triangle of A
}
// Loads A, performs the operation, and stores the result into A
else {
MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,104 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xher2 kernels for rank-2 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Symmetric/Hermitian version of the rank-2 matrix update kernel
// (HER2, HPR2, SYR2, SPR2): updates one triangle of A with the two rank-1
// contributions alpha * x * y^T and (conjugated) alpha * y * x^T.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher2(const int n, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
// Register storage for X and Y plus their 'transposed' counterparts (same
// vectors loaded with opposite conjugation flags)
real xvalues[WPT];
real yvalues[WPT];
real xtvalues[WPT];
real ytvalues[WPT];
// Loads the X-vector (the last LoadVector argument presumably toggles complex
// conjugation -- confirm against the LoadVector helper in level2.opencl)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector (opposite conjugation flag)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loads the Y-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
yvalues[w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
}
// Loads the Y-transposed-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
ytvalues[w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
}
// Sets the proper value of alpha in case conjugation is needed: for the
// complex Hermitian routines (HER2/HPR2) one of the two rank-1 contributions
// uses the conjugate of alpha, chosen based on the storage layout
real alpha1 = alpha;
real alpha2 = alpha;
#if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
if (is_rowmajor) {
COMPLEX_CONJUGATE(alpha1);
}
else {
COMPLEX_CONJUGATE(alpha2);
}
#endif
// Loops over the work per thread twice: each work-item handles a WPT x WPT
// grid of matrix elements
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
// Do nothing: this element lies in the untouched triangle of A
}
// Loads A, performs the operation, and stores the result into A
else {
MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
alpha1, xvalues[w2], yvalues[w1],
alpha2, xtvalues[w1], ytvalues[w2], is_upper);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,329 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
//
// Or as an image (assuming column-major)
// K
// o-------o
// | |
// N | [B^T] |
// | |
// o-------o
// K N
// o-------o o-----o
// M | [A] | M | [C] |
// | | | |
// o-------o o-----o
//
//
// This kernel is separated into two files. This is part 1 out of 2.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef MWG
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
#endif
#ifndef NWG
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
#endif
#ifndef KWG
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
#endif
#ifndef MDIMC
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMC
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMA
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#endif
#ifndef NDIMB
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#endif
#ifndef KWI
#define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
#endif
#ifndef VWM
#define VWM 1 // Vector width of matrices A and C
#endif
#ifndef VWN
#define VWN 1 // Vector width of matrix B
#endif
#ifndef STRM
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
#endif
#ifndef STRN
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
#endif
#ifndef SA
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
#endif
#ifndef SB
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
#endif
// Helper parameters based on the above tuning parameters
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
// Settings
#define USE_VECTOR_MAD 0      // Use the vector multiply-add directly (1) or unroll it into per-component MADs (0)
// =================================================================================================
// Data-widths in dimension M
#if VWM == 1
typedef real realM;
#elif VWM == 2
typedef real2 realM;
#elif VWM == 4
typedef real4 realM;
#elif VWM == 8
typedef real8 realM;
#elif VWM == 16
typedef real16 realM;
#endif
// Data-widths in dimension N
#if VWN == 1
typedef real realN;
#elif VWN == 2
typedef real2 realN;
#elif VWN == 4
typedef real4 realN;
#elif VWN == 8
typedef real8 realN;
#elif VWN == 16
typedef real16 realN;
#endif
// =================================================================================================
// Clears the accumulation registers 'cpm'. The OpenCL vector data-types have
// no indexable loop over their components, so each component is zeroed
// explicitly for the configured vector width VWM.
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int nidx=0; nidx<NWI; ++nidx) {
    #pragma unroll
    for (int mvec=0; mvec<MWI/VWM; ++mvec) {
      #if VWM == 1
        SetToZero(cpm[nidx][mvec]);
      #elif VWM == 2
        SetToZero(cpm[nidx][mvec].x);
        SetToZero(cpm[nidx][mvec].y);
      #elif VWM == 4
        SetToZero(cpm[nidx][mvec].x);
        SetToZero(cpm[nidx][mvec].y);
        SetToZero(cpm[nidx][mvec].z);
        SetToZero(cpm[nidx][mvec].w);
      #elif VWM == 8
        SetToZero(cpm[nidx][mvec].s0);
        SetToZero(cpm[nidx][mvec].s1);
        SetToZero(cpm[nidx][mvec].s2);
        SetToZero(cpm[nidx][mvec].s3);
        SetToZero(cpm[nidx][mvec].s4);
        SetToZero(cpm[nidx][mvec].s5);
        SetToZero(cpm[nidx][mvec].s6);
        SetToZero(cpm[nidx][mvec].s7);
      #elif VWM == 16
        SetToZero(cpm[nidx][mvec].s0);
        SetToZero(cpm[nidx][mvec].s1);
        SetToZero(cpm[nidx][mvec].s2);
        SetToZero(cpm[nidx][mvec].s3);
        SetToZero(cpm[nidx][mvec].s4);
        SetToZero(cpm[nidx][mvec].s5);
        SetToZero(cpm[nidx][mvec].s6);
        SetToZero(cpm[nidx][mvec].s7);
        SetToZero(cpm[nidx][mvec].s8);
        SetToZero(cpm[nidx][mvec].s9);
        SetToZero(cpm[nidx][mvec].sA);
        SetToZero(cpm[nidx][mvec].sB);
        SetToZero(cpm[nidx][mvec].sC);
        SetToZero(cpm[nidx][mvec].sD);
        SetToZero(cpm[nidx][mvec].sE);
        SetToZero(cpm[nidx][mvec].sF);
      #endif
    }
  }
}
// =================================================================================================
// Caches a tile of the off-chip (global) A matrix into on-chip __local memory.
// Each work-item copies an (MWA/VWM) x KWA patch of vectorised elements; only
// compiled when local-memory caching of A is enabled (SA == 1).
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {
  // Decomposes the flat thread id into the re-shaped MDIMA x KDIMA layout
  const int thread_m = tid % MDIMA;
  const int thread_k = tid / MDIMA;
  #pragma unroll
  for (int am=0; am<MWA/VWM; ++am) {
    #pragma unroll
    for (int ak=0; ak<KWA; ++ak) {
      // Strided or non-strided thread-to-data mapping in the M-dimension
      #if STRM == 0
        int m_local = am + thread_m*(MWA/VWM);
      #elif STRM == 1
        int m_local = thread_m + am*MDIMA;
      #endif
      // Indices into the global A matrix (kwg is the current K-tile offset)
      int k_local = ak + thread_k*KWA;
      int m_global = m_local + get_group_id(0)*(MWG/VWM);
      int k_global = k_local + kwg;
      // Copies one vector from global memory (not transposed) into local memory
      alm[k_local*(MWG/VWM) + m_local] = agm[k_global*(kSizeM/VWM) + m_global];
    }
  }
}
#endif
// Same as GlobalToLocalA, but for the B input matrix (transposed access),
// compiled only when local-memory caching of B is enabled (SB == 1).
#if SB == 1
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
                           const int kSizeN, const int tid, const int kwg) {
  // Decomposes the flat thread id into the re-shaped NDIMB x KDIMB layout
  const int thread_n = tid % NDIMB;
  const int thread_k = tid / NDIMB;
  #pragma unroll
  for (int bk=0; bk<KWB; ++bk) {
    #pragma unroll
    for (int bn=0; bn<NWB/VWN; ++bn) {
      // Strided or non-strided thread-to-data mapping in the N-dimension
      #if STRN == 0
        int n_local = bn + thread_n*(NWB/VWN);
      #elif STRN == 1
        int n_local = thread_n + bn*NDIMB;
      #endif
      // Indices into the global B matrix (kwg is the current K-tile offset)
      int k_local = bk + thread_k*KWB;
      int n_global = n_local + get_group_id(1)*(NWG/VWN);
      int k_global = k_local + kwg;
      // Copies one vector from global memory (transposed) into local memory
      blm[k_local*(NWG/VWN) + n_local] = bgm[k_global*(kSizeN/VWN) + n_global];
    }
  }
}
#endif
// =================================================================================================
// Loads one K-slice of the A matrix straight from global memory into
// per-thread registers; used when A is not cached in local memory (SA == 0).
// Note: the 'kwg' argument is not referenced in this body; it is kept so the
// signature mirrors the SA == 1 code path.
#if SA == 0
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
                             const int kSizeM, const int idk, const int kwg) {
  #pragma unroll
  for (int mvec=0; mvec<MWI/VWM; ++mvec) {
    // Strided or non-strided thread-to-data mapping in the M-dimension
    #if STRM == 0
      int m_local = mvec + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int m_local = get_local_id(0) + mvec*MDIMC;
    #endif
    // Index into the global A matrix (not transposed)
    int m_global = m_local + get_group_id(0)*(MWG/VWM);
    apm[mvec] = agm[idk*(kSizeM/VWM) + m_global];
  }
}
#endif
// Same as GlobalToPrivateA, but for the B input matrix (transposed access);
// used when B is not cached in local memory (SB == 0).
#if SB == 0
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
                             const int kSizeN, const int idk) {
  #pragma unroll
  for (int nvec=0; nvec<NWI/VWN; ++nvec) {
    // Strided or non-strided thread-to-data mapping in the N-dimension
    #if STRN == 0
      int n_local = nvec + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int n_local = get_local_id(1) + nvec*NDIMC;
    #endif
    // Index into the global B matrix (transposed)
    int n_global = n_local + get_group_id(1)*(NWG/VWN);
    bpm[nvec] = bgm[idk*(kSizeN/VWN) + n_global];
  }
}
#endif
// =================================================================================================
// Copies one K-row of the locally-cached A tile into per-thread registers;
// only compiled when local-memory caching of A is enabled (SA == 1).
#if SA == 1
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
  #pragma unroll
  for (int mvec=0; mvec<MWI/VWM; ++mvec) {
    // Strided or non-strided thread-to-data mapping in the M-dimension
    #if STRM == 0
      int m_local = mvec + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int m_local = get_local_id(0) + mvec*MDIMC;
    #endif
    apm[mvec] = alm[kg*(MWG/VWM) + m_local];
  }
}
#endif
// Same as LocalToPrivateA, but for the locally-cached B tile (SB == 1).
#if SB == 1
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
  #pragma unroll
  for (int nvec=0; nvec<NWI/VWN; ++nvec) {
    // Strided or non-strided thread-to-data mapping in the N-dimension
    #if STRN == 0
      int n_local = nvec + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int n_local = get_local_id(1) + nvec*NDIMC;
    #endif
    bpm[nvec] = blm[kg*(NWG/VWN) + n_local];
  }
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -7,29 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
//
// Or as an image (assuming column-major)
// K
// o-------o
// | |
// N | [B^T] |
// | |
// o-------o
// K N
// o-------o o-----o
// M | [A] | M | [C] |
// | | | |
// o-------o o-----o
//
// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@ -39,288 +17,6 @@ R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef MWG
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
#endif
#ifndef NWG
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
#endif
#ifndef KWG
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
#endif
#ifndef MDIMC
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMC
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMA
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#endif
#ifndef NDIMB
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#endif
#ifndef KWI
#define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
#endif
#ifndef VWM
#define VWM 1 // Vector width of matrices A and C
#endif
#ifndef VWN
#define VWN 1 // Vector width of matrix B
#endif
#ifndef STRM
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
#endif
#ifndef STRN
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
#endif
#ifndef SA
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
#endif
#ifndef SB
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
#endif
// Helper parameters based on the above tuning parameters
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
// Settings
#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually
// =================================================================================================
// Data-widths in dimension M
#if VWM == 1
typedef real realM;
#elif VWM == 2
typedef real2 realM;
#elif VWM == 4
typedef real4 realM;
#elif VWM == 8
typedef real8 realM;
#elif VWM == 16
typedef real16 realM;
#endif
// Data-widths in dimension N
#if VWN == 1
typedef real realN;
#elif VWN == 2
typedef real2 realN;
#elif VWN == 4
typedef real4 realN;
#elif VWN == 8
typedef real8 realN;
#elif VWN == 16
typedef real16 realN;
#endif
// =================================================================================================
// Initializes the accumulation registers to zero
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
const int kSizeM, const int tid, const int kwg) {
const int la0 = tid % MDIMA;
const int la1 = tid / MDIMA;
#pragma unroll
for (int mia=0; mia<MWA/VWM; ++mia) {
#pragma unroll
for (int kia=0; kia<KWA; ++kia) {
// Computes the indices based on strided/non-strided access
#if STRM == 0
int mg = mia + la0*(MWA/VWM);
#elif STRM == 1
int mg = la0 + mia*MDIMA;
#endif
// Computes the indices for the global memory
int kg = kia + la1*KWA;
int idm = mg + get_group_id(0)*(MWG/VWM);
int idk = kg + kwg;
// Loads the data from global memory (not transposed) into the local memory
alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
}
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 1
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
const int kSizeN, const int tid, const int kwg) {
const int lb0 = tid % NDIMB;
const int lb1 = tid / NDIMB;
#pragma unroll
for (int kib=0; kib<KWB; ++kib) {
#pragma unroll
for (int nib=0; nib<NWB/VWN; ++nib) {
// Computes the indices based on strided/non-strided access
#if STRN == 0
int ng = nib + lb0*(NWB/VWN);
#elif STRN == 1
int ng = lb0 + nib*NDIMB;
#endif
// Computes the indices for the global memory
int kg = kib + lb1*KWB;
int idn = ng + get_group_id(1)*(NWG/VWN);
int idk = kg + kwg;
// Loads the data from global memory (transposed) into the local memory
blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
}
}
}
#endif
// =================================================================================================
// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
const int kSizeM, const int idk, const int kwg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
// Computes the indices based on strided/non-strided access
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
// Computes the indices for the global memory
int idm = mg + get_group_id(0)*(MWG/VWM);
// Loads the data from global memory (not transposed) and stores into registers
apm[mi] = agm[idk*(kSizeM/VWM) + idm];
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 0
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
const int kSizeN, const int idk) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
// Computes the indices based on strided/non-strided access
#if STRN == 0
int ng = ni + get_local_id(1)*(NWI/VWN);
#elif STRN == 1
int ng = get_local_id(1) + ni*NDIMC;
#endif
// Computes the indices for the global memory
int idn = ng + get_group_id(1)*(NWG/VWN);
// Loads the data from global memory (transposed) and stores into registers
bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
}
}
#endif
// =================================================================================================
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
apm[mi] = alm[kg*(MWG/VWM) + mg];
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 1
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#if STRN == 0
int ng = ni + get_local_id(1)*(NWI/VWN);
#elif STRN == 1
int ng = get_local_id(1) + ni*NDIMC;
#endif
bpm[ni] = blm[kg*(NWG/VWN) + ng];
}
}
#endif
// =================================================================================================
// The vectorised multiply-add function
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1

View file

@ -14,7 +14,6 @@
#include "internal/routines/level1/xdotu.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================

View file

@ -33,6 +33,7 @@ Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
;
}

112
src/routines/level2/xger.cc Normal file
View file

@ -0,0 +1,112 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xger.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument:
// sets the static 'precision_' member for each of the four supported data-types
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The kernel source string is
// built by concatenating the included .opencl files: each of them expands to a
// C++11 raw string literal, so the two #includes below form one expression.
template <typename T>
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
;
}
// =================================================================================================
// The main routine
// The main routine: performs the rank-1 matrix update A := alpha*x*y^T + A
// (BLAS xGER) on the device. Returns a StatusCode; never throws to the caller.
template <typename T>
StatusCode Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrix has an alternative layout (row or
// column-major): for row-major storage the two dimensions are swapped before
// being passed to the kernel
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
const auto a_one = (a_is_rowmajor) ? n : m;
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xger kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments; the order must match the Xger kernel signature
// in xger.opencl
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Launches the kernel: each work-item handles WPT x WPT elements, so the
// global size is the matrix size divided by WPT, ceiled to the workgroup size
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all four supported precisions, so the definitions in this
// translation unit are emitted for each element type.
template class Xger<float>;
template class Xger<double>;
template class Xger<float2>;
template class Xger<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xgerc.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xger base class. Xgerc shares the Xger kernel; the extra
// conjugation is enabled at kernel-compile time (see DoGerc below).
template <typename T>
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
Xger<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xgerc<T>::DoGerc(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // A conjugated rank-1 update is a plain Ger on complex data: the conjugation of the 'y' vector
  // happens inside the OpenCL kernel, guarded by the ROUTINE_GERC define.
  const auto ger_status = DoGer(layout, m, n, alpha,
                                x_buffer, x_offset, x_inc,
                                y_buffer, y_offset, y_inc,
                                a_buffer, a_offset, a_ld);
  return ger_status;
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xgerc<float2>;
template class Xgerc<double2>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,52 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xgeru.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xger base class. Xgeru shares the Xger kernel and state;
// no additional set-up is required.
template <typename T>
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
Xger<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xgeru<T>::DoGeru(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The unconjugated complex rank-1 update is identical to the regular Ger operation, so this
  // simply delegates to the base-class implementation.
  const auto ger_status = DoGer(layout, m, n, alpha,
                                x_buffer, x_offset, x_inc,
                                y_buffer, y_offset, y_inc,
                                a_buffer, a_offset, a_ld);
  return ger_status;
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xgeru<float2>;
template class Xgeru<double2>;

// =================================================================================================
} // namespace clblast

122
src/routines/level2/xher.cc Normal file
View file

@ -0,0 +1,122 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xher.h"
#include <string>
namespace clblast {
// =================================================================================================
// Maps each (T,U) template pair onto the precision enum used to select the device kernels
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to the base Routine class, registering the 'Xger' tuning parameters.
// The kernel source is built from stringified OpenCL files concatenated as adjacent string
// literals; keep the include order as-is, since the files are concatenated in this order.
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
;
}
// =================================================================================================
// Specializations to create an alpha of element-type 'T' from the real-valued input 'U': the
// complex variants embed alpha as the real part with a zero imaginary part.
template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; }
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
// =================================================================================================
// The main routine: launches the rank-1 update kernel on an n-by-n triangular matrix. Also serves
// the packed variants (see the 'packed' argument) and, through kernel-side defines, the symmetric
// Xsyr/Xspr routines.
template <typename T, typename U>
StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Determines the triangle as seen by the kernel: a row-major layout flips upper/lower relative
// to column-major storage.
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Converts the real-valued alpha into the matrix' element type (see GetAlpha above)
const auto matching_alpha = GetAlpha(alpha);
// Tests the matrix and the vector for validity; packed matrices have no leading dimension
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Quick return: with a zero alpha the matrix stays unchanged, so no kernel launch is needed
if (alpha == U{0}) { return StatusCode::kSuccess; }
// Retrieves the Xher kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments; the argument indices must match the kernel's signature order
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, matching_alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Launches the kernel: a 2D n-by-n thread space, divided by the work-per-thread factor (WPT)
// and rounded up to multiples of the tuned work-group sizes (WGS1, WGS2)
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
// NOTE(review): the catch-all maps every exception to kInvalidKernel, so unrelated failures are
// indistinguishable from kernel-retrieval errors -- consider more specific status codes.
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all supported precisions
template class Xher<float, float>;
template class Xher<double, double>;
template class Xher<float2, float>;
template class Xher<double2, double>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,114 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xher2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Maps each template argument onto the precision enum used to select the device kernels
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to the base Routine class, registering the 'Xger' tuning parameters.
// The kernel source is built from stringified OpenCL files concatenated as adjacent string
// literals; keep the include order as-is, since the files are concatenated in this order.
template <typename T>
Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
;
}
// =================================================================================================
// The main routine: launches the rank-2 update kernel using the 'x' and 'y' vectors on an n-by-n
// triangular matrix. Also serves the packed variants (see the 'packed' argument) and, through
// kernel-side defines, the symmetric Xsyr2/Xspr2 routines.
template <typename T>
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Determines the triangle as seen by the kernel: a row-major layout flips upper/lower relative
// to column-major storage.
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity; packed matrices have no leading dimension
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// NOTE(review): unlike DoHer there is no alpha == 0 quick return here -- consider adding one if
// the comparison is well-defined for all element types T.
// Retrieves the Xher2 kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments; the argument indices must match the kernel's signature order
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Launches the kernel: a 2D n-by-n thread space, divided by the work-per-thread factor (WPT)
// and rounded up to multiples of the tuned work-group sizes (WGS1, WGS2)
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all supported precisions
template class Xher2<float>;
template class Xher2<double>;
template class Xher2<float2>;
template class Xher2<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,51 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xhpr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher base class; Xhpr re-uses the Her kernel with its
// packed-specific behaviour selected in DoHpr below.
template <typename T, typename U>
Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name):
Xher<T,U>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const U alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Delegates to the Her routine with packed storage enabled. The value 'n' stands in for the
  // leading dimension, which packed storage does not use; Xhpr-specific behaviour is implemented
  // in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer(layout, triangle, n, alpha,
               x_buffer, x_offset, x_inc,
               ap_buffer, ap_offset, n,
               packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xhpr<float2, float>;
template class Xhpr<double2, double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xhpr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xhpr2 re-uses the Her2 kernel with its
// packed-specific behaviour selected in DoHpr2 below.
template <typename T>
Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Runs the Her2 routine in packed mode: 'n' stands in for the leading dimension, which packed
  // storage does not use. Xhpr2-specific behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer2(layout, triangle, n, alpha,
                x_buffer, x_offset, x_inc,
                y_buffer, y_offset, y_inc,
                ap_buffer, ap_offset, n,
                packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xhpr2<float2>;
template class Xhpr2<double2>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,51 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xspr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher<T,T> base class (real-valued alpha equals the
// element type); Xspr-specific behaviour is selected in DoSpr below.
template <typename T>
Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
                          const size_t n,
                          const T alpha,
                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                          const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Packed symmetric rank-1 update: forwards to the Her implementation with packed storage
  // enabled ('n' is passed as a placeholder for the unused leading dimension). Xspr-specific
  // behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer(layout, triangle, n, alpha,
               x_buffer, x_offset, x_inc,
               ap_buffer, ap_offset, n,
               packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xspr<float>;
template class Xspr<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xspr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xspr2-specific behaviour is selected
// in DoSpr2 below.
template <typename T>
Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Packed symmetric rank-2 update: forwards to the Her2 implementation with packed storage
  // enabled ('n' is passed as a placeholder for the unused leading dimension). Xspr2-specific
  // behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer2(layout, triangle, n, alpha,
                x_buffer, x_offset, x_inc,
                y_buffer, y_offset, y_inc,
                ap_buffer, ap_offset, n,
                packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xspr2<float>;
template class Xspr2<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,50 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xsyr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher<T,T> base class (real-valued alpha equals the
// element type); Xsyr-specific behaviour is selected in DoSyr below.
template <typename T>
Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
                          const size_t n,
                          const T alpha,
                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The symmetric rank-1 update re-uses the Her implementation; Xsyr-specific behaviour is
  // implemented in the kernel through defines.
  const auto her_status = DoHer(layout, triangle, n, alpha,
                                x_buffer, x_offset, x_inc,
                                a_buffer, a_offset, a_ld);
  return her_status;
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xsyr<float>;
template class Xsyr<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,52 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xsyr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xsyr2-specific behaviour is selected
// in DoSyr2 below.
template <typename T>
Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The symmetric rank-2 update re-uses the Her2 implementation; Xsyr2-specific behaviour is
  // implemented in the kernel through defines.
  const auto her2_status = DoHer2(layout, triangle, n, alpha,
                                  x_buffer, x_offset, x_inc,
                                  y_buffer, y_offset, y_inc,
                                  a_buffer, a_offset, a_ld);
  return her2_status;
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xsyr2<float>;
template class Xsyr2<double>;

// =================================================================================================
} // namespace clblast

View file

@ -30,13 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -28,13 +28,14 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -28,13 +28,14 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -30,13 +30,14 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -30,13 +30,14 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -31,7 +31,8 @@ class TuneXgemm {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/xgemm.opencl"
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -35,6 +35,7 @@ class TuneXgemv {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level2/xgemv.opencl"
#include "../src/kernels/level2/xgemv_fast.opencl"
;
}
@ -60,8 +61,8 @@ class TuneXgemv {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
}
@ -72,7 +73,10 @@ class TuneXgemv {
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
}
}
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
}
// Sets the base thread configuration
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; }
@ -108,6 +112,9 @@ class TuneXgemv {
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(1);
tuner.AddArgumentScalar(0); // Conjugate transpose
tuner.AddArgumentScalar(0); // Additional parameter
tuner.AddArgumentScalar(0); // Banded 'kl'
tuner.AddArgumentScalar(0); // Banded 'ku'
}
// Describes how to compute the performance metrics

129
src/tuning/xger.cc Normal file
View file

@ -0,0 +1,129 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
//
// =================================================================================================
#include <string>
#include <vector>
#include "internal/utilities.h"
#include "internal/tuning.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TuneXger {
public:
// The representative kernel and the source code
static std::string KernelFamily() { return "xger"; }
static std::string KernelName() { return "Xger"; }
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level2/level2.opencl"
#include "../src/kernels/level2/xger.opencl"
;
}
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() { return {kArgN, kArgM, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
// Sets the default values for the arguments
static size_t DefaultM() { return 1024; }
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
static size_t GetSizeY(const Arguments<T> &args) { return args.n; }
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512});
tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256});
tuner.AddParameter(id, "WPT", {1, 2, 4});
}
// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
// Sets the base thread configuration
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
static std::vector<size_t> LocalSize() { return {1, 1}; }
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
// Transforms the thread configuration based on the parameters
using TransformVector = std::vector<std::vector<std::string>>;
static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; }
static TransformVector DivLocal() { return {}; }
static TransformVector MulGlobal() { return {}; }
static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; }
// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); // x_offset
tuner.AddArgumentScalar(1); // x_increment
tuner.AddArgumentInput(y_vec);
tuner.AddArgumentScalar(0); // y_offset
tuner.AddArgumentScalar(1); // y_increment
tuner.AddArgumentOutput(a_mat);
tuner.AddArgumentScalar(0); // a_offset
tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
tuner.AddArgumentScalar(0); // a_is_rowmajor
}
// Describes how to compute the performance metrics
static size_t GetMetric(const Arguments<T> &args) {
return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision);
}
static std::string PerformanceUnit() { return "GB/s"; }
};
// =================================================================================================
} // namespace clblast
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXger<double2>, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View file

@ -103,7 +103,13 @@ std::string ToString(Precision value) {
// both the real and imaginary parts.
template <typename T>
T ConvertArgument(const char* value) {
return static_cast<T>(std::stod(value));
return static_cast<T>(std::stoi(value));
}
template <> float ConvertArgument(const char* value) {
return static_cast<float>(std::stod(value));
}
template <> double ConvertArgument(const char* value) {
return static_cast<double>(std::stod(value));
}
template <> float2 ConvertArgument(const char* value) {
auto val = static_cast<float>(std::stod(value));
@ -139,7 +145,6 @@ T GetArgument(const int argc, char *argv[], std::string &help,
}
// Compiles the above function
template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
@ -156,9 +161,9 @@ template Precision GetArgument<Precision>(const int, char **, std::string&, cons
// =================================================================================================
// Returns only the precision argument
Precision GetPrecision(const int argc, char *argv[]) {
Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) {
auto dummy = std::string{};
return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
return GetArgument(argc, argv, dummy, kArgPrecision, default_precision);
}
// =================================================================================================

View file

@ -35,7 +35,7 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
const Routine run_routine, const Routine run_reference,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2):
Tester<T,U>{argc, argv, silent, name, options},
Tester<T,U>(argc, argv, silent, name, options),
run_routine_(run_routine),
run_reference_(run_reference),
get_result_(get_result),

View file

@ -80,11 +80,11 @@ template <typename T, typename U>
Tester<T,U>::~Tester() {
if (PrecisionSupported<T>(device_)) {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
fprintf(stdout, " %zu test(s) passed\n", tests_passed_);
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
fprintf(stdout, " %lu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
fprintf(stdout, " %lu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
clblasTeardown();
@ -129,29 +129,29 @@ void Tester<T,U>::TestEnd() {
fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
}
for (auto &o: options_) {
if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
if (o == kArgM) { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); }
if (o == kArgKU) { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); }
if (o == kArgKL) { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); }
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
if (o == kArgYOffset) { fprintf(stdout, "%s=%lu ", kArgYOffset, entry.args.y_offset);}
if (o == kArgALeadDim) { fprintf(stdout, "%s=%lu ", kArgALeadDim, entry.args.a_ld);}
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%lu ", kArgBLeadDim, entry.args.b_ld);}
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%lu ", kArgCLeadDim, entry.args.c_ld);}
if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
if (o == kArgXInc) { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);}
if (o == kArgYOffset) { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);}
if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);}
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);}
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);}
if (o == kArgAOffset) { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);}
if (o == kArgBOffset) { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);}
if (o == kArgCOffset) { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);}
if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);}
if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);}
}
fprintf(stdout, "\n");
}
@ -159,18 +159,18 @@ void Tester<T,U>::TestEnd() {
// Prints a test summary
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
fprintf(stdout, " %lu passed /", num_passed_);
fprintf(stdout, " %zu passed /", num_passed_);
if (num_skipped_ != 0) {
fprintf(stdout, " %s%lu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu skipped /", num_skipped_);
fprintf(stdout, " %zu skipped /", num_skipped_);
}
if (num_failed_ != 0) {
fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu failed\n", num_failed_);
fprintf(stdout, " %zu failed\n", num_failed_);
}
}
@ -280,21 +280,21 @@ bool TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = 1.0e-2;
constexpr auto kErrorMarginAbsolute = 1.0e-10;
constexpr auto kErrorMarginRelative = T{0.025};
constexpr auto kErrorMarginAbsolute = T{1.0e-6};
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
return (difference < kErrorMarginAbsolute);
}
// Use relative error
else {
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
return (difference / absolute_sum) < kErrorMarginRelative;
}
}

View file

@ -15,6 +15,7 @@
#include <string>
#include <vector>
#include <utility>
#include <algorithm>
#include <chrono>
@ -48,11 +49,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
for (auto &o: options_) {
// Data-sizes
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); }
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); }
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, size_t{512}); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, size_t{512}); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, size_t{512}); }
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, size_t{128}); }
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, size_t{128}); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -89,7 +90,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, true);
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@ -112,7 +113,7 @@ template <typename T, typename U>
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
// Prints the header of the output table
PrintTableHeader(args.silent, options_);
PrintTableHeader(args);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
@ -162,11 +163,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
// Runs the routines and collects the timings
auto timings = std::vector<std::pair<std::string, double>>();
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
if (args.compare_clblas) {
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
}
// Prints the performance of both libraries
PrintTableRow(args, ms_clblast, ms_clblas);
// Prints the performance of the tested libraries
PrintTableRow(args, timings);
// Makes the jump to the next step
++s;
@ -213,20 +219,27 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
// Prints the header of the performance table
template <typename T, typename U>
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
if (!silent) {
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
// First line (optional)
if (!args.silent) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
fprintf(stdout, " |\n");
}
for (auto &argument: args) { fprintf(stdout, "%9s;", argument.c_str()); }
fprintf(stdout, "%9s;%9s;%9s;%9s;%9s;%9s\n",
"ms_1", "GFLOPS_1", "GBs_1", "ms_2", "GFLOPS_2", "GBs_2");
// Second line
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
fprintf(stdout, "\n");
}
// Print a performance-result row
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
const double ms_clblas) {
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
@ -261,34 +274,36 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
}
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
// Outputs the argument values
for (auto &argument: integers) {
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
fprintf(stdout, "%8luM;", argument/(1024*1024));
fprintf(stdout, "%8zuM;", argument/(1024*1024));
}
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
fprintf(stdout, "%8luK;", argument/1024);
fprintf(stdout, "%8zuK;", argument/1024);
}
else {
fprintf(stdout, "%9lu;", argument);
fprintf(stdout, "%9zu;", argument);
}
}
for (auto &argument: strings) {
fprintf(stdout, "%9s;", argument.c_str());
}
// Outputs the performance numbers
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf;%9.2lf;%9.1lf;%9.1lf\n",
ms_clblast, gflops_clblast, gbs_clblast,
ms_clblas, gflops_clblas, gbs_clblas);
// Loops over all tested libraries
for (const auto& timing : timings) {
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
// Outputs the performance numbers
if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
}
fprintf(stdout, "\n");
}
// =================================================================================================

View file

@ -23,6 +23,7 @@
#include <string>
#include <vector>
#include <utility>
// The libraries to test
#include <clBLAS.h>
@ -64,10 +65,11 @@ class Client {
Queue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
void PrintTableHeader(const Arguments<U>& args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
void PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings);
// The routine-specific functions passed to the tester
const Routine run_routine_;

View file

@ -63,7 +63,7 @@ main <- function(routine_name, precision, test_names, test_values,
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./client_", routine_name, sep="")
executable <- paste("./clblast_client_", routine_name, sep="")
# Configures the outputfile
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

Some files were not shown because too many files have changed in this diff Show more