Added first (untested) version of a CUDA API

pull/204/head
Cedric Nugteren 2017-10-11 23:16:57 +02:00
parent 9224da19ef
commit b901809345
10 changed files with 3874 additions and 48 deletions

CMakeLists.txt

@@ -30,6 +30,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF)
 option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
 option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
 
+# Select between an OpenCL API (default) or a CUDA API (beta)
+option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
+option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
+if(NOT OPENCL AND NOT CUDA)
+  message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)")
+endif()
+if(OPENCL AND CUDA)
+  message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)")
+endif()
+if(OPENCL)
+  message("-- Building CLBlast with OpenCL API (default)")
+  add_definitions(-DOPENCL_API)
+elseif(CUDA)
+  message("-- Building CLBlast with CUDA API (beta)")
+  add_definitions(-DCUDA_API)
+endif()
 
 # Compile in verbose mode with additional diagnostic messages
 option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
 if(VERBOSE)
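These options make the two APIs mutually exclusive at configure time. A hypothetical pair of configure invocations, assuming a fresh build directory:

  cmake -DOPENCL=ON ..             # OpenCL API (default)
  cmake -DOPENCL=OFF -DCUDA=ON ..  # CUDA API (beta)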
@@ -123,8 +140,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
 # Package scripts location
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/")
 
-# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
-find_package(OpenCL REQUIRED)
+if(OPENCL)
+  # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
+  find_package(OpenCL REQUIRED)
+  set(API_LIBRARIES ${OPENCL_LIBRARIES})
+  set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS})
+elseif(CUDA)
+  # For CUDA, the "FindCUDA.cmake" module is part of CMake itself
+  find_package(CUDA REQUIRED)
+  set(API_LIBRARIES cuda nvrtc)
+  set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+endif()
 
 # Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
 if(TUNERS)
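Note that the CUDA back-end links against the CUDA driver library and NVRTC rather than the CUDA runtime. A dependent project would therefore link along these (illustrative) lines:

  g++ my_app.cpp -lclblast -lcuda -lnvrtc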
@@ -161,11 +188,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
     xgemm xgemm_direct xgemv)
 set(DATABASES copy pad padtranspose transpose xaxpy xdot
     xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
-set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
-set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
-if(NETLIB)
-  set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
-endif()
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -173,6 +195,16 @@ set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
 set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
+if(OPENCL)
+  set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
+  set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+  if(NETLIB)
+    set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
+  endif()
+elseif(CUDA)
+  set(SAMPLE_PROGRAMS_CPP )
+  set(SAMPLE_PROGRAMS_C )
+endif()
 
 # ==================================================================================================
@@ -184,14 +216,10 @@ set(SOURCES
   src/utilities/utilities.cpp
   src/api_common.cpp
   src/cache.cpp
-  src/clblast.cpp
-  src/clblast_c.cpp
   src/routine.cpp
   src/routines/levelx/xinvert.cpp # only source, don't include it as a test
 )
 set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
-  include/clblast.h
-  include/clblast_c.h
   include/clblast_half.h
   src/database/apple_cpu_fallback.hpp
   src/database/database.hpp
@@ -209,13 +237,19 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
   src/utilities/msvc.hpp
   src/utilities/utilities.hpp
   src/cache.hpp
-  src/clpp11.hpp
   src/cxpp11_common.hpp
   src/routine.hpp
 )
-if(NETLIB)
-  set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
-  set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
+if(OPENCL)
+  set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp)
+  set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp)
+  if(NETLIB)
+    set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
+    set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
+  endif()
+elseif(CUDA)
+  set(SOURCES ${SOURCES} src/clblast_cuda.cpp)
+  set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp)
 endif()
 
 foreach(ROUTINE ${LEVEL1_ROUTINES})
   set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
@@ -249,14 +283,14 @@ else(BUILD_SHARED_LIBS)
   add_library(clblast STATIC ${SOURCES} ${HEADERS})
 endif()
-target_link_libraries(clblast ${OPENCL_LIBRARIES})
+target_link_libraries(clblast ${API_LIBRARIES})
 
 # Includes directories: CLBlast and OpenCL
 target_include_directories(clblast PUBLIC
   $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
   $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
   $<INSTALL_INTERFACE:include>
-  ${OPENCL_INCLUDE_DIRS})
+  ${API_INCLUDE_DIRS})
 
 # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
 if(MSVC)
@@ -267,11 +301,15 @@ endif()
 
 # Installs the library
 install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
-install(FILES include/clblast.h DESTINATION include)
-install(FILES include/clblast_c.h DESTINATION include)
 install(FILES include/clblast_half.h DESTINATION include)
-if(NETLIB)
-  install(FILES include/clblast_netlib_c.h DESTINATION include)
+if(OPENCL)
+  install(FILES include/clblast.h DESTINATION include)
+  install(FILES include/clblast_c.h DESTINATION include)
+  if(NETLIB)
+    install(FILES include/clblast_netlib_c.h DESTINATION include)
+  endif()
+elseif(CUDA)
+  install(FILES include/clblast_cuda.h DESTINATION include)
 endif()
 
 # Installs the config for find_package in dependent projects
@@ -291,19 +329,21 @@ endif()
 
 if(SAMPLES)
 
-  # Downloads the cl.hpp file from Khronos
-  file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
+  if(OPENCL)
+    # Downloads the cl.hpp file from Khronos
+    file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
+  endif()
 
   # Adds sample programs (C++)
   foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
     add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
-    target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES})
     install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
   endforeach()
 
   # Adds sample programs (C)
   foreach(SAMPLE ${SAMPLE_PROGRAMS_C})
     add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c)
-    target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES})
     install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin)
   endforeach()
@@ -324,7 +364,7 @@ if(TUNERS)
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
     add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
-    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
@@ -429,7 +469,7 @@ if(CLIENTS)
                    test/routines/levelx/${ROUTINE}.hpp)
   endforeach()
   foreach(ROUTINE ${ROUTINES})
-    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
   endforeach()
@@ -481,7 +521,7 @@ if(TESTS)
                    test/routines/levelx/${ROUTINE}.hpp)
   endforeach()
   foreach(ROUTINE ${ROUTINES})
-    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
     target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
@@ -492,7 +532,7 @@ if(TESTS)
   foreach(MISC_TEST ${MISC_TESTS})
     add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
                    test/correctness/misc/${MISC_TEST}.cpp)
-    target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_test_${MISC_TEST} PUBLIC
                                $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
                                ${clblast_SOURCE_DIR} ${REF_INCLUDES})
@@ -501,7 +541,7 @@ if(TESTS)
 
   # CLBlast diagnostics
   add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp)
-  target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+  target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES})
   target_include_directories(clblast_test_diagnostics PUBLIC
                              $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
                              ${clblast_SOURCE_DIR} ${REF_INCLUDES})

include/clblast_cuda.h (new file, 643 lines)

@@ -0,0 +1,643 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the
// definitions of the returned status codes and the layout and transpose types. This is the header
// users of the CUDA API of CLBlast should include and use.
//
// =================================================================================================
#ifndef CLBLAST_CLBLAST_CUDA_H_
#define CLBLAST_CLBLAST_CUDA_H_
#include <cstdlib> // For size_t
#include <string> // For OverrideParameters function
#include <unordered_map> // For OverrideParameters function
// CUDA
#include <cuda.h> // CUDA driver API
#include <nvrtc.h> // NVIDIA runtime compilation API
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
namespace clblast {
// =================================================================================================
// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes.
enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CUDA_SUCCESS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kInvalidBatchCount = -2049, // The batch count needs to be positive
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types
enum class Layout { kRowMajor = 101, kColMajor = 102 };
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
enum class Triangle { kUpper = 121, kLower = 122 };
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate Givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset,
CUdeviceptr sb_buffer, const size_t sb_offset,
CUdeviceptr sc_buffer, const size_t sc_offset,
CUdeviceptr ss_buffer, const size_t ss_offset,
CUstream* stream);
// Generate modified Givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset,
CUdeviceptr sd2_buffer, const size_t sd2_offset,
CUdeviceptr sx1_buffer, const size_t sx1_offset,
const CUdeviceptr sy1_buffer, const size_t sy1_offset,
CUdeviceptr sparam_buffer, const size_t sparam_offset,
CUstream* stream);
// Apply Givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
const T cos,
const T sin,
CUstream* stream);
// Apply modified Givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr sparam_buffer, const size_t sparam_offset,
CUstream* stream);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
StatusCode Dotu(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
StatusCode Dotc(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
CUdeviceptr asum_buffer, const size_t asum_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
CUdeviceptr sum_buffer, const size_t sum_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
CUdeviceptr imax_buffer, const size_t imax_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
template <typename T>
StatusCode Amin(const size_t n,
CUdeviceptr imin_buffer, const size_t imin_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
CUdeviceptr imax_buffer, const size_t imax_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
CUdeviceptr imin_buffer, const size_t imin_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
StatusCode Hemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
StatusCode Hbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
StatusCode Hpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr ap_buffer, const size_t ap_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr ap_buffer, const size_t ap_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
template <typename T>
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
template <typename T>
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr ap_buffer, const size_t ap_offset,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr ap_buffer, const size_t ap_offset,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
StatusCode Geru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
StatusCode Gerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
template <typename T>
StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const CUdeviceptr im_buffer, const size_t im_offset,
CUdeviceptr col_buffer, const size_t col_offset,
CUstream* stream);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
CUstream* stream);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
CUstream* stream);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode PUBLIC_API ClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode PUBLIC_API FillCache(const CUdevice device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_CLBLAST_CUDA_H_
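To illustrate how this header is meant to be used, below is a minimal SAXPY sketch against the declarations above. Everything except the clblast::Axpy call is plain CUDA driver API setup; since this commit is explicitly untested, treat it as an illustration rather than verified usage (error checking omitted for brevity):

#include <cuda.h>
#include <vector>
#include "clblast_cuda.h"

int main() {
  cuInit(0);
  CUdevice device; cuDeviceGet(&device, 0);
  CUcontext context; cuCtxCreate(&context, 0, device);  // also makes the context current
  const auto n = size_t{1024};
  auto host_x = std::vector<float>(n, 1.0f);
  auto host_y = std::vector<float>(n, 2.0f);
  CUdeviceptr x, y;
  cuMemAlloc(&x, n * sizeof(float));
  cuMemAlloc(&y, n * sizeof(float));
  cuMemcpyHtoD(x, host_x.data(), n * sizeof(float));
  cuMemcpyHtoD(y, host_y.data(), n * sizeof(float));
  CUstream stream; cuStreamCreate(&stream, CU_STREAM_DEFAULT);
  const auto status = clblast::Axpy<float>(n, 2.0f,  // computes y := 2 * x + y
                                           x, 0, 1,
                                           y, 0, 1,
                                           &stream);
  cuStreamSynchronize(stream);
  cuMemcpyDtoH(host_y.data(), y, n * sizeof(float));
  cuMemFree(x); cuMemFree(y); cuCtxDestroy(context);
  return (status == clblast::StatusCode::kSuccess) ? 0 : 1;
}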

scripts/generator/generator.py

@@ -12,6 +12,8 @@
 #  clblast.cpp
 #  clblast_c.h
 #  clblast_c.cpp
+#  clblast_cuda.h
+#  clblast_cuda.cpp
 #  clblast_netlib_c.h
 #  clblast_netlib_c.cpp
 #  wrapper_clblas.h
@@ -41,9 +43,11 @@ FILES = [
     "/test/wrapper_cublas.hpp",
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
+    "/include/clblast_cuda.h",
+    "/src/clblast_cuda.cpp",
 ]
-HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32]
-FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2]
+HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21]
+FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 63
@@ -224,6 +228,10 @@ def main(argv):
             if i == 8:
                 if not routine.batched:
                     body += cpp.clblast_netlib_c_cc(routine)
+            if i == 9:
+                body += cpp.clblast_h(routine, cuda=True)
+            if i == 10:
+                body += cpp.clblast_cc(routine, cuda=True)
         f.write("".join(file_header))
         f.write(body)
         f.write("".join(file_footer))

scripts/generator/generator/cpp.py

@@ -36,19 +36,19 @@ HEADER = NL + SEPARATOR + """
 """ + SEPARATOR + NL
 
 
-def clblast_h(routine):
+def clblast_h(routine, cuda=False):
     """The C++ API header (.h)"""
     result = NL + "// " + routine.description + ": " + routine.short_names() + NL
-    result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL
+    result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL
     return result
 
 
-def clblast_cc(routine):
+def clblast_cc(routine, cuda=False):
     """The C++ API implementation (.cpp)"""
     indent1 = " " * (15 + routine.length())
     result = NL + "// " + routine.description + ": " + routine.short_names() + NL
     if routine.implemented:
-        result += routine.routine_header_cpp(12, "") + " {" + NL
+        result += routine.routine_header_cpp(12, "", cuda) + " {" + NL
         result += "  try {" + NL
         result += "    auto queue_cpp = Queue(*queue);" + NL
         result += "    auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL
@@ -60,14 +60,22 @@ def clblast_cc(routine):
         result += "    return StatusCode::kSuccess;" + NL
         result += "  } catch (...) { return DispatchException(); }" + NL
     else:
-        result += routine.routine_header_type_cpp(12) + " {" + NL
+        result += routine.routine_header_type_cpp(12, cuda) + " {" + NL
         result += "  return StatusCode::kNotImplemented;" + NL
         result += "}" + NL
     for flavour in routine.flavours:
         indent2 = " " * (34 + routine.length() + len(flavour.template))
         result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
-        result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
-        result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
+        arguments = routine.arguments_type(flavour)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
+        result += ("," + NL + indent2).join([a for a in arguments])
+        result += "," + NL + indent2
+        if cuda:
+            result += "CUstream*"
+        else:
+            result += "cl_command_queue*, cl_event*"
+        result += ");" + NL
     return result
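To make the effect of the cuda=True path concrete: for AXPY, the generator emits the following two declarations. The OpenCL form below is the pre-existing clblast.h declaration; the CUDA form follows from the cl_mem-to-CUdeviceptr substitution and the CUstream* tail shown above, and matches the clblast_cuda.h header earlier in this commit (a reconstruction for illustration, not captured generator output):

// OpenCL API (clblast.h):
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event = nullptr);

// CUDA API (clblast_cuda.h):
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUstream* stream);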

scripts/generator/generator/routine.py

@@ -802,22 +802,38 @@ class Routine:
         """Retrieves a list of routine requirements for documentation"""
         return self.requirements
 
-    def routine_header_cpp(self, spaces, default_event):
+    def routine_header_cpp(self, spaces, default_event, cuda=False):
         """Retrieves the C++ templated definition for a routine"""
         indent = " " * (spaces + self.length())
+        arguments = self.arguments_def(self.template)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
         result = "template <" + self.template.name + ">\n"
         result += "StatusCode " + self.capitalized_name() + "("
-        result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
-        result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
+        result += (",\n" + indent).join([a for a in arguments])
+        result += ",\n" + indent
+        if cuda:
+            result += "CUstream* stream"
+        else:
+            result += "cl_command_queue* queue, cl_event* event" + default_event
+        result += ")"
         return result
 
-    def routine_header_type_cpp(self, spaces):
+    def routine_header_type_cpp(self, spaces, cuda=False):
         """As above, but now without variable names"""
         indent = " " * (spaces + self.length())
+        arguments = self.arguments_type(self.template)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
        result = "template <" + self.template.name + ">\n"
         result += "StatusCode " + self.capitalized_name() + "("
-        result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
-        result += ",\n" + indent + "cl_command_queue*, cl_event*)"
+        result += (",\n" + indent).join([a for a in arguments])
+        result += ",\n" + indent
+        if cuda:
+            result += "CUstream* stream"
+        else:
+            result += "cl_command_queue*, cl_event*"
+        result += ")"
         return result
 
     def routine_header_c(self, flavour, spaces, extra_qualifier):

src/clblast.cpp

@@ -12,9 +12,9 @@
 #include <string>
 
+#include "utilities/utilities.hpp"
 #include "cache.hpp"
 #include "routines/routines.hpp"
-#include "clblast.h"
 
 namespace clblast {
 
 // =================================================================================================

src/clblast_cuda.cpp (new file, 2336 lines)

File diff suppressed because it is too large.

src/cupp11.hpp (new file, 770 lines)

@@ -0,0 +1,770 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
// Portability here means that a similar header exists for OpenCL with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
// Copyright 2015 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================
#ifndef CLBLAST_CUPP11_H_
#define CLBLAST_CUPP11_H_
// C++
#include <algorithm> // std::copy
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
// CUDA
#include <cuda.h> // CUDA driver API
#include <nvrtc.h> // NVIDIA runtime compilation API
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Max-length of strings
constexpr auto kStringLength = 256;
// =================================================================================================
// Represents a runtime error returned by a CUDA driver API function
class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
public:
explicit CLCudaAPIError(CUresult status, const std::string &where):
ErrorCode(status, where, "CUDA error: " + where + ": " +
GetErrorName(status) + " --> " + GetErrorString(status)) {
}
static void Check(const CUresult status, const std::string &where) {
if (status != CUDA_SUCCESS) {
throw CLCudaAPIError(status, where);
}
}
static void CheckDtor(const CUresult status, const std::string &where) {
if (status != CUDA_SUCCESS) {
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
}
}
private:
std::string GetErrorName(CUresult status) const {
const char* status_code;
cuGetErrorName(status, &status_code);
return std::string(status_code);
}
std::string GetErrorString(CUresult status) const {
const char* status_string;
cuGetErrorString(status, &status_string);
return std::string(status_string);
}
};
// Represents a runtime error returned by a CUDA runtime compilation API function
class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
public:
explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
}
static void Check(const nvrtcResult status, const std::string &where) {
if (status != NVRTC_SUCCESS) {
throw CLCudaAPINVRTCError(status, where);
}
}
static void CheckDtor(const nvrtcResult status, const std::string &where) {
if (status != NVRTC_SUCCESS) {
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
}
}
private:
std::string GetErrorString(nvrtcResult status) const {
const char* status_string = nvrtcGetErrorString(status);
return std::string(status_string);
}
};
// Exception returned when building a program
using CLCudaAPIBuildError = CLCudaAPINVRTCError;
// =================================================================================================
// Error occurred in CUDA driver or runtime compilation API
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))
// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
// =================================================================================================
// C++11 version of two 'CUevent' pointers
class Event {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Regular constructor with memory management
explicit Event():
start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }),
end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) {
CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
}
// Waits for completion of this event (not implemented for CUDA)
void WaitForCompletion() const { }
// Retrieves the elapsed time of the last recorded event
float GetElapsedTime() const {
auto result = 0.0f;
cuEventElapsedTime(&result, *start_, *end_);
return result;
}
// Accessors to the private data-members
const CUevent& start() const { return *start_; }
const CUevent& end() const { return *end_; }
Event* pointer() { return this; }
private:
std::shared_ptr<CUevent> start_;
std::shared_ptr<CUevent> end_;
};
// Pointer to a CUDA event
using EventPointer = Event*;
// =================================================================================================
// Raw platform ID type
using RawPlatformID = size_t;
// The CUDA platform: initializes the CUDA driver API
class Platform {
public:
// Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
explicit Platform(const size_t platform_id) : platform_id_(0) {
if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
CheckError(cuInit(0));
}
// Methods to retrieve platform information
std::string Name() const { return "CUDA"; }
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Version() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = 0;
CheckError(cuDeviceGetCount(&result));
return static_cast<size_t>(result);
}
// Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
const RawPlatformID& operator()() const { return platform_id_; }
private:
const size_t platform_id_;
};
// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
inline std::vector<Platform> GetAllPlatforms() {
auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
return all_platforms;
}
// =================================================================================================
// Raw device ID type
using RawDeviceID = CUdevice;
// C++11 version of 'CUdevice'
class Device {
public:
// Constructor based on the regular CUDA data-type
explicit Device(const CUdevice device): device_(device) { }
// Initialization
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
CheckError(cuDeviceGet(&device_, device_id));
}
// Methods to retrieve device information
RawPlatformID PlatformID() const { return 0; }
std::string Version() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
size_t VersionNumber() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return static_cast<size_t>(result);
}
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Name() const {
auto result = std::string{};
result.resize(kStringLength);
CheckError(cuDeviceGetName(&result[0], static_cast<int>(result.size()), device_));
result.resize(result.find('\0')); // trims the '\0' padding from the fixed-size buffer
return result;
}
std::string Type() const { return "GPU"; }
size_t MaxWorkGroupSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); }
size_t MaxWorkItemDimensions() const { return size_t{3}; }
std::vector<size_t> MaxWorkItemSizes() const {
return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
}
unsigned long LocalMemSize() const {
return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
}
std::string Capabilities() const {
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
return "SM"+std::to_string(major)+"."+std::to_string(minor);
}
bool HasExtension(const std::string &) const { return false; } // no OpenCL-style extensions in CUDA
bool SupportsFP64() const { return true; }
bool SupportsFP16() const {
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
if (major > 5) { return true; } // SM 6.x, 7.x and higher
if (major == 5 && minor == 3) { return true; } // SM 5.3
return false;
}
size_t CoreClock() const { return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE)); } // kHz to MHz
size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
unsigned long MemorySize() const {
auto result = size_t{0};
CheckError(cuDeviceTotalMem(&result, device_));
return static_cast<unsigned long>(result);
}
unsigned long MaxAllocSize() const { return MemorySize(); }
size_t MemoryClock() const { return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE)); } // kHz to MHz
size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
// Configuration-validity checks
bool IsLocalMemoryValid(const size_t local_mem_usage) const {
return (local_mem_usage <= LocalMemSize());
}
bool IsThreadConfigValid(const std::vector<size_t> &local) const {
auto local_size = size_t{1};
for (const auto &item: local) { local_size *= item; }
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > MaxWorkItemSizes()[i]) { return false; }
}
if (local_size > MaxWorkGroupSize()) { return false; }
if (local.size() > MaxWorkItemDimensions()) { return false; }
return true;
}
// Query for a specific type of device or brand
bool IsCPU() const { return false; }
bool IsGPU() const { return true; }
bool IsAMD() const { return false; }
bool IsNVIDIA() const { return true; }
bool IsIntel() const { return false; }
bool IsARM() const { return false; }
// Platform specific extensions
std::string AMDBoardName() const { return ""; }
std::string NVIDIAComputeCapability() const { return Capabilities(); }
// Accessor to the private data-member
const RawDeviceID& operator()() const { return device_; }
private:
CUdevice device_;
// Private helper function
size_t GetInfo(const CUdevice_attribute info) const {
auto result = 0;
CheckError(cuDeviceGetAttribute(&result, info, device_));
return static_cast<size_t>(result);
}
};
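// Example (illustrative sketch): querying properties of the first device. These accessors mirror
// their OpenCL counterparts but are backed by 'cuDeviceGetAttribute' under the hood.
//
//   const auto platform = Platform(size_t{0});
//   const auto device = Device(platform, size_t{0});
//   printf("%s (%s): %zu SMs, max %zu threads/block\n",
//          device.Name().c_str(), device.Capabilities().c_str(),
//          device.ComputeUnits(), device.MaxWorkGroupSize());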
// =================================================================================================
// Raw context type
using RawContext = CUcontext;
// C++11 version of 'CUcontext'
class Context {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Context(const CUcontext context):
context_(new CUcontext) {
*context_ = context;
}
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new CUcontext, [](CUcontext* c) {
if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
delete c;
}) {
CheckError(cuCtxCreate(context_.get(), 0, device()));
}
// Accessor to the private data-member
const RawContext& operator()() const { return *context_; }
RawContext* pointer() const { return &(*context_); }
private:
std::shared_ptr<CUcontext> context_;
};
// Pointer to a raw CUDA context
using ContextPointer = CUcontext*;
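// Example (illustrative sketch): a context is created from a device and is then passed to the
// other classes in this header; copies share ownership of the underlying 'CUcontext'.
//
//   const auto context = Context(device);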
// =================================================================================================
// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code.
class Program {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Source-based constructor with memory management
explicit Program(const Context &, std::string source):
program_(new nvrtcProgram, [](nvrtcProgram* p) {
if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
delete p;
}),
source_(std::move(source)),
from_binary_(false) {
CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_.c_str(), nullptr, 0, nullptr, nullptr));
}
// PTX-based constructor
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(nullptr), // not used
source_(binary),
from_binary_(true) {
}
// Compiles the device program and checks whether or not there are any warnings/errors
void Build(const Device &, std::vector<std::string> &options) {
if (from_binary_) { return; }
auto raw_options = std::vector<const char*>();
for (const auto &option: options) {
raw_options.push_back(option.c_str());
}
auto status = nvrtcCompileProgram(*program_, static_cast<int>(raw_options.size()), raw_options.data());
CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
}
// Confirms whether a certain status code is an actual compilation error or warning
bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
return (status == NVRTC_ERROR_COMPILATION || status == NVRTC_ERROR_INVALID_INPUT);
}
// Retrieves the warning/error message from the compiler (if any)
std::string GetBuildInfo(const Device &) const {
if (from_binary_) { return std::string{}; }
auto bytes = size_t{0};
CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
return result;
}
// Retrieves an intermediate representation of the compiled program (i.e. PTX)
std::string GetIR() const {
if (from_binary_) { return source_; } // holds the PTX
auto bytes = size_t{0};
CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
return result;
}
// Accessor to the private data-member
const nvrtcProgram& operator()() const { return *program_; }
private:
std::shared_ptr<nvrtcProgram> program_;
const std::string source_;
const bool from_binary_;
};
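// Example (illustrative sketch): run-time compilation of a toy kernel with NVRTC. The kernel
// source here is hypothetical; note the 'extern "C"' to prevent C++ name mangling. Compilation
// failures throw, after which the log can be inspected through 'GetBuildInfo'.
//
//   const auto source = std::string{"extern \"C\" __global__ void scale(float* x) {"
//                                   "  x[threadIdx.x] *= 2.0f; }"};
//   auto program = Program(context, source);
//   auto options = std::vector<std::string>{};
//   program.Build(device, options); // may throw; see program.GetBuildInfo(device) on failure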
// =================================================================================================
// Raw command-queue type
using RawCommandQueue = CUstream;
// C++11 version of 'CUstream'
class Queue {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new CUstream, [](CUstream* s) {
if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
delete s;
}),
context_(context),
device_(device) {
CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
}
// Synchronizes the queue and optionally also an event
void Finish(Event &event) const {
CheckError(cuEventSynchronize(event.end()));
Finish();
}
void Finish() const {
CheckError(cuStreamSynchronize(*queue_));
}
// Retrieves the corresponding context or device
Context GetContext() const { return context_; }
Device GetDevice() const { return device_; }
// Accessor to the private data-member
const RawCommandQueue& operator()() const { return *queue_; }
private:
std::shared_ptr<CUstream> queue_;
const Context context_;
const Device device_;
};
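// Example (illustrative sketch): a queue wraps a non-blocking CUDA stream tied to a context and
// a device; 'Finish' maps onto 'cuStreamSynchronize'.
//
//   auto queue = Queue(context, device);
//   // ... enqueue reads/writes/kernels on 'queue' ...
//   queue.Finish(); // blocks until all enqueued work has completed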
// =================================================================================================
// C++11 version of page-locked host memory
template <typename T>
class BufferHost {
public:
// Regular constructor with memory management
explicit BufferHost(const Context &, const size_t size):
buffer_(new void*, [](void** m) { CheckErrorDtor(cuMemFreeHost(*m)); delete m; }),
size_(size) {
CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
}
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
return size_*sizeof(T);
}
// Compatibility with std::vector
size_t size() const { return size_; }
T* begin() { return &static_cast<T*>(*buffer_)[0]; }
T* end() { return &static_cast<T*>(*buffer_)[size_]; } // past-the-end, as for std::vector
T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; }
T* data() { return static_cast<T*>(*buffer_); }
const T* data() const { return static_cast<T*>(*buffer_); }
private:
std::shared_ptr<void*> buffer_;
const size_t size_;
};
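// Example (illustrative sketch): page-locked host memory typically transfers faster to and from
// the device than pageable memory such as regular std::vector storage.
//
//   auto host_mem = BufferHost<float>(context, size_t{1024});
//   for (auto &item: host_mem) { item = 1.0f; } // begin()/end() make range-for work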
// =================================================================================================
// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
// C++11 version of 'CUdeviceptr'
template <typename T>
class Buffer {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Buffer(const CUdeviceptr buffer):
buffer_(new CUdeviceptr),
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &, const BufferAccess access, const size_t size):
buffer_(new CUdeviceptr, [access](CUdeviceptr* m) {
if (access != BufferAccess::kNotOwned) { CheckErrorDtor(cuMemFree(*m)); }
delete m;
}),
access_(access) {
CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
}
// As above, but now with read/write access as a default
explicit Buffer(const Context &context, const size_t size):
Buffer<T>(context, BufferAccess::kReadWrite, size) {
}
// Constructs a new buffer based on an existing host-container
template <typename Iterator>
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
auto size = static_cast<size_t>(end - start);
auto pointer = &*start;
CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
queue.Finish();
}
// Copies from device to host: reading the device buffer asynchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: source device buffer is too small");
}
CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer asynchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
}
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
WriteAsync(queue, size, host, offset);
queue.Finish();
}
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
// Copies the contents of this buffer into another device buffer
void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
}
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
queue.Finish();
}
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
auto result = size_t{0};
CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
return result;
}
// Accessors to the private data-members
CUdeviceptr operator()() const { return *buffer_; }
CUdeviceptr& operator()() { return *buffer_; }
private:
std::shared_ptr<CUdeviceptr> buffer_;
const BufferAccess access_;
};
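// Example (illustrative sketch): a blocking round-trip through device memory. The *Async
// variants return before the copy completes and require an explicit queue.Finish() afterwards.
//
//   auto host = std::vector<float>(1024, 1.0f);
//   auto device_buffer = Buffer<float>(context, host.size()); // read-write by default
//   device_buffer.Write(queue, host.size(), host);
//   device_buffer.Read(queue, host.size(), host);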
// =================================================================================================
// C++11 version of 'CUfunction'
class Kernel {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Kernel(const CUmodule module, const CUfunction kernel):
module_(module),
kernel_(kernel) {
}
// Regular constructor: loads the program's PTX into a module and retrieves the kernel from it.
// Note: the CUDA module is currently not unloaded automatically.
explicit Kernel(const Program &program, const std::string &name) {
CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr));
CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str()));
}
}
// Sets a kernel argument at the indicated position. This stores both the value of the argument
// (as raw bytes) and the index indicating where this value can be found.
template <typename T>
void SetArgument(const size_t index, const T &value) {
if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); }
arguments_indices_[index] = arguments_data_.size();
for (auto j=size_t(0); j<sizeof(T); ++j) {
arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]);
}
}
template <typename T>
void SetArgument(const size_t index, Buffer<T> &value) {
SetArgument(index, value());
}
// Sets all arguments in one go using parameter packs. Note that this resets all previously set
// arguments using 'SetArgument' or 'SetArguments'.
template <typename... Args>
void SetArguments(Args&... args) {
arguments_indices_.clear();
arguments_data_.clear();
SetArgumentsRecursive(0, args...);
}
// Retrieves the amount of local memory used per work-group for this kernel. Note that this is
// the shared memory in CUDA terminology.
unsigned long LocalMemUsage(const Device &) const {
auto result = 0;
CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
std::string GetFunctionName() const {
return std::string{"unknown"}; // Not implemented for the CUDA backend
}
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event) {
// Creates the grid (number of threadblocks) and sets the block sizes (threads per block).
// Note: this assumes that each global size is a multiple of the corresponding local size.
auto grid = std::vector<size_t>{1, 1, 1};
auto block = std::vector<size_t>{1, 1, 1};
if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; }
for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; }
// Creates the array of pointers from the arrays of indices & data
std::vector<void*> pointers;
for (auto &index: arguments_indices_) {
pointers.push_back(&arguments_data_[index]);
}
// Launches the kernel, its execution time is recorded by events
CheckError(cuEventRecord(event->start(), queue()));
CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
0, queue(), pointers.data(), nullptr));
CheckError(cuEventRecord(event->end(), queue()));
}
// As above, but with an event waiting list. Waiting lists and default workgroup sizes are not
// supported by the CUDA back-end, so those cases throw; otherwise this forwards to 'Launch'.
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (local.size() == 0) {
throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
}
else if (waitForEvents.size() != 0) {
throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end");
}
else {
return Launch(queue, global, local, event);
}
}
// Accessors to the private data-members
const CUfunction& operator()() const { return kernel_; }
CUfunction operator()() { return kernel_; }
private:
CUmodule module_;
CUfunction kernel_;
std::vector<size_t> arguments_indices_; // Indices of the arguments
std::vector<char> arguments_data_; // The arguments data as raw bytes
// Internal implementation for the recursive SetArguments function.
template <typename T>
void SetArgumentsRecursive(const size_t index, T &first) {
SetArgument(index, first);
}
template <typename T, typename... Args>
void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
SetArgument(index, first);
SetArgumentsRecursive(index+1, args...);
}
};
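// Example (illustrative sketch): putting it all together for the hypothetical 'scale' kernel
// compiled in the Program example above: 1024 threads in work-groups (thread blocks) of 64.
//
//   auto kernel = Kernel(program, "scale");
//   kernel.SetArguments(device_buffer);
//   auto event = Event();
//   kernel.Launch(queue, {1024}, {64}, event.pointer());
//   queue.Finish(event);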
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_CUPP11_H_

View File

@@ -15,7 +15,7 @@
 #ifndef CLBLAST_BUFFER_TEST_H_
 #define CLBLAST_BUFFER_TEST_H_
-#include "clblast.h"
+#include "utilities/utilities.hpp"
 namespace clblast {
 // =================================================================================================

View File

@@ -21,8 +21,13 @@
 #include <complex>
 #include <random>
-#include "clpp11.hpp"
-#include "clblast.h"
+#ifdef OPENCL_API
+  #include "clpp11.hpp"
+  #include "clblast.h"
+#elif CUDA_API
+  #include "cupp11.hpp"
+  #include "clblast_cuda.h"
+#endif
 #include "clblast_half.h"
 #include "utilities/clblast_exceptions.hpp"
 #include "utilities/msvc.hpp"