diff --git a/CMakeLists.txt b/CMakeLists.txt index 52accbd4..a5a41f35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF) option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) +# Select between an OpenCL API (default) or a CUDA API (beta) +option(OPENCL "Build CLBlast with an OpenCL API (default)" ON) +option(CUDA "Build CLBlast with a CUDA API (beta)" OFF) +if(NOT OPENCL AND NOT CUDA) + message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)") +endif() +if(OPENCL AND CUDA) + message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)") +endif() +if(OPENCL) + message("-- Building CLBlast with OpenCL API (default)") + add_definitions(-DOPENCL_API) +elseif(CUDA) + message("-- Building CLBlast with CUDA API (beta)") + add_definitions(-DCUDA_API) +endif() + # Compile in verbose mode with additional diagnostic messages option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) if(VERBOSE) @@ -123,8 +140,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") # Package scripts location set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/") -# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. -find_package(OpenCL REQUIRED) +if(OPENCL) + # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. + find_package(OpenCL REQUIRED) + set(API_LIBRARIES ${OPENCL_LIBRARIES}) + set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS}) +elseif(CUDA) + # For CUDA, the "FindCUDA.cmake" is part of CMake + find_package(CUDA REQUIRED) + set(API_LIBRARIES cuda nvrtc) + set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) +endif() # Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included. if(TUNERS) @@ -161,11 +188,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(DATABASES copy pad padtranspose transpose xaxpy xdot xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger) -set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) -if(NETLIB) - set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) -endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) @@ -173,6 +195,16 @@ set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) +if(OPENCL) + set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched) + set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) + if(NETLIB) + set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) + endif() +elseif(CUDA) + set(SAMPLE_PROGRAMS_CPP ) + set(SAMPLE_PROGRAMS_C ) +endif() # ================================================================================================== @@ -184,14 +216,10 @@ set(SOURCES src/utilities/utilities.cpp src/api_common.cpp src/cache.cpp - src/clblast.cpp - src/clblast_c.cpp src/routine.cpp src/routines/levelx/xinvert.cpp # only source, don't include it as a test ) set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio - include/clblast.h - include/clblast_c.h include/clblast_half.h src/database/apple_cpu_fallback.hpp src/database/database.hpp @@ -209,13 +237,19 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual src/utilities/msvc.hpp src/utilities/utilities.hpp src/cache.hpp - src/clpp11.hpp src/cxpp11_common.hpp src/routine.hpp ) -if(NETLIB) - set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) - set(HEADERS ${HEADERS} include/clblast_netlib_c.h) +if(OPENCL) + set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp) + set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp) + if(NETLIB) + set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) + set(HEADERS ${HEADERS} include/clblast_netlib_c.h) + endif() +elseif(CUDA) + set(SOURCES ${SOURCES} src/clblast_cuda.cpp) + set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp) endif() foreach(ROUTINE ${LEVEL1_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp) @@ -249,14 +283,14 @@ else(BUILD_SHARED_LIBS) add_library(clblast STATIC ${SOURCES} ${HEADERS}) endif() -target_link_libraries(clblast ${OPENCL_LIBRARIES}) +target_link_libraries(clblast ${API_LIBRARIES}) # Includes directories: CLBlast and OpenCL target_include_directories(clblast PUBLIC $ $ $ - ${OPENCL_INCLUDE_DIRS}) + ${API_INCLUDE_DIRS}) # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built if(MSVC) @@ -267,11 +301,15 @@ endif() # Installs the library install(TARGETS clblast EXPORT CLBlast DESTINATION lib) -install(FILES include/clblast.h DESTINATION include) -install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -if(NETLIB) - install(FILES include/clblast_netlib_c.h DESTINATION include) +if(OPENCL) + install(FILES include/clblast.h DESTINATION include) + install(FILES include/clblast_c.h DESTINATION include) + if(NETLIB) + install(FILES include/clblast_netlib_c.h DESTINATION include) + endif() +elseif(CUDA) + install(FILES include/clblast_cuda.h DESTINATION include) endif() # Installs the config for find_package in dependent projects @@ -291,19 +329,21 @@ endif() if(SAMPLES) # Downloads the cl.hpp file from Khronos - file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + if(OPENCL) + file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + endif() # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp) - target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin) endforeach() # Adds sample programs (C) foreach(SAMPLE ${SAMPLE_PROGRAMS_C}) add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c) - target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin) endforeach() @@ -324,7 +364,7 @@ if(TUNERS) # Adds tuning executables foreach(KERNEL ${KERNELS}) add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) - target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -429,7 +469,7 @@ if(CLIENTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -481,7 +521,7 @@ if(TESTS) test/routines/levelx/${ROUTINE}.hpp) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE}) @@ -492,7 +532,7 @@ if(TESTS) foreach(MISC_TEST ${MISC_TESTS}) add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON} test/correctness/misc/${MISC_TEST}.cpp) - target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_${MISC_TEST} PUBLIC $ ${clblast_SOURCE_DIR} ${REF_INCLUDES}) @@ -501,7 +541,7 @@ if(TESTS) # CLBlast diagnostics add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp) - target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES}) target_include_directories(clblast_test_diagnostics PUBLIC $ ${clblast_SOURCE_DIR} ${REF_INCLUDES}) diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h new file mode 100644 index 00000000..c125c302 --- /dev/null +++ b/include/clblast_cuda.h @@ -0,0 +1,643 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the +// definitions of the returned status codes and the layout and transpose types. This is the header +// users of the CUDA API of CLBlast should include and use. +// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_CUDA_H_ +#define CLBLAST_CLBLAST_CUDA_H_ + +#include // For size_t +#include // For OverrideParameters function +#include // For OverrideParameters function + +// CUDA +#include // CUDA driver API +#include // NVIDIA runtime compilation API + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +namespace clblast { +// ================================================================================================= + +// Status codes. These codes can be returned by functions declared in this header file. The error +// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes. +enum class StatusCode { + + // Status codes in common with the OpenCL standard + kSuccess = 0, // CUDA_SUCCESS + kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions + kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total + kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension + + // Status codes in common with the clBLAS library + kNotImplemented = -1024, // Routine or functionality not implemented yet + kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer + kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer + kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer + kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer + kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer + kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero + kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension + kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension + kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension + kInvalidIncrementX = -1013, // Increment of vector X cannot be zero + kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero + kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small + kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small + kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small + kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small + kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small + + // Custom additional status codes for CLBlast + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel + kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device + kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device + kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception +}; + +// Matrix layout and transpose types +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; + +// Precision scoped enum (values in bits) +enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset, + CUdeviceptr sb_buffer, const size_t sb_offset, + CUdeviceptr sc_buffer, const size_t sc_offset, + CUdeviceptr ss_buffer, const size_t ss_offset, + CUstream* stream); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset, + CUdeviceptr sd2_buffer, const size_t sd2_offset, + CUdeviceptr sx1_buffer, const size_t sx1_offset, + const CUdeviceptr sy1_buffer, const size_t sy1_offset, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const T cos, + const T sin, + CUstream* stream); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr sparam_buffer, const size_t sparam_offset, + CUstream* stream); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + CUstream* stream); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + CUstream* stream); + +// ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. +StatusCode PUBLIC_API ClearCache(); + +// The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API FillCache(const CUdevice device); + +// ================================================================================================= + +// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map ¶meters); + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CLBLAST_CUDA_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 0d34d7fe..520e3fc8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,6 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp +# clblast_cuda.h +# clblast_cuda.cpp # clblast_netlib_c.h # clblast_netlib_c.cpp # wrapper_clblas.h @@ -41,9 +43,11 @@ FILES = [ "/test/wrapper_cublas.hpp", "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", + "/include/clblast_cuda.h", + "/src/clblast_cuda.cpp", ] -HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32] -FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2] +HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21] +FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -224,6 +228,10 @@ def main(argv): if i == 8: if not routine.batched: body += cpp.clblast_netlib_c_cc(routine) + if i == 9: + body += cpp.clblast_h(routine, cuda=True) + if i == 10: + body += cpp.clblast_cc(routine, cuda=True) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 5fef3083..f1ee1959 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -36,19 +36,19 @@ HEADER = NL + SEPARATOR + """ """ + SEPARATOR + NL -def clblast_h(routine): +def clblast_h(routine, cuda=False): """The C++ API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL - result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL return result -def clblast_cc(routine): +def clblast_cc(routine, cuda=False): """The C++ API implementation (.cpp)""" indent1 = " " * (15 + routine.length()) result = NL + "// " + routine.description + ": " + routine.short_names() + NL if routine.implemented: - result += routine.routine_header_cpp(12, "") + " {" + NL + result += routine.routine_header_cpp(12, "", cuda) + " {" + NL result += " try {" + NL result += " auto queue_cpp = Queue(*queue);" + NL result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL @@ -60,14 +60,22 @@ def clblast_cc(routine): result += " return StatusCode::kSuccess;" + NL result += " } catch (...) { return DispatchException(); }" + NL else: - result += routine.routine_header_type_cpp(12) + " {" + NL + result += routine.routine_header_type_cpp(12, cuda) + " {" + NL result += " return StatusCode::kNotImplemented;" + NL result += "}" + NL for flavour in routine.flavours: indent2 = " " * (34 + routine.length() + len(flavour.template)) result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">(" - result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) - result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + arguments = routine.arguments_type(flavour) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] + result += ("," + NL + indent2).join([a for a in arguments]) + result += "," + NL + indent2 + if cuda: + result += "CUstream*" + else: + result += "cl_command_queue*, cl_event*" + result += ");" + NL return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index cef7db87..c3c1f775 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -802,22 +802,38 @@ class Routine: """Retrieves a list of routine requirements for documentation""" return self.requirements - def routine_header_cpp(self, spaces, default_event): + def routine_header_cpp(self, spaces, default_event, cuda=False): """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) + arguments = self.arguments_def(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) - result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue* queue, cl_event* event" + default_event + result += ")" return result - def routine_header_type_cpp(self, spaces): + def routine_header_type_cpp(self, spaces, cuda=False): """As above, but now without variable names""" indent = " " * (spaces + self.length()) + arguments = self.arguments_type(self.template) + if cuda: + arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" - result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) - result += ",\n" + indent + "cl_command_queue*, cl_event*)" + result += (",\n" + indent).join([a for a in arguments]) + result += ",\n" + indent + if cuda: + result += "CUstream* stream" + else: + result += "cl_command_queue*, cl_event*" + result += ")" return result def routine_header_c(self, flavour, spaces, extra_qualifier): diff --git a/src/api_common.cpp b/src/api_common.cpp index aa7e2b0f..0d387cd9 100644 --- a/src/api_common.cpp +++ b/src/api_common.cpp @@ -12,9 +12,9 @@ #include +#include "utilities/utilities.hpp" #include "cache.hpp" #include "routines/routines.hpp" -#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp new file mode 100644 index 00000000..5f30d023 --- /dev/null +++ b/src/clblast_cuda.cpp @@ -0,0 +1,2336 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements all the BLAS API calls (CUDA version). In all cases, it does not much more +// than creating a new object of the appropriate type, and calling the main routine on that object. +// It forwards all status codes to the caller. +// +// ================================================================================================= + +#include + +#include "routines/routines.hpp" +#include "clblast_cuda.h" + +namespace clblast { + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const T, + const T, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const float, + const float, + CUstream*); +template StatusCode PUBLIC_API Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const double, + const double, + CUstream*); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xswap(queue_cpp, event); + routine.DoSwap(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Swap(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xscal(queue_cpp, event); + routine.DoScal(n, + alpha, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Scal(const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Scal(const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xcopy(queue_cpp, event); + routine.DoCopy(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Copy(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xaxpy(queue_cpp, event); + routine.DoAxpy(n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Axpy(const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Axpy(const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdot(queue_cpp, event); + routine.DoDot(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dot(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotu(queue_cpp, event); + routine.DoDotu(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotu(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotu(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotc(queue_cpp, event); + routine.DoDotc(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotc(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Dotc(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xnrm2(queue_cpp, event); + routine.DoNrm2(n, + Buffer(nrm2_buffer), nrm2_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Nrm2(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xasum(queue_cpp, event); + routine.DoAsum(n, + Buffer(asum_buffer), asum_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Asum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsum(queue_cpp, event); + routine.DoSum(n, + Buffer(sum_buffer), sum_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sum(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamax(queue_cpp, event); + routine.DoAmax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amax(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamin(queue_cpp, event); + routine.DoAmin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Amin(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmax(queue_cpp, event); + routine.DoMax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Max(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmin(queue_cpp, event); + routine.DoMin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Min(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemv(queue_cpp, event); + routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgbmv(queue_cpp, event); + routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemv(queue_cpp, event); + routine.DoHemv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhbmv(queue_cpp, event); + routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpmv(queue_cpp, event); + routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymv(queue_cpp, event); + routine.DoSymv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsbmv(queue_cpp, event); + routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspmv(queue_cpp, event); + routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmv(queue_cpp, event); + routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtbmv(queue_cpp, event); + routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtpmv(queue_cpp, event); + routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream* stream) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xger(queue_cpp, event); + routine.DoGer(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgeru(queue_cpp, event); + routine.DoGeru(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgerc(queue_cpp, event); + routine.DoGerc(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher,T>(queue_cpp, event); + routine.DoHer(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr,T>(queue_cpp, event); + routine.DoHpr(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2(queue_cpp, event); + routine.DoHer2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr2(queue_cpp, event); + routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr(queue_cpp, event); + routine.DoSyr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr(queue_cpp, event); + routine.DoSpr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2(queue_cpp, event); + routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr2(queue_cpp, event); + routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemm(queue_cpp, event); + routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymm(queue_cpp, event); + routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemm(queue_cpp, event); + routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyrk(queue_cpp, event); + routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xherk,T>(queue_cpp, event); + routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer>(a_buffer), a_offset, a_ld, + beta, + Buffer>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2k(queue_cpp, event); + routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2k(queue_cpp, event); + routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmm(queue_cpp, event); + routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy(queue_cpp, event); + routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUstream*); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xim2col(queue_cpp, event); + routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + Buffer(im_buffer), im_offset, + Buffer(col_buffer), col_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); +template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUstream*); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XaxpyBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto x_offsets_cpp = std::vector(); + auto y_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer(x_buffer), x_offsets_cpp, x_inc, + Buffer(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API AxpyBatched(const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + CUstream* stream) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XgemmBatched(queue_cpp, event); + auto alphas_cpp = std::vector(); + auto betas_cpp = std::vector(); + auto a_offsets_cpp = std::vector(); + auto b_offsets_cpp = std::vector(); + auto c_offsets_cpp = std::vector(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + a_offsets_cpp.push_back(a_offsets[batch]); + b_offsets_cpp.push_back(b_offsets[batch]); + c_offsets_cpp.push_back(c_offsets[batch]); + } + routine.DoGemmBatched(layout, a_transpose, b_transpose, + m, n, k, + alphas_cpp, + Buffer(a_buffer), a_offsets_cpp, a_ld, + Buffer(b_buffer), b_offsets_cpp, b_ld, + betas_cpp, + Buffer(c_buffer), c_offsets_cpp, c_ld, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); +template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const half*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + CUstream*); + +// ================================================================================================= +} // namespace clblast diff --git a/src/cupp11.hpp b/src/cupp11.hpp new file mode 100644 index 00000000..988366ea --- /dev/null +++ b/src/cupp11.hpp @@ -0,0 +1,770 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the CLCudaAPI project and +// therefore contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ================================================================================================= + +#ifndef CLBLAST_CUPP11_H_ +#define CLBLAST_CUPP11_H_ + +// C++ +#include // std::copy +#include // std::string +#include // std::vector +#include // std::shared_ptr + +// CUDA +#include // CUDA driver API +#include // NVIDIA runtime compilation API + +// Exception classes +#include "cxpp11_common.hpp" + +namespace clblast { +// ================================================================================================= + +// Max-length of strings +constexpr auto kStringLength = 256; + +// ================================================================================================= + +// Represents a runtime error returned by a CUDA driver API function +class CLCudaAPIError : public ErrorCode { +public: + explicit CLCudaAPIError(CUresult status, const std::string &where): + ErrorCode(status, where, "CUDA error: " + where + ": " + + GetErrorName(status) + " --> " + GetErrorString(status)) { + } + + static void Check(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + throw CLCudaAPIError(status, where); + } + } + + static void CheckDtor(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); + } + } + +private: + std::string GetErrorName(CUresult status) const { + const char* status_code; + cuGetErrorName(status, &status_code); + return std::string(status_code); + } + std::string GetErrorString(CUresult status) const { + const char* status_string; + cuGetErrorString(status, &status_string); + return std::string(status_string); + } +}; + +// Represents a runtime error returned by a CUDA runtime compilation API function +class CLCudaAPINVRTCError : public ErrorCode { +public: + explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): + ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { + } + + static void Check(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + throw CLCudaAPINVRTCError(status, where); + } + } + + static void CheckDtor(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); + } + } + +private: + std::string GetErrorString(nvrtcResult status) const { + const char* status_string = nvrtcGetErrorString(status); + return std::string(status_string); + } +}; + +// Exception returned when building a program +using CLCudaAPIBuildError = CLCudaAPINVRTCError; + +// ================================================================================================= + +// Error occurred in CUDA driver or runtime compilation API +#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) +#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// ================================================================================================= + +// C++11 version of two 'CUevent' pointers +class Event { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Event(): + start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), + end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) { + CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); + CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); + } + + // Waits for completion of this event (not implemented for CUDA) + void WaitForCompletion() const { } + + // Retrieves the elapsed time of the last recorded event + float GetElapsedTime() const { + auto result = 0.0f; + cuEventElapsedTime(&result, *start_, *end_); + return result; + } + + // Accessors to the private data-members + const CUevent& start() const { return *start_; } + const CUevent& end() const { return *end_; } + Event* pointer() { return this; } +private: + std::shared_ptr start_; + std::shared_ptr end_; +}; + +// Pointer to a CUDA event +using EventPointer = Event*; + +// ================================================================================================= + +// Raw platform ID type +using RawPlatformID = size_t; + +// The CUDA platform: initializes the CUDA driver API +class Platform { +public: + + // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. + explicit Platform(const size_t platform_id) : platform_id_(0) { + if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } + CheckError(cuInit(0)); + } + + // Methods to retrieve platform information + std::string Name() const { return "CUDA"; } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + + // Returns the number of devices on this platform + size_t NumDevices() const { + auto result = 0; + CheckError(cuDeviceGetCount(&result)); + return static_cast(result); + } + + // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) + const RawPlatformID& operator()() const { return platform_id_; } +private: + const size_t platform_id_; +}; + +// Retrieves a vector with all platforms. Note that there is just one platform in CUDA. +inline std::vector GetAllPlatforms() { + auto all_platforms = std::vector{ Platform(size_t{0}) }; + return all_platforms; +} + +// ================================================================================================= + +// Raw device ID type +using RawDeviceID = CUdevice; + +// C++11 version of 'CUdevice' +class Device { +public: + + // Constructor based on the regular CUDA data-type + explicit Device(const CUdevice device): device_(device) { } + + // Initialization + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + + CheckError(cuDeviceGet(&device_, device_id)); + } + + // Methods to retrieve device information + RawPlatformID PlatformID() const { return 0; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast(result); + } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Name() const { + auto result = std::string{}; + result.resize(kStringLength); + CheckError(cuDeviceGetName(&result[0], result.size(), device_)); + return result; + } + std::string Type() const { return "GPU"; } + size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } + size_t MaxWorkItemDimensions() const { return size_t{3}; } + std::vector MaxWorkItemSizes() const { + return std::vector{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; + } + unsigned long LocalMemSize() const { + return static_cast(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); + } + + std::string Capabilities() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "SM"+std::to_string(major)+"."+std::to_string(minor); + } + bool HasExtension(const std::string &extension) const { return false; } + bool SupportsFP64() const { return true; } + bool SupportsFP16() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + if (major > 5) { return true; } // SM 6.x, 7.x and higher + if (major == 5 && minor == 3) { return true; } // SM 5.3 + return false; + } + + size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } + size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } + unsigned long MemorySize() const { + auto result = size_t{0}; + CheckError(cuDeviceTotalMem(&result, device_)); + return static_cast(result); + } + unsigned long MaxAllocSize() const { return MemorySize(); } + size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } + size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + + // Platform specific extensions + std::string AMDBoardName() const { return ""; } + std::string NVIDIAComputeCapability() const { return Capabilities(); } + + // Accessor to the private data-member + const RawDeviceID& operator()() const { return device_; } +private: + CUdevice device_; + + // Private helper function + size_t GetInfo(const CUdevice_attribute info) const { + auto result = 0; + CheckError(cuDeviceGetAttribute(&result, info, device_)); + return static_cast(result); + } +}; + +// ================================================================================================= + +// Raw context type +using RawContext = CUcontext; + +// C++11 version of 'CUcontext' +class Context { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Context(const CUcontext context): + context_(new CUcontext) { + *context_ = context; + } + + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new CUcontext, [](CUcontext* c) { + if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } + delete c; + }) { + CheckError(cuCtxCreate(context_.get(), 0, device())); + } + + // Accessor to the private data-member + const RawContext& operator()() const { return *context_; } + RawContext* pointer() const { return &(*context_); } +private: + std::shared_ptr context_; +}; + +// Pointer to a raw CUDA context +using ContextPointer = CUcontext*; + +// ================================================================================================= + +// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. +class Program { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &, std::string source): + program_(new nvrtcProgram, [](nvrtcProgram* p) { + if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } + delete p; + }), + source_(std::move(source)), + from_binary_(false) { + const auto source_ptr = &source_[0]; + CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); + } + + // PTX-based constructor + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(nullptr), // not used + source_(binary), + from_binary_(true) { + } + + // Compiles the device program and checks whether or not there are any warnings/errors + void Build(const Device &, std::vector &options) { + if (from_binary_) { return; } + auto raw_options = std::vector(); + for (const auto &option: options) { + raw_options.push_back(option.c_str()); + } + auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); + CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); + } + + // Confirms whether a certain status code is an actual compilation error or warning + bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { + return (status == NVRTC_ERROR_INVALID_INPUT); + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &) const { + if (from_binary_) { return std::string{}; } + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); + return result; + } + + // Retrieves an intermediate representation of the compiled program (i.e. PTX) + std::string GetIR() const { + if (from_binary_) { return source_; } // holds the PTX + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); + return result; + } + + // Accessor to the private data-member + const nvrtcProgram& operator()() const { return *program_; } +private: + std::shared_ptr program_; + const std::string source_; + const bool from_binary_; +}; + +// ================================================================================================= + +// Raw command-queue type +using RawCommandQueue = CUstream; + +// C++11 version of 'CUstream' +class Queue { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new CUstream, [](CUstream* s) { + if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } + delete s; + }), + context_(context), + device_(device) { + CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); + } + + // Synchronizes the queue and optionally also an event + void Finish(Event &event) const { + CheckError(cuEventSynchronize(event.end())); + Finish(); + } + void Finish() const { + CheckError(cuStreamSynchronize(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { return context_; } + Device GetDevice() const { return device_; } + + // Accessor to the private data-member + const RawCommandQueue& operator()() const { return *queue_; } +private: + std::shared_ptr queue_; + const Context context_; + const Device device_; +}; + +// ================================================================================================= + +// C++11 version of page-locked host memory +template +class BufferHost { +public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), + size_(size) { + CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return size_*sizeof(T); + } + + // Compatibility with std::vector + size_t size() const { return size_; } + T* begin() { return &static_cast(*buffer_)[0]; } + T* end() { return &static_cast(*buffer_)[size_-1]; } + T& operator[](const size_t i) { return static_cast(*buffer_)[i]; } + T* data() { return static_cast(*buffer_); } + const T* data() const { return static_cast(*buffer_); } + +private: + std::shared_ptr buffer_; + const size_t size_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'CUdeviceptr' +template +class Buffer { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Buffer(const CUdeviceptr buffer): + buffer_(new CUdeviceptr), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &, const BufferAccess access, const size_t size): + buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { + if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } + delete m; + }), + access_(access) { + CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { + auto size = static_cast(end - start); + auto pointer = &*start; + CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } + CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); + } + void ReadAsync(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } + CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { + CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto result = size_t{0}; + CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); + return result; + } + + // Accessors to the private data-members + CUdeviceptr operator()() const { return *buffer_; } + CUdeviceptr& operator()() { return *buffer_; } +private: + std::shared_ptr buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'CUfunction' +class Kernel { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Kernel(const CUmodule module, const CUfunction kernel): + module_(module), + kernel_(kernel) { + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name) { + CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); + CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); + } + + // Sets a kernel argument at the indicated position. This stores both the value of the argument + // (as raw bytes) and the index indicating where this value can be found. + template + void SetArgument(const size_t index, const T &value) { + if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } + arguments_indices_[index] = arguments_data_.size(); + for (auto j=size_t(0); j(&value)[j]); + } + } + template + void SetArgument(const size_t index, Buffer &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this resets all previously set + // arguments using 'SetArgument' or 'SetArguments'. + template + void SetArguments(Args&... args) { + arguments_indices_.clear(); + arguments_data_.clear(); + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel. Note that this the + // shared memory in CUDA terminology. + unsigned long LocalMemUsage(const Device &) const { + auto result = 0; + CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); + return static_cast(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + return std::string{"unknown"}; // Not implemented for the CUDA backend + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event) { + + // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) + auto grid = std::vector{1, 1, 1}; + auto block = std::vector{1, 1, 1}; + if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } + for (auto i=size_t{0}; i pointers; + for (auto &index: arguments_indices_) { + pointers.push_back(&arguments_data_[index]); + } + + // Launches the kernel, its execution time is recorded by events + CheckError(cuEventRecord(event->start(), queue())); + CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], + 0, queue(), pointers.data(), nullptr)); + CheckError(cuEventRecord(event->end(), queue())); + } + + // As above, but with an event waiting list + // TODO: Implement this function + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event, + std::vector& waitForEvents) { + if (local.size() == 0) { + throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); + } + else if (waitForEvents.size() != 0) { + throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); + } + else { + return Launch(queue, global, local, event); + } + } + + // Accessors to the private data-members + const CUfunction& operator()() const { return kernel_; } + CUfunction operator()() { return kernel_; } +private: + CUmodule module_; + CUfunction kernel_; + std::vector arguments_indices_; // Indices of the arguments + std::vector arguments_data_; // The arguments data as raw bytes + + // Internal implementation for the recursive SetArguments function. + template + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CUPP11_H_ +#endif diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index b5693181..a5b6be4b 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "clblast.h" +#include "utilities/utilities.hpp namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b2949c27..f56226be 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -21,8 +21,13 @@ #include #include -#include "clpp11.hpp" -#include "clblast.h" +#ifdef OPENCL_API + #include "clpp11.hpp" + #include "clblast.h" +#elif CUDA_API + #include "cupp11.hpp" + #include "clblast_cuda.h" +#endif #include "clblast_half.h" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp"