Added first (untested) version of a CUDA API

pull/204/head
Cedric Nugteren 2017-10-11 23:16:57 +02:00
parent 9224da19ef
commit b901809345
10 changed files with 3874 additions and 48 deletions

CMakeLists.txt

@@ -30,6 +30,23 @@ option(TESTS "Enable compilation of the correctness tests" OFF)
 option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
 option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
 
+# Select between an OpenCL API (default) or a CUDA API (beta)
+option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
+option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
+if(NOT OPENCL AND NOT CUDA)
+  message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)")
+endif()
+if(OPENCL AND CUDA)
+  message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)")
+endif()
+if(OPENCL)
+  message("-- Building CLBlast with OpenCL API (default)")
+  add_definitions(-DOPENCL_API)
+elseif(CUDA)
+  message("-- Building CLBlast with CUDA API (beta)")
+  add_definitions(-DCUDA_API)
+endif()
 
 # Compile in verbose mode with additional diagnostic messages
 option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
 if(VERBOSE)
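These options make the two APIs mutually exclusive at configure time. A hypothetical pair of configure invocations, assuming a fresh build directory:

  cmake -DOPENCL=ON ..             # OpenCL API (default)
  cmake -DOPENCL=OFF -DCUDA=ON ..  # CUDA API (beta)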
@@ -123,8 +140,18 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
 # Package scripts location
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/")
 
-# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
-find_package(OpenCL REQUIRED)
+if(OPENCL)
+  # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
+  find_package(OpenCL REQUIRED)
+  set(API_LIBRARIES ${OPENCL_LIBRARIES})
+  set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS})
+elseif(CUDA)
+  # For CUDA, the "FindCUDA.cmake" module is part of CMake itself
+  find_package(CUDA REQUIRED)
+  set(API_LIBRARIES cuda nvrtc)
+  set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+endif()
 
 # Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
 if(TUNERS)
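Note that the CUDA back-end links against the CUDA driver library and NVRTC rather than the CUDA runtime. A dependent project would therefore link along these (illustrative) lines:

  g++ my_app.cpp -lclblast -lcuda -lnvrtc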
@@ -161,11 +188,6 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
     xgemm xgemm_direct xgemv)
 set(DATABASES copy pad padtranspose transpose xaxpy xdot
     xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
-set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
-set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
-if(NETLIB)
-  set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
-endif()
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -173,6 +195,16 @@ set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
 set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
+if(OPENCL)
+  set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
+  set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+  if(NETLIB)
+    set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
+  endif()
+elseif(CUDA)
+  set(SAMPLE_PROGRAMS_CPP )
+  set(SAMPLE_PROGRAMS_C )
+endif()
 
 # ==================================================================================================
@@ -184,14 +216,10 @@ set(SOURCES
   src/utilities/utilities.cpp
   src/api_common.cpp
   src/cache.cpp
-  src/clblast.cpp
-  src/clblast_c.cpp
   src/routine.cpp
   src/routines/levelx/xinvert.cpp # only source, don't include it as a test
 )
 set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
-  include/clblast.h
-  include/clblast_c.h
   include/clblast_half.h
   src/database/apple_cpu_fallback.hpp
   src/database/database.hpp
@@ -209,13 +237,19 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
   src/utilities/msvc.hpp
   src/utilities/utilities.hpp
   src/cache.hpp
-  src/clpp11.hpp
   src/cxpp11_common.hpp
   src/routine.hpp
 )
-if(NETLIB)
-  set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
-  set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
+if(OPENCL)
+  set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp)
+  set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp)
+  if(NETLIB)
+    set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
+    set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
+  endif()
+elseif(CUDA)
+  set(SOURCES ${SOURCES} src/clblast_cuda.cpp)
+  set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp)
 endif()
 
 foreach(ROUTINE ${LEVEL1_ROUTINES})
   set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
@@ -249,14 +283,14 @@ else(BUILD_SHARED_LIBS)
   add_library(clblast STATIC ${SOURCES} ${HEADERS})
 endif()
-target_link_libraries(clblast ${OPENCL_LIBRARIES})
+target_link_libraries(clblast ${API_LIBRARIES})
 
 # Includes directories: CLBlast and OpenCL
 target_include_directories(clblast PUBLIC
   $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
   $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
   $<INSTALL_INTERFACE:include>
-  ${OPENCL_INCLUDE_DIRS})
+  ${API_INCLUDE_DIRS})
 
 # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
 if(MSVC)
@@ -267,11 +301,15 @@ endif()
 
 # Installs the library
 install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
-install(FILES include/clblast.h DESTINATION include)
-install(FILES include/clblast_c.h DESTINATION include)
 install(FILES include/clblast_half.h DESTINATION include)
-if(NETLIB)
-  install(FILES include/clblast_netlib_c.h DESTINATION include)
+if(OPENCL)
+  install(FILES include/clblast.h DESTINATION include)
+  install(FILES include/clblast_c.h DESTINATION include)
+  if(NETLIB)
+    install(FILES include/clblast_netlib_c.h DESTINATION include)
+  endif()
+elseif(CUDA)
+  install(FILES include/clblast_cuda.h DESTINATION include)
 endif()
 
 # Installs the config for find_package in dependent projects
@@ -291,19 +329,21 @@ endif()
 
 if(SAMPLES)
 
-  # Downloads the cl.hpp file from Khronos
-  file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
+  if(OPENCL)
+    # Downloads the cl.hpp file from Khronos
+    file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
+  endif()
 
   # Adds sample programs (C++)
   foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
     add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
-    target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES})
     install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
   endforeach()
 
   # Adds sample programs (C)
   foreach(SAMPLE ${SAMPLE_PROGRAMS_C})
     add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c)
-    target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES})
     install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin)
   endforeach()
@@ -324,7 +364,7 @@ if(TUNERS)
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
     add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
-    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
@@ -429,7 +469,7 @@ if(CLIENTS)
                    test/routines/levelx/${ROUTINE}.hpp)
   endforeach()
   foreach(ROUTINE ${ROUTINES})
-    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
   endforeach()
@@ -481,7 +521,7 @@ if(TESTS)
                    test/routines/levelx/${ROUTINE}.hpp)
   endforeach()
   foreach(ROUTINE ${ROUTINES})
-    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
     target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
@@ -492,7 +532,7 @@ if(TESTS)
   foreach(MISC_TEST ${MISC_TESTS})
     add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
                    test/correctness/misc/${MISC_TEST}.cpp)
-    target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
     target_include_directories(clblast_test_${MISC_TEST} PUBLIC
                                $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
                                ${clblast_SOURCE_DIR} ${REF_INCLUDES})
@@ -501,7 +541,7 @@ if(TESTS)
 
   # CLBlast diagnostics
   add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp)
-  target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+  target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES})
   target_include_directories(clblast_test_diagnostics PUBLIC
                              $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
                              ${clblast_SOURCE_DIR} ${REF_INCLUDES})

include/clblast_cuda.h (new file, 643 lines)

@@ -0,0 +1,643 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the
// definitions of the returned status codes and the layout and transpose types. This is the header
// users of the CUDA API of CLBlast should include and use.
//
// =================================================================================================
#ifndef CLBLAST_CLBLAST_CUDA_H_
#define CLBLAST_CLBLAST_CUDA_H_
#include <cstdlib> // For size_t
#include <string> // For OverrideParameters function
#include <unordered_map> // For OverrideParameters function
// CUDA
#include <cuda.h> // CUDA driver API
#include <nvrtc.h> // NVIDIA runtime compilation API
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
namespace clblast {
// =================================================================================================
// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes.
enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CUDA_SUCCESS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kInvalidBatchCount = -2049, // The batch count needs to be positive
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types
enum class Layout { kRowMajor = 101, kColMajor = 102 };
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
enum class Triangle { kUpper = 121, kLower = 122 };
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate Givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset,
CUdeviceptr sb_buffer, const size_t sb_offset,
CUdeviceptr sc_buffer, const size_t sc_offset,
CUdeviceptr ss_buffer, const size_t ss_offset,
CUstream* stream);
// Generate modified Givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset,
CUdeviceptr sd2_buffer, const size_t sd2_offset,
CUdeviceptr sx1_buffer, const size_t sx1_offset,
const CUdeviceptr sy1_buffer, const size_t sy1_offset,
CUdeviceptr sparam_buffer, const size_t sparam_offset,
CUstream* stream);
// Apply Givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
const T cos,
const T sin,
CUstream* stream);
// Apply modified Givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr sparam_buffer, const size_t sparam_offset,
CUstream* stream);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
StatusCode Dotu(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
StatusCode Dotc(const size_t n,
CUdeviceptr dot_buffer, const size_t dot_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
CUdeviceptr asum_buffer, const size_t asum_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
CUdeviceptr sum_buffer, const size_t sum_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
CUdeviceptr imax_buffer, const size_t imax_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
template <typename T>
StatusCode Amin(const size_t n,
CUdeviceptr imin_buffer, const size_t imin_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
CUdeviceptr imax_buffer, const size_t imax_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
CUdeviceptr imin_buffer, const size_t imin_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
StatusCode Hemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
StatusCode Hbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
StatusCode Hpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr ap_buffer, const size_t ap_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr ap_buffer, const size_t ap_offset,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUstream* stream);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
template <typename T>
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
template <typename T>
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr ap_buffer, const size_t ap_offset,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const CUdeviceptr ap_buffer, const size_t ap_offset,
CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUstream* stream);
// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
StatusCode Geru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
StatusCode Gerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUstream* stream);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
CUdeviceptr ap_buffer, const size_t ap_offset,
CUstream* stream);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
CUstream* stream);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
CUstream* stream);
// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
template <typename T>
StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const CUdeviceptr im_buffer, const size_t im_offset,
CUdeviceptr col_buffer, const size_t col_offset,
CUstream* stream);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
CUstream* stream);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
CUstream* stream);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode PUBLIC_API ClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode PUBLIC_API FillCache(const CUdevice device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_CLBLAST_CUDA_H_
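To illustrate how this header is meant to be used, below is a minimal SAXPY sketch against the declarations above. Everything except the clblast::Axpy call is plain CUDA driver API setup; since this commit is explicitly untested, treat it as an illustration rather than verified usage (error checking omitted for brevity):

#include <cuda.h>
#include <vector>
#include "clblast_cuda.h"

int main() {
  cuInit(0);
  CUdevice device; cuDeviceGet(&device, 0);
  CUcontext context; cuCtxCreate(&context, 0, device);  // also makes the context current
  const auto n = size_t{1024};
  auto host_x = std::vector<float>(n, 1.0f);
  auto host_y = std::vector<float>(n, 2.0f);
  CUdeviceptr x, y;
  cuMemAlloc(&x, n * sizeof(float));
  cuMemAlloc(&y, n * sizeof(float));
  cuMemcpyHtoD(x, host_x.data(), n * sizeof(float));
  cuMemcpyHtoD(y, host_y.data(), n * sizeof(float));
  CUstream stream; cuStreamCreate(&stream, CU_STREAM_DEFAULT);
  const auto status = clblast::Axpy<float>(n, 2.0f,  // computes y := 2 * x + y
                                           x, 0, 1,
                                           y, 0, 1,
                                           &stream);
  cuStreamSynchronize(stream);
  cuMemcpyDtoH(host_y.data(), y, n * sizeof(float));
  cuMemFree(x); cuMemFree(y); cuCtxDestroy(context);
  return (status == clblast::StatusCode::kSuccess) ? 0 : 1;
}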

scripts/generator/generator.py

@@ -12,6 +12,8 @@
 #  clblast.cpp
 #  clblast_c.h
 #  clblast_c.cpp
+#  clblast_cuda.h
+#  clblast_cuda.cpp
 #  clblast_netlib_c.h
 #  clblast_netlib_c.cpp
 #  wrapper_clblas.h
@@ -41,9 +43,11 @@ FILES = [
     "/test/wrapper_cublas.hpp",
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
+    "/include/clblast_cuda.h",
+    "/src/clblast_cuda.cpp",
 ]
-HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32]
-FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2]
+HEADER_LINES = [122, 21, 126, 24, 29, 41, 29, 65, 32, 94, 21]
+FOOTER_LINES = [25, 3, 27, 38, 6, 6, 6, 9, 2, 25, 3]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 63
@@ -224,6 +228,10 @@ def main(argv):
             if i == 8:
                 if not routine.batched:
                     body += cpp.clblast_netlib_c_cc(routine)
+            if i == 9:
+                body += cpp.clblast_h(routine, cuda=True)
+            if i == 10:
+                body += cpp.clblast_cc(routine, cuda=True)
         f.write("".join(file_header))
         f.write(body)
         f.write("".join(file_footer))

scripts/generator/generator/cpp.py

@@ -36,19 +36,19 @@ HEADER = NL + SEPARATOR + """
 """ + SEPARATOR + NL
 
 
-def clblast_h(routine):
+def clblast_h(routine, cuda=False):
     """The C++ API header (.h)"""
     result = NL + "// " + routine.description + ": " + routine.short_names() + NL
-    result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL
+    result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL
     return result
 
 
-def clblast_cc(routine):
+def clblast_cc(routine, cuda=False):
     """The C++ API implementation (.cpp)"""
     indent1 = " " * (15 + routine.length())
     result = NL + "// " + routine.description + ": " + routine.short_names() + NL
     if routine.implemented:
-        result += routine.routine_header_cpp(12, "") + " {" + NL
+        result += routine.routine_header_cpp(12, "", cuda) + " {" + NL
         result += "  try {" + NL
         result += "    auto queue_cpp = Queue(*queue);" + NL
         result += "    auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL
@@ -60,14 +60,22 @@ def clblast_cc(routine):
         result += "    return StatusCode::kSuccess;" + NL
         result += "  } catch (...) { return DispatchException(); }" + NL
     else:
-        result += routine.routine_header_type_cpp(12) + " {" + NL
+        result += routine.routine_header_type_cpp(12, cuda) + " {" + NL
         result += "  return StatusCode::kNotImplemented;" + NL
         result += "}" + NL
     for flavour in routine.flavours:
         indent2 = " " * (34 + routine.length() + len(flavour.template))
         result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
-        result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
-        result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
+        arguments = routine.arguments_type(flavour)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
+        result += ("," + NL + indent2).join([a for a in arguments])
+        result += "," + NL + indent2
+        if cuda:
+            result += "CUstream*"
+        else:
+            result += "cl_command_queue*, cl_event*"
+        result += ");" + NL
     return result
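To make the effect of the cuda=True path concrete: for AXPY, the generator emits the following two declarations. The OpenCL form below is the pre-existing clblast.h declaration; the CUDA form follows from the cl_mem-to-CUdeviceptr substitution and the CUstream* tail shown above, and matches the clblast_cuda.h header earlier in this commit (a reconstruction for illustration, not captured generator output):

// OpenCL API (clblast.h):
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event = nullptr);

// CUDA API (clblast_cuda.h):
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUstream* stream);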

scripts/generator/generator/routine.py

@@ -802,22 +802,38 @@ class Routine:
         """Retrieves a list of routine requirements for documentation"""
         return self.requirements
 
-    def routine_header_cpp(self, spaces, default_event):
+    def routine_header_cpp(self, spaces, default_event, cuda=False):
         """Retrieves the C++ templated definition for a routine"""
         indent = " " * (spaces + self.length())
+        arguments = self.arguments_def(self.template)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
         result = "template <" + self.template.name + ">\n"
         result += "StatusCode " + self.capitalized_name() + "("
-        result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
-        result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
+        result += (",\n" + indent).join([a for a in arguments])
+        result += ",\n" + indent
+        if cuda:
+            result += "CUstream* stream"
+        else:
+            result += "cl_command_queue* queue, cl_event* event" + default_event
+        result += ")"
         return result
 
-    def routine_header_type_cpp(self, spaces):
+    def routine_header_type_cpp(self, spaces, cuda=False):
         """As above, but now without variable names"""
         indent = " " * (spaces + self.length())
+        arguments = self.arguments_type(self.template)
+        if cuda:
+            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
        result = "template <" + self.template.name + ">\n"
         result += "StatusCode " + self.capitalized_name() + "("
-        result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
-        result += ",\n" + indent + "cl_command_queue*, cl_event*)"
+        result += (",\n" + indent).join([a for a in arguments])
+        result += ",\n" + indent
+        if cuda:
+            result += "CUstream* stream"
+        else:
+            result += "cl_command_queue*, cl_event*"
+        result += ")"
         return result
 
     def routine_header_c(self, flavour, spaces, extra_qualifier):

src/clblast.cpp

@@ -12,9 +12,9 @@
 #include <string>
 
+#include "utilities/utilities.hpp"
 #include "cache.hpp"
 #include "routines/routines.hpp"
-#include "clblast.h"
 
 namespace clblast {
 
 // =================================================================================================

src/clblast_cuda.cpp (new file, 2336 lines)

File diff suppressed because it is too large.

src/cupp11.hpp (new file, 770 lines)

@@ -0,0 +1,770 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
// Portability here means that a similar header exists for OpenCL with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
// Copyright 2015 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================
#ifndef CLBLAST_CUPP11_H_
#define CLBLAST_CUPP11_H_
// C++
#include <algorithm> // std::copy
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
// CUDA
#include <cuda.h> // CUDA driver API
#include <nvrtc.h> // NVIDIA runtime compilation API
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Max-length of strings
constexpr auto kStringLength = 256;
// =================================================================================================
// Represents a runtime error returned by a CUDA driver API function
class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
public:
explicit CLCudaAPIError(CUresult status, const std::string &where):
ErrorCode(status, where, "CUDA error: " + where + ": " +
GetErrorName(status) + " --> " + GetErrorString(status)) {
}
static void Check(const CUresult status, const std::string &where) {
if (status != CUDA_SUCCESS) {
throw CLCudaAPIError(status, where);
}
}
static void CheckDtor(const CUresult status, const std::string &where) {
if (status != CUDA_SUCCESS) {
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
}
}
private:
std::string GetErrorName(CUresult status) const {
const char* status_code;
cuGetErrorName(status, &status_code);
return std::string(status_code);
}
std::string GetErrorString(CUresult status) const {
const char* status_string;
cuGetErrorString(status, &status_string);
return std::string(status_string);
}
};
// Represents a runtime error returned by a CUDA runtime compilation API function
class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
public:
explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
}
static void Check(const nvrtcResult status, const std::string &where) {
if (status != NVRTC_SUCCESS) {
throw CLCudaAPINVRTCError(status, where);
}
}
static void CheckDtor(const nvrtcResult status, const std::string &where) {
if (status != NVRTC_SUCCESS) {
fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
}
}
private:
std::string GetErrorString(nvrtcResult status) const {
const char* status_string = nvrtcGetErrorString(status);
return std::string(status_string);
}
};
// Exception returned when building a program
using CLCudaAPIBuildError = CLCudaAPINVRTCError;
// =================================================================================================
// Error occurred in CUDA driver or runtime compilation API
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))
// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
// =================================================================================================
// C++11 version of two 'CUevent' pointers
class Event {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Regular constructor with memory management
explicit Event():
start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }),
end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) {
CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
}
// Waits for completion of this event (not implemented for CUDA)
void WaitForCompletion() const { }
// Retrieves the elapsed time of the last recorded event
float GetElapsedTime() const {
auto result = 0.0f;
cuEventElapsedTime(&result, *start_, *end_);
return result;
}
// Accessors to the private data-members
const CUevent& start() const { return *start_; }
const CUevent& end() const { return *end_; }
Event* pointer() { return this; }
private:
std::shared_ptr<CUevent> start_;
std::shared_ptr<CUevent> end_;
};
// Pointer to a CUDA event
using EventPointer = Event*;
// =================================================================================================
// Raw platform ID type
using RawPlatformID = size_t;
// The CUDA platform: initializes the CUDA driver API
class Platform {
public:
// Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
explicit Platform(const size_t platform_id) : platform_id_(0) {
if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
CheckError(cuInit(0));
}
// Methods to retrieve platform information
std::string Name() const { return "CUDA"; }
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Version() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = 0;
CheckError(cuDeviceGetCount(&result));
return static_cast<size_t>(result);
}
// Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
const RawPlatformID& operator()() const { return platform_id_; }
private:
const size_t platform_id_;
};
// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
inline std::vector<Platform> GetAllPlatforms() {
auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
return all_platforms;
}
// =================================================================================================
// Raw device ID type
using RawDeviceID = CUdevice;
// C++11 version of 'CUdevice'
class Device {
public:
// Constructor based on the regular CUDA data-type
explicit Device(const CUdevice device): device_(device) { }
// Initialization
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
CheckError(cuDeviceGet(&device_, device_id));
}
// Methods to retrieve device information
RawPlatformID PlatformID() const { return 0; }
std::string Version() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
size_t VersionNumber() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return static_cast<size_t>(result);
}
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Name() const {
auto result = std::string{};
result.resize(kStringLength);
CheckError(cuDeviceGetName(&result[0], static_cast<int>(result.size()), device_));
result.resize(result.find('\0')); // trims the '\0' padding from the fixed-size buffer
return result;
}
std::string Type() const { return "GPU"; }
size_t MaxWorkGroupSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); }
size_t MaxWorkItemDimensions() const { return size_t{3}; }
std::vector<size_t> MaxWorkItemSizes() const {
return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
}
unsigned long LocalMemSize() const {
return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
}
std::string Capabilities() const {
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
return "SM"+std::to_string(major)+"."+std::to_string(minor);
}
bool HasExtension(const std::string &) const { return false; } // no OpenCL-style extensions in CUDA
bool SupportsFP64() const { return true; }
bool SupportsFP16() const {
const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
if (major > 5) { return true; } // SM 6.x, 7.x and higher
if (major == 5 && minor == 3) { return true; } // SM 5.3
return false;
}
size_t CoreClock() const { return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE)); } // kHz to MHz
size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
unsigned long MemorySize() const {
auto result = size_t{0};
CheckError(cuDeviceTotalMem(&result, device_));
return static_cast<unsigned long>(result);
}
unsigned long MaxAllocSize() const { return MemorySize(); }
size_t MemoryClock() const { return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE)); } // kHz to MHz
size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
// Configuration-validity checks
bool IsLocalMemoryValid(const size_t local_mem_usage) const {
return (local_mem_usage <= LocalMemSize());
}
bool IsThreadConfigValid(const std::vector<size_t> &local) const {
auto local_size = size_t{1};
for (const auto &item: local) { local_size *= item; }
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > MaxWorkItemSizes()[i]) { return false; }
}
if (local_size > MaxWorkGroupSize()) { return false; }
if (local.size() > MaxWorkItemDimensions()) { return false; }
return true;
}
// Query for a specific type of device or brand
bool IsCPU() const { return false; }
bool IsGPU() const { return true; }
bool IsAMD() const { return false; }
bool IsNVIDIA() const { return true; }
bool IsIntel() const { return false; }
bool IsARM() const { return false; }
// Platform specific extensions
std::string AMDBoardName() const { return ""; }
std::string NVIDIAComputeCapability() const { return Capabilities(); }
// Accessor to the private data-member
const RawDeviceID& operator()() const { return device_; }
private:
CUdevice device_;
// Private helper function
size_t GetInfo(const CUdevice_attribute info) const {
auto result = 0;
CheckError(cuDeviceGetAttribute(&result, info, device_));
return static_cast<size_t>(result);
}
};
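// Example (illustrative sketch): querying properties of the first device. These accessors mirror
// their OpenCL counterparts but are backed by 'cuDeviceGetAttribute' under the hood.
//
//   const auto platform = Platform(size_t{0});
//   const auto device = Device(platform, size_t{0});
//   printf("%s (%s): %zu SMs, max %zu threads/block\n",
//          device.Name().c_str(), device.Capabilities().c_str(),
//          device.ComputeUnits(), device.MaxWorkGroupSize());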
// =================================================================================================
// Raw context type
using RawContext = CUcontext;
// C++11 version of 'CUcontext'
class Context {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Context(const CUcontext context):
context_(new CUcontext) {
*context_ = context;
}
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new CUcontext, [](CUcontext* c) {
if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
delete c;
}) {
CheckError(cuCtxCreate(context_.get(), 0, device()));
}
// Accessor to the private data-member
const RawContext& operator()() const { return *context_; }
RawContext* pointer() const { return &(*context_); }
private:
std::shared_ptr<CUcontext> context_;
};
// Pointer to a raw CUDA context
using ContextPointer = CUcontext*;
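// Example (illustrative sketch): a context is created from a device and is then passed to the
// other classes in this header; copies share ownership of the underlying 'CUcontext'.
//
//   const auto context = Context(device);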
// =================================================================================================
// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code.
class Program {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Source-based constructor with memory management
explicit Program(const Context &, std::string source):
program_(new nvrtcProgram, [](nvrtcProgram* p) {
if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
delete p;
}),
source_(std::move(source)),
from_binary_(false) {
CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_.c_str(), nullptr, 0, nullptr, nullptr));
}
// PTX-based constructor
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(nullptr), // not used
source_(binary),
from_binary_(true) {
}
// Compiles the device program and checks whether or not there are any warnings/errors
void Build(const Device &, std::vector<std::string> &options) {
if (from_binary_) { return; }
auto raw_options = std::vector<const char*>();
for (const auto &option: options) {
raw_options.push_back(option.c_str());
}
auto status = nvrtcCompileProgram(*program_, static_cast<int>(raw_options.size()), raw_options.data());
CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
}
// Confirms whether a certain status code is an actual compilation error or warning
bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
return (status == NVRTC_ERROR_COMPILATION || status == NVRTC_ERROR_INVALID_INPUT);
}
// Retrieves the warning/error message from the compiler (if any)
std::string GetBuildInfo(const Device &) const {
if (from_binary_) { return std::string{}; }
auto bytes = size_t{0};
CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
return result;
}
// Retrieves an intermediate representation of the compiled program (i.e. PTX)
std::string GetIR() const {
if (from_binary_) { return source_; } // holds the PTX
auto bytes = size_t{0};
CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
return result;
}
// Accessor to the private data-member
const nvrtcProgram& operator()() const { return *program_; }
private:
std::shared_ptr<nvrtcProgram> program_;
const std::string source_;
const bool from_binary_;
};
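// Example (illustrative sketch): run-time compilation of a toy kernel with NVRTC. The kernel
// source here is hypothetical; note the 'extern "C"' to prevent C++ name mangling. Compilation
// failures throw, after which the log can be inspected through 'GetBuildInfo'.
//
//   const auto source = std::string{"extern \"C\" __global__ void scale(float* x) {"
//                                   "  x[threadIdx.x] *= 2.0f; }"};
//   auto program = Program(context, source);
//   auto options = std::vector<std::string>{};
//   program.Build(device, options); // may throw; see program.GetBuildInfo(device) on failure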
// =================================================================================================
// Raw command-queue type
using RawCommandQueue = CUstream;
// C++11 version of 'CUstream'
class Queue {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new CUstream, [](CUstream* s) {
if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
delete s;
}),
context_(context),
device_(device) {
CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
}
// Synchronizes the queue and optionally also an event
void Finish(Event &event) const {
CheckError(cuEventSynchronize(event.end()));
Finish();
}
void Finish() const {
CheckError(cuStreamSynchronize(*queue_));
}
// Retrieves the corresponding context or device
Context GetContext() const { return context_; }
Device GetDevice() const { return device_; }
// Accessor to the private data-member
const RawCommandQueue& operator()() const { return *queue_; }
private:
std::shared_ptr<CUstream> queue_;
const Context context_;
const Device device_;
};
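// Example (illustrative sketch): a queue wraps a non-blocking CUDA stream tied to a context and
// a device; 'Finish' maps onto 'cuStreamSynchronize'.
//
//   auto queue = Queue(context, device);
//   // ... enqueue reads/writes/kernels on 'queue' ...
//   queue.Finish(); // blocks until all enqueued work has completed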
// =================================================================================================
// C++11 version of page-locked host memory
template <typename T>
class BufferHost {
public:
// Regular constructor with memory management
explicit BufferHost(const Context &, const size_t size):
buffer_(new void*, [](void** m) { CheckErrorDtor(cuMemFreeHost(*m)); delete m; }),
size_(size) {
CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
}
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
return size_*sizeof(T);
}
// Compatibility with std::vector
size_t size() const { return size_; }
T* begin() { return &static_cast<T*>(*buffer_)[0]; }
T* end() { return &static_cast<T*>(*buffer_)[size_]; } // past-the-end, as for std::vector
T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; }
T* data() { return static_cast<T*>(*buffer_); }
const T* data() const { return static_cast<T*>(*buffer_); }
private:
std::shared_ptr<void*> buffer_;
const size_t size_;
};
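// Example (illustrative sketch): page-locked host memory typically transfers faster to and from
// the device than pageable memory such as regular std::vector storage.
//
//   auto host_mem = BufferHost<float>(context, size_t{1024});
//   for (auto &item: host_mem) { item = 1.0f; } // begin()/end() make range-for work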
// =================================================================================================
// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
// C++11 version of 'CUdeviceptr'
template <typename T>
class Buffer {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Buffer(const CUdeviceptr buffer):
buffer_(new CUdeviceptr),
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &, const BufferAccess access, const size_t size):
buffer_(new CUdeviceptr, [access](CUdeviceptr* m) {
if (access != BufferAccess::kNotOwned) { CheckErrorDtor(cuMemFree(*m)); }
delete m;
}),
access_(access) {
CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
}
// As above, but now with read/write access as a default
explicit Buffer(const Context &context, const size_t size):
Buffer<T>(context, BufferAccess::kReadWrite, size) {
}
// Constructs a new buffer based on an existing host-container
template <typename Iterator>
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
auto size = static_cast<size_t>(end - start);
auto pointer = &*start;
CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
queue.Finish();
}
// Copies from device to host: reading the device buffer asynchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: source device buffer is too small");
}
CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer asynchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
}
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
WriteAsync(queue, size, host, offset);
queue.Finish();
}
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
// Copies the contents of this buffer into another device buffer
void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
}
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
queue.Finish();
}
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
auto result = size_t{0};
CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
return result;
}
// Accessors to the private data-members
CUdeviceptr operator()() const { return *buffer_; }
CUdeviceptr& operator()() { return *buffer_; }
private:
std::shared_ptr<CUdeviceptr> buffer_;
const BufferAccess access_;
};
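// Example (illustrative sketch): a blocking round-trip through device memory. The *Async
// variants return before the copy completes and require an explicit queue.Finish() afterwards.
//
//   auto host = std::vector<float>(1024, 1.0f);
//   auto device_buffer = Buffer<float>(context, host.size()); // read-write by default
//   device_buffer.Write(queue, host.size(), host);
//   device_buffer.Read(queue, host.size(), host);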
// =================================================================================================
// C++11 version of 'CUfunction'
class Kernel {
public:
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Kernel(const CUmodule module, const CUfunction kernel):
module_(module),
kernel_(kernel) {
}
// Regular constructor: loads the program's PTX into a module and retrieves the kernel from it.
// Note: the CUDA module is currently not unloaded automatically.
explicit Kernel(const Program &program, const std::string &name) {
CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr));
CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str()));
}
}
// Sets a kernel argument at the indicated position. This stores both the value of the argument
// (as raw bytes) and the index indicating where this value can be found.
template <typename T>
void SetArgument(const size_t index, const T &value) {
if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); }
arguments_indices_[index] = arguments_data_.size();
for (auto j=size_t(0); j<sizeof(T); ++j) {
arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]);
}
}
template <typename T>
void SetArgument(const size_t index, Buffer<T> &value) {
SetArgument(index, value());
}
// Sets all arguments in one go using parameter packs. Note that this resets all previously set
// arguments using 'SetArgument' or 'SetArguments'.
template <typename... Args>
void SetArguments(Args&... args) {
arguments_indices_.clear();
arguments_data_.clear();
SetArgumentsRecursive(0, args...);
}
// Retrieves the amount of local memory used per work-group for this kernel. Note that this is
// the shared memory in CUDA terminology.
unsigned long LocalMemUsage(const Device &) const {
auto result = 0;
CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
std::string GetFunctionName() const {
return std::string{"unknown"}; // Not implemented for the CUDA backend
}
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event) {
// Creates the grid (number of threadblocks) and sets the block sizes (threads per block).
// Note: this assumes that each global size is a multiple of the corresponding local size.
auto grid = std::vector<size_t>{1, 1, 1};
auto block = std::vector<size_t>{1, 1, 1};
if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; }
for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; }
// Creates the array of pointers from the arrays of indices & data
std::vector<void*> pointers;
for (auto &index: arguments_indices_) {
pointers.push_back(&arguments_data_[index]);
}
// Launches the kernel, its execution time is recorded by events
CheckError(cuEventRecord(event->start(), queue()));
CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
0, queue(), pointers.data(), nullptr));
CheckError(cuEventRecord(event->end(), queue()));
}
// As above, but with an event waiting list. Waiting lists and default workgroup sizes are not
// supported by the CUDA back-end, so those cases throw; otherwise this forwards to 'Launch'.
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (local.size() == 0) {
throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
}
else if (waitForEvents.size() != 0) {
throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end");
}
else {
return Launch(queue, global, local, event);
}
}
// Accessors to the private data-members
const CUfunction& operator()() const { return kernel_; }
CUfunction operator()() { return kernel_; }
private:
CUmodule module_;
CUfunction kernel_;
std::vector<size_t> arguments_indices_; // Indices of the arguments
std::vector<char> arguments_data_; // The arguments data as raw bytes
// Internal implementation for the recursive SetArguments function.
template <typename T>
void SetArgumentsRecursive(const size_t index, T &first) {
SetArgument(index, first);
}
template <typename T, typename... Args>
void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
SetArgument(index, first);
SetArgumentsRecursive(index+1, args...);
}
};
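// Example (illustrative sketch): putting it all together for the hypothetical 'scale' kernel
// compiled in the Program example above: 1024 threads in work-groups (thread blocks) of 64.
//
//   auto kernel = Kernel(program, "scale");
//   kernel.SetArguments(device_buffer);
//   auto event = Event();
//   kernel.Launch(queue, {1024}, {64}, event.pointer());
//   queue.Finish(event);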
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_CUPP11_H_

View File

@@ -15,7 +15,7 @@
 #ifndef CLBLAST_BUFFER_TEST_H_
 #define CLBLAST_BUFFER_TEST_H_
-#include "clblast.h"
+#include "utilities/utilities.hpp"
 namespace clblast {
 // =================================================================================================

View File

@@ -21,8 +21,13 @@
 #include <complex>
 #include <random>
-#include "clpp11.hpp"
-#include "clblast.h"
+#ifdef OPENCL_API
+  #include "clpp11.hpp"
+  #include "clblast.h"
+#elif CUDA_API
+  #include "cupp11.hpp"
+  #include "clblast_cuda.h"
+#endif
 #include "clblast_half.h"
 #include "utilities/clblast_exceptions.hpp"
 #include "utilities/msvc.hpp"