mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-21 04:22:27 +02:00
3d3492646c
The CLBlastConfig.cmake file was installed to a directory named CLBLast (notice second capital l), which can cause issues for CMake's search path when looking for CLBlast on the system. This commit also fixes other occurrences of the wrong capitalization, all of it purely cosmetic (i.e. in comments).
793 lines
42 KiB
C++
793 lines
42 KiB
C++
|
|
// =================================================================================================
|
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
// width of 100 characters per line.
|
|
//
|
|
// Author(s):
|
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
//
|
|
// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
|
|
// of the returned status codes and the layout and transpose types. This is the only header users
|
|
// of CLBlast should include and use.
|
|
//
|
|
// =================================================================================================
|
|
|
|
#ifndef CLBLAST_CLBLAST_H_
|
|
#define CLBLAST_CLBLAST_H_
|
|
|
|
#include <cstdlib> // For size_t
|
|
#include <string> // For OverrideParameters function
|
|
#include <unordered_map> // For OverrideParameters function
|
|
|
|
// Includes the normal OpenCL C header
|
|
#if defined(__APPLE__) || defined(__MACOSX)
|
|
#include <OpenCL/opencl.h>
|
|
#else
|
|
#include <CL/opencl.h>
|
|
#endif
|
|
|
|
// Exports library functions under Windows when building a DLL. See also:
|
|
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
|
|
#if defined(_WIN32) && defined(CLBLAST_DLL)
|
|
#if defined(COMPILING_DLL)
|
|
#define PUBLIC_API __declspec(dllexport)
|
|
#else
|
|
#define PUBLIC_API __declspec(dllimport)
|
|
#endif
|
|
#else
|
|
#define PUBLIC_API
|
|
#endif
|
|
|
|
// Version numbering (v1.5.1)
|
|
#define CLBLAST_VERSION_MAJOR 1
|
|
#define CLBLAST_VERSION_MINOR 5
|
|
#define CLBLAST_VERSION_PATCH 1
|
|
|
|
namespace clblast {
|
|
// =================================================================================================
|
|
|
|
// Status codes. These codes can be returned by functions declared in this header file. The error
|
|
// codes match either the standard OpenCL error codes or the clBLAS error codes.
|
|
enum class StatusCode {
|
|
|
|
// Status codes in common with the OpenCL standard
|
|
kSuccess = 0, // CL_SUCCESS
|
|
kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
|
|
kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
|
|
kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
|
|
kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
|
|
kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
|
|
kInvalidValue = -30, // CL_INVALID_VALUE
|
|
kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
|
|
kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
|
|
kInvalidBinary = -42, // CL_INVALID_BINARY
|
|
kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
|
|
kInvalidProgram = -44, // CL_INVALID_PROGRAM
|
|
kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
|
|
kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
|
|
kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
|
|
kInvalidKernel = -48, // CL_INVALID_KERNEL
|
|
kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
|
|
kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
|
|
kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
|
|
kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
|
|
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
|
|
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
|
|
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
|
|
kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
|
|
kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
|
|
kInvalidEvent = -58, // CL_INVALID_EVENT
|
|
kInvalidOperation = -59, // CL_INVALID_OPERATION
|
|
kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
|
|
kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
|
|
|
|
// Status codes in common with the clBLAS library
|
|
kNotImplemented = -1024, // Routine or functionality not implemented yet
|
|
kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
|
|
kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
|
|
kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
|
|
kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
|
|
kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
|
|
kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
|
|
kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
|
|
kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
|
|
kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
|
|
kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
|
|
kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
|
|
kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
|
|
kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
|
|
kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
|
|
kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
|
|
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
|
|
|
|
// Custom additional status codes for CLBlast
|
|
kInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small
|
|
kInvalidBatchCount = -2049, // The batch count needs to be positive
|
|
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
|
|
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
|
|
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
|
|
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
|
|
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
|
|
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
|
|
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
|
|
kDatabaseError = -2041, // Entry for the device was not found in the database
|
|
kUnknownError = -2040, // A catch-all error code representing an unspecified error
|
|
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
|
|
};
|
|
|
|
// Matrix layout and transpose types
|
|
enum class Layout { kRowMajor = 101, kColMajor = 102 };
|
|
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
|
|
enum class Triangle { kUpper = 121, kLower = 122 };
|
|
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
|
|
enum class Side { kLeft = 141, kRight = 142 };
|
|
enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 };
|
|
|
|
// Precision scoped enum (values in bits)
|
|
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
|
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
|
|
|
|
// =================================================================================================
|
|
// BLAS level-1 (vector-vector) routines
|
|
// =================================================================================================
|
|
|
|
// Generate givens plane rotation: SROTG/DROTG
|
|
template <typename T>
|
|
StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset,
|
|
cl_mem sb_buffer, const size_t sb_offset,
|
|
cl_mem sc_buffer, const size_t sc_offset,
|
|
cl_mem ss_buffer, const size_t ss_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Generate modified givens plane rotation: SROTMG/DROTMG
|
|
template <typename T>
|
|
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Apply givens plane rotation: SROT/DROT
|
|
template <typename T>
|
|
StatusCode Rot(const size_t n,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
const T cos,
|
|
const T sin,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Apply modified givens plane rotation: SROTM/DROTM
|
|
template <typename T>
|
|
StatusCode Rotm(const size_t n,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
|
|
template <typename T>
|
|
StatusCode Swap(const size_t n,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
|
|
template <typename T>
|
|
StatusCode Scal(const size_t n,
|
|
const T alpha,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
|
|
template <typename T>
|
|
StatusCode Copy(const size_t n,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
|
|
template <typename T>
|
|
StatusCode Axpy(const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Dot product of two vectors: SDOT/DDOT/HDOT
|
|
template <typename T>
|
|
StatusCode Dot(const size_t n,
|
|
cl_mem dot_buffer, const size_t dot_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Dot product of two complex vectors: CDOTU/ZDOTU
|
|
template <typename T>
|
|
StatusCode Dotu(const size_t n,
|
|
cl_mem dot_buffer, const size_t dot_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
|
|
template <typename T>
|
|
StatusCode Dotc(const size_t n,
|
|
cl_mem dot_buffer, const size_t dot_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
|
|
template <typename T>
|
|
StatusCode Nrm2(const size_t n,
|
|
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
|
|
template <typename T>
|
|
StatusCode Asum(const size_t n,
|
|
cl_mem asum_buffer, const size_t asum_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
|
|
template <typename T>
|
|
StatusCode Sum(const size_t n,
|
|
cl_mem sum_buffer, const size_t sum_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
|
|
template <typename T>
|
|
StatusCode Amax(const size_t n,
|
|
cl_mem imax_buffer, const size_t imax_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
|
|
template <typename T>
|
|
StatusCode Amin(const size_t n,
|
|
cl_mem imin_buffer, const size_t imin_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
|
|
template <typename T>
|
|
StatusCode Max(const size_t n,
|
|
cl_mem imax_buffer, const size_t imax_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
|
|
template <typename T>
|
|
StatusCode Min(const size_t n,
|
|
cl_mem imin_buffer, const size_t imin_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// =================================================================================================
|
|
// BLAS level-2 (matrix-vector) routines
|
|
// =================================================================================================
|
|
|
|
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
|
|
template <typename T>
|
|
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
|
|
template <typename T>
|
|
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
|
|
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
|
template <typename T>
|
|
StatusCode Hemv(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
|
|
template <typename T>
|
|
StatusCode Hbmv(const Layout layout, const Triangle triangle,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
|
|
template <typename T>
|
|
StatusCode Hpmv(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem ap_buffer, const size_t ap_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
|
|
template <typename T>
|
|
StatusCode Symv(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
|
|
template <typename T>
|
|
StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
|
|
template <typename T>
|
|
StatusCode Spmv(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem ap_buffer, const size_t ap_offset,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const T beta,
|
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
|
|
template <typename T>
|
|
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
|
|
template <typename T>
|
|
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n, const size_t k,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
|
|
template <typename T>
|
|
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n,
|
|
const cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
|
template <typename T>
|
|
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
|
|
template <typename T>
|
|
StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n, const size_t k,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
|
|
template <typename T>
|
|
StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t n,
|
|
const cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// General rank-1 matrix update: SGER/DGER/HGER
|
|
template <typename T>
|
|
StatusCode Ger(const Layout layout,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// General rank-1 complex matrix update: CGERU/ZGERU
|
|
template <typename T>
|
|
StatusCode Geru(const Layout layout,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
|
template <typename T>
|
|
StatusCode Gerc(const Layout layout,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian rank-1 matrix update: CHER/ZHER
|
|
template <typename T>
|
|
StatusCode Her(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
|
template <typename T>
|
|
StatusCode Hpr(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
|
template <typename T>
|
|
StatusCode Her2(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
|
template <typename T>
|
|
StatusCode Hpr2(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
|
|
template <typename T>
|
|
StatusCode Syr(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
|
|
template <typename T>
|
|
StatusCode Spr(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
|
|
template <typename T>
|
|
StatusCode Syr2(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
|
|
template <typename T>
|
|
StatusCode Spr2(const Layout layout, const Triangle triangle,
|
|
const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
cl_mem ap_buffer, const size_t ap_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// =================================================================================================
|
|
// BLAS level-3 (matrix-matrix) routines
|
|
// =================================================================================================
|
|
|
|
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
|
|
template <typename T>
|
|
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
|
const size_t m, const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr,
|
|
cl_mem temp_buffer = nullptr);
|
|
|
|
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
|
|
template <typename T>
|
|
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
|
template <typename T>
|
|
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
|
|
template <typename T>
|
|
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
|
template <typename T>
|
|
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
|
|
template <typename T>
|
|
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
|
template <typename T, typename U>
|
|
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
|
const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
const U beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
|
|
template <typename T>
|
|
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
|
template <typename T>
|
|
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// =================================================================================================
|
|
// Extra non-BLAS routines (level-X)
|
|
// =================================================================================================
|
|
|
|
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
|
|
template <typename T>
|
|
StatusCode Had(const size_t n,
|
|
const T alpha,
|
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
|
const T beta,
|
|
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
|
|
template <typename T>
|
|
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
|
|
const size_t m, const size_t n,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
|
|
template <typename T>
|
|
StatusCode Im2col(const KernelMode kernel_mode,
|
|
const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
|
|
const cl_mem im_buffer, const size_t im_offset,
|
|
cl_mem col_buffer, const size_t col_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
|
|
template <typename T>
|
|
StatusCode Col2im(const KernelMode kernel_mode,
|
|
const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
|
|
const cl_mem col_buffer, const size_t col_offset,
|
|
cl_mem im_buffer, const size_t im_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
|
|
template <typename T>
|
|
StatusCode Convgemm(const KernelMode kernel_mode,
|
|
const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
|
|
const cl_mem im_buffer, const size_t im_offset,
|
|
const cl_mem kernel_buffer, const size_t kernel_offset,
|
|
cl_mem result_buffer, const size_t result_offset,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
|
|
template <typename T>
|
|
StatusCode AxpyBatched(const size_t n,
|
|
const T *alphas,
|
|
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
|
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
|
const size_t batch_count,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
|
|
template <typename T>
|
|
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
|
const size_t m, const size_t n, const size_t k,
|
|
const T *alphas,
|
|
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
|
|
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
|
|
const T *betas,
|
|
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
|
|
const size_t batch_count,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
|
|
template <typename T>
|
|
StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
|
const size_t m, const size_t n, const size_t k,
|
|
const T alpha,
|
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
|
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
|
|
const T beta,
|
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
|
|
const size_t batch_count,
|
|
cl_command_queue* queue, cl_event* event = nullptr);
|
|
|
|
// =================================================================================================
|
|
|
|
// Retrieves the required size of the temporary buffer for the GEMM kernel (optional)
|
|
template <typename T>
|
|
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
|
const size_t m, const size_t n, const size_t k,
|
|
const size_t a_offset, const size_t a_ld,
|
|
const size_t b_offset, const size_t b_ld,
|
|
const size_t c_offset, const size_t c_ld,
|
|
cl_command_queue* queue, size_t& temp_buffer_size);
|
|
|
|
// =================================================================================================
|
|
|
|
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
|
|
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
|
|
StatusCode PUBLIC_API ClearCache();
|
|
|
|
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
|
|
// Further CLBlast routine calls will then run at maximum speed.
|
|
StatusCode PUBLIC_API FillCache(const cl_device_id device);
|
|
|
|
// =================================================================================================
|
|
|
|
// Retrieves current tuning parameters for a specific device-precision-kernel combination
|
|
StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name,
|
|
const Precision precision,
|
|
std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
|
|
// the target routine is called it will re-compile and use the new parameters from then on.
|
|
StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name,
|
|
const Precision precision,
|
|
const std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// =================================================================================================
|
|
|
|
// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP
|
|
template <typename T>
|
|
StatusCode TuneXaxpy(cl_command_queue* queue, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM
|
|
template <typename T>
|
|
StatusCode TuneXdot(cl_command_queue* queue, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV
|
|
template <typename T>
|
|
StatusCode TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2
|
|
template <typename T>
|
|
StatusCode TuneXger(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "XgemmDiret" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TuneCopy(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TunePad(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
|
|
template <typename T>
|
|
StatusCode TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// Tunes the "Xgemm" kernel, used for the level-3 routine XTRSM
|
|
template <typename T>
|
|
StatusCode TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
|
|
const double fraction, std::unordered_map<std::string,size_t> ¶meters);
|
|
|
|
// =================================================================================================
|
|
|
|
} // namespace clblast
|
|
|
|
// CLBLAST_CLBLAST_H_
|
|
#endif
|