CLBlast/include/clblast.h


// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
// of the returned status codes and the layout and transpose types. This is the only header users
// of CLBlast should include and use.
//
// =================================================================================================

#ifndef CLBLAST_CLBLAST_H_
#define CLBLAST_CLBLAST_H_

#include <cstdlib> // For size_t

// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
  #include <OpenCL/opencl.h>
#else
  #include <CL/opencl.h>
#endif

namespace clblast {
// =================================================================================================

// Result codes that the routines declared in this header can return. Each value coincides with
// either a standard OpenCL error code, a clBLAS error code, or is a CLBlast-specific addition.
// The underlying type is spelled out as int, which is also the default for a scoped enumeration.
enum class StatusCode : int {

  // Values shared with the OpenCL standard (the matching OpenCL constant is named per line)
  kSuccess = 0,  // CL_SUCCESS
  kTempBufferAllocFailure = -4,  // CL_MEM_OBJECT_ALLOCATION_FAILURE
  kBuildProgramFailure = -11,  // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
  kInvalidBinary = -42,  // CL_INVALID_BINARY
  kInvalidKernel = -48,  // CL_INVALID_KERNEL
  kInvalidLocalNumDimensions = -53,  // CL_INVALID_WORK_DIMENSION: too many thread dimensions
  kInvalidLocalThreadsTotal = -54,  // CL_INVALID_WORK_GROUP_SIZE: too many threads in total
  kInvalidLocalThreadsDim = -55,  // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
  kInvalidTempBufferSize = -61,  // CL_INVALID_BUFFER_SIZE

  // Values shared with the clBLAS library
  kNotImplemented = -1024,  // Routine or functionality not implemented yet
  kInvalidMatrixA = -1022,  // Matrix A is not a valid OpenCL buffer
  kInvalidMatrixB = -1021,  // Matrix B is not a valid OpenCL buffer
  kInvalidMatrixC = -1020,  // Matrix C is not a valid OpenCL buffer
  kInvalidVectorX = -1019,  // Vector X is not a valid OpenCL buffer
  kInvalidVectorY = -1018,  // Vector Y is not a valid OpenCL buffer
  kInvalidDimension = -1017,  // Dimensions M, N, and K have to be larger than zero
  kInvalidLeadDimA = -1016,  // LD of A is smaller than the matrix's first dimension
  kInvalidLeadDimB = -1015,  // LD of B is smaller than the matrix's first dimension
  kInvalidLeadDimC = -1014,  // LD of C is smaller than the matrix's first dimension
  kInvalidIncrementX = -1013,  // Increment of vector X cannot be zero
  kInvalidIncrementY = -1012,  // Increment of vector Y cannot be zero
  kInsufficientMemoryA = -1011,  // Matrix A's OpenCL buffer is too small
  kInsufficientMemoryB = -1010,  // Matrix B's OpenCL buffer is too small
  kInsufficientMemoryC = -1009,  // Matrix C's OpenCL buffer is too small
  kInsufficientMemoryX = -1008,  // Vector X's OpenCL buffer is too small
  kInsufficientMemoryY = -1007,  // Vector Y's OpenCL buffer is too small

  // CLBlast-specific additions
  kKernelLaunchError = -2048,  // Problem occurred when enqueuing the kernel
  kKernelRunError = -2047,  // Problem occurred while running the kernel
  kInvalidLocalMemUsage = -2046,  // Not enough local memory available on this device
  kNoHalfPrecision = -2045,  // Half precision (16-bits) not supported by the device
  kNoDoublePrecision = -2044,  // Double precision (64-bits) not supported by the device
};

// Matrix layout, transpose, side and triangle selectors, passed as the leading arguments of the
// level-3 routines declared further down in this header. The numeric values are written out
// explicitly; they equal the implicit zero-based values given by the enumerator order.
enum class Layout {
  kRowMajor = 0,
  kColMajor = 1
};
enum class Transpose {
  kNo = 0,
  kYes = 1,
  kConjugate = 2
};
enum class Side {
  kLeft = 0,
  kRight = 1
};
enum class Triangle {
  kUpper = 0,
  kLower = 1
};

// Precision selector as a scoped enum. The real-valued enumerators equal their width in bits; the
// complex enumerators repeat the width of one component twice (3232 and 6464).
enum class Precision {
  kHalf = 16,
  kSingle = 32,
  kDouble = 64,
  kComplexSingle = 3232,
  kComplexDouble = 6464
};

// =================================================================================================
// BLAS level-1 (vector-vector) routines

// AXPY (SAXPY/DAXPY/CAXPY/ZAXPY): vector-times-constant plus vector, templated on the precision T.
//   m                  : problem size (NOTE(review): presumably the number of vector elements --
//                        confirm against the implementation)
//   alpha              : the constant of type T that scales vector X
//   x_buffer, y_buffer : OpenCL buffers for vectors X and Y; only y_buffer is taken non-const
//   x_offset, y_offset : offsets into the respective buffers (presumably in elements -- confirm)
//   x_inc, y_inc       : vector increments; per the status codes above they cannot be zero
//   queue, event       : pointers to an OpenCL command queue and an OpenCL event (presumably the
//                        queue to enqueue on and an output event -- confirm)
// Returns one of the StatusCode values.
template <typename T>
StatusCode Axpy(const size_t m,
                const T alpha,
                const cl_mem x_buffer,
                const size_t x_offset,
                const size_t x_inc,
                cl_mem y_buffer,
                const size_t y_offset,
                const size_t y_inc,
                cl_command_queue* queue,
                cl_event* event);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines

// GEMM (SGEMM/DGEMM): generalized matrix multiplication, templated on the precision T.
//   layout                   : row-major or column-major selector
//   transpose_a, transpose_b : transpose selectors for matrices A and B
//   m, n, k                  : problem dimensions; per the status codes above they have to be
//                              larger than zero
//   alpha, beta              : scalars of type T (presumably C = alpha * A * B + beta * C as in
//                              reference BLAS -- confirm against the implementation)
//   a_buffer, b_buffer       : OpenCL buffers for matrices A and B, taken as const
//   c_buffer                 : OpenCL buffer for matrix C, the only non-const buffer argument
//   *_offset                 : offsets into the respective buffers (presumably in elements --
//                              confirm)
//   *_ld                     : leading dimensions; per the status codes above an LD may not be
//                              smaller than the matrix's first dimension
//   queue, event             : pointers to an OpenCL command queue and an OpenCL event
//                              (presumably the queue to enqueue on and an output event -- confirm)
// Returns one of the StatusCode values.
template <typename T>
StatusCode Gemm(const Layout layout,
                const Transpose transpose_a,
                const Transpose transpose_b,
                const size_t m,
                const size_t n,
                const size_t k,
                const T alpha,
                const cl_mem a_buffer,
                const size_t a_offset,
                const size_t a_ld,
                const cl_mem b_buffer,
                const size_t b_offset,
                const size_t b_ld,
                const T beta,
                cl_mem c_buffer,
                const size_t c_offset,
                const size_t c_ld,
                cl_command_queue* queue,
                cl_event* event);

// SYMM (SSYMM/DSYMM): symmetric matrix multiplication, templated on the precision T.
//   layout             : row-major or column-major selector
//   side               : left/right selector (presumably the side on which the symmetric matrix A
//                        appears in the product -- confirm)
//   triangle           : upper/lower selector (presumably which triangle of A is used -- confirm)
//   m, n               : problem dimensions
//   alpha, beta        : scalars of type T (presumably scaling the product and the existing C as
//                        in reference BLAS -- confirm against the implementation)
//   a_buffer, b_buffer : OpenCL buffers for matrices A and B, taken as const
//   c_buffer           : OpenCL buffer for matrix C, the only non-const buffer argument
//   *_offset           : offsets into the respective buffers (presumably in elements -- confirm)
//   *_ld               : leading dimensions; per the status codes above an LD may not be smaller
//                        than the matrix's first dimension
//   queue, event       : pointers to an OpenCL command queue and an OpenCL event (presumably the
//                        queue to enqueue on and an output event -- confirm)
// Returns one of the StatusCode values.
template <typename T>
StatusCode Symm(const Layout layout,
                const Side side,
                const Triangle triangle,
                const size_t m,
                const size_t n,
                const T alpha,
                const cl_mem a_buffer,
                const size_t a_offset,
                const size_t a_ld,
                const cl_mem b_buffer,
                const size_t b_offset,
                const size_t b_ld,
                const T beta,
                cl_mem c_buffer,
                const size_t c_offset,
                const size_t c_ld,
                cl_command_queue* queue,
                cl_event* event);

// =================================================================================================
} // namespace clblast

// CLBLAST_CLBLAST_H_
#endif
Initial commit of preview version 2015-05-30 12:30:43 +02:00
			`// =================================================================================================`
			`// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This`
			`// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-`
			`// width of 100 characters per line.`
			`//`
			`// Author(s):`
			`// Cedric Nugteren <www.cedricnugteren.nl>`
			`//`
			`// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions`
			`// of the returned status codes and the layout and transpose types. This is the only header users`
			`// of CLBlast should include and use.`
			`//`
			`// =================================================================================================`

			`#ifndef CLBLAST_CLBLAST_H_`
			`#define CLBLAST_CLBLAST_H_`

			`#include <cstdlib> // For size_t`

			`// Includes the normal OpenCL C header`
			`#if defined(__APPLE__) || defined(__MACOSX)`
			`#include <OpenCL/opencl.h>`
			`#else`
			`#include <CL/opencl.h>`
			`#endif`

			`namespace clblast {`
			`// =================================================================================================`

			`// Status codes. These codes can be returned by functions declared in this header file. The error`
			`// codes match either the standard OpenCL error codes or the clBLAS error codes.`
			`enum class StatusCode {`

			`// Status codes in common with the OpenCL standard`
			`kSuccess = 0, // CL_SUCCESS`
			`kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE`
			`kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error`
			`kInvalidBinary = -42, // CL_INVALID_BINARY`
			`kInvalidKernel = -48, // CL_INVALID_KERNEL`
			`kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions`
			`kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total`
			`kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension`
			`kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE`

			`// Status codes in common with the clBLAS library`
			`kNotImplemented = -1024, // Routine or functionality not implemented yet`
			`kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer`
			`kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer`
			`kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer`
			`kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer`
			`kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer`
			`kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero`
			`kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension`
			`kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension`
			`kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension`
			`kInvalidIncrementX = -1013, // Increment of vector X cannot be zero`
			`kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero`
			`kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small`
			`kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small`
			`kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small`
			`kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small`
			`kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small`

			`// Custom additional status codes for CLBlast`
			`kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel`
			`kKernelRunError = -2047, // Problem occurred while running the kernel`
			`kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device`
			`kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device`
			`kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device`
			`};`

			`// Matrix layout and transpose types`
			`enum class Layout { kRowMajor, kColMajor };`
			`enum class Transpose { kNo, kYes, kConjugate };`
			`enum class Side { kLeft, kRight };`
			`enum class Triangle { kUpper, kLower };`

			`// Precision scoped enum (values in bits)`
			`enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,`
			`kComplexSingle = 3232, kComplexDouble = 6464 };`

			`// =================================================================================================`
			`// BLAS level-1 (vector-vector) routines`

			`// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY`
			`template <typename T>`
			`StatusCode Axpy(const size_t m, const T alpha,`
			`const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,`
			`cl_mem y_buffer, const size_t y_offset, const size_t y_inc,`
			`cl_command_queue* queue, cl_event* event);`

			`// =================================================================================================`
			`// BLAS level-2 (matrix-vector) routines`

			`// =================================================================================================`
			`// BLAS level-3 (matrix-matrix) routines`

			`// Templated-precision generalized matrix multiplication: SGEMM/DGEMM`
			`template <typename T>`
			`StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,`
			`const size_t m, const size_t n, const size_t k,`
			`const T alpha,`
			`const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,`
			`const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,`
			`const T beta,`
			`cl_mem c_buffer, const size_t c_offset, const size_t c_ld,`
			`cl_command_queue* queue, cl_event* event);`

			`// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM`
			`template <typename T>`
			`StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,`
			`const size_t m, const size_t n,`
			`const T alpha,`
			`const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,`
			`const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,`
			`const T beta,`
			`cl_mem c_buffer, const size_t c_offset, const size_t c_ld,`
			`cl_command_queue* queue, cl_event* event);`

			`// =================================================================================================`
			`} // namespace clblast`

			`// CLBLAST_CLBLAST_H_`
			`#endif`