Created the API and stubs for the HAD (hadamard-product) routines

pull/246/head
Cedric Nugteren 2018-01-31 20:41:02 +01:00
parent 37c5e8f58c
commit ef5008f5e4
18 changed files with 831 additions and 9 deletions

View File

@ -202,7 +202,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xomatcopy xim2col xaxpybatched xgemmbatched xgemmstridedbatched) set(LEVELX_ROUTINES xhad xomatcopy xim2col xaxpybatched xgemmbatched xgemmstridedbatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16) set(PRECISIONS 32 64 3232 6464 16)

View File

@ -2884,6 +2884,81 @@ Arguments to TRSM:
xHAD: Element-wise vector product (Hadamard)
-------------
Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_z are vectors and _alpha_ and _beta_ are scalar constants.
C++ API:
```
template <typename T>
StatusCode Had(const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const T beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastShad(const size_t n,
const float alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const float beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDhad(const size_t n,
const double alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const double beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastChad(const size_t n,
const cl_float2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_float2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhad(const size_t n,
const cl_double2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_double2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHhad(const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_half beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to HAD:
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem z_buffer`: OpenCL buffer to store the output z vector.
* `const size_t z_offset`: The offset in elements from the start of the output z vector.
* `const size_t z_inc`: Stride/increment of the output z vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function) xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
------------- -------------

View File

@ -610,6 +610,16 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const T beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T> template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,

View File

@ -1318,6 +1318,43 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
CLBlastStatusCode PUBLIC_API CLBlastShad(const size_t n,
const float alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const float beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDhad(const size_t n,
const double alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const double beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastChad(const size_t n,
const cl_float2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_float2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZhad(const size_t n,
const cl_double2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_double2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHhad(const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_half beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const size_t m, const size_t n, const size_t m, const size_t n,

View File

@ -582,6 +582,16 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
const T beta,
CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc,
const CUcontext context, const CUdevice device);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T> template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,

View File

@ -898,6 +898,32 @@ void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side,
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
void PUBLIC_API cblas_shad(const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
const float beta,
float* z, const int z_inc);
void PUBLIC_API cblas_dhad(const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
const double beta,
double* z, const int z_inc);
void PUBLIC_API cblas_chad(const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc);
void PUBLIC_API cblas_zhad(const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int m, const int n,

View File

@ -83,6 +83,7 @@ xn = "n * x_inc"
xm = "m * x_inc" xm = "m * x_inc"
yn = "n * y_inc" yn = "n * y_inc"
ym = "m * y_inc" ym = "m * y_inc"
zn = "n * z_inc"
an = "n * a_ld" an = "n * a_ld"
apn = "((n*(n+1)) / 2)" apn = "((n*(n+1)) / 2)"
cn = "n * c_ld" cn = "n * c_ld"
@ -169,6 +170,7 @@ ROUTINES = [
], ],
[ # Level X: extra routines (not part of BLAS) [ # Level X: extra routines (not part of BLAS)
# Special routines: # Special routines:
Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_z are vectors and _alpha_ and _beta_ are scalar constants.", []),
Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []), Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []),
# Batched routines: # Batched routines:

View File

@ -129,12 +129,12 @@ class Routine:
@staticmethod @staticmethod
def postfix(name): def postfix(name):
"""Retrieves the postfix for a buffer""" """Retrieves the postfix for a buffer"""
return "inc" if (name in ["x", "y"]) else "ld" return "inc" if (name in ["x", "y", "z"]) else "ld"
@staticmethod @staticmethod
def buffers_vector(): def buffers_vector():
"""Distinguish between vectors and matrices""" """Distinguish between vectors and matrices"""
return ["x", "y"] return ["x", "y", "z"]
@staticmethod @staticmethod
def buffers_matrix(): def buffers_matrix():
@ -219,13 +219,13 @@ class Routine:
def buffers_first(self): def buffers_first(self):
"""Determines which buffers go first (between alpha and beta) and which ones go after""" """Determines which buffers go first (between alpha and beta) and which ones go after"""
if self.level == "2b": if self.level == "2b" or self.name == "had":
return ["x", "y"] return ["x", "y"]
return ["ap", "a", "b", "x", "im"] return ["ap", "a", "b", "x", "im"]
def buffers_second(self): def buffers_second(self):
if self.level == "2b": if self.level == "2b" or self.name == "had":
return ["ap", "a", "b", "c"] return ["z", "ap", "a", "b", "c"]
return ["y", "c", "col"] return ["y", "c", "col"]
def buffer(self, name): def buffer(self, name):
@ -330,7 +330,7 @@ class Routine:
a = [name + "_buffer()"] a = [name + "_buffer()"]
b = [name + "_offset"] b = [name + "_offset"]
c = [] c = []
if name in ["x", "y"]: if name in ["x", "y", "z"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"] c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]: elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)] c = [name + "_" + self.postfix(name)]
@ -349,7 +349,7 @@ class Routine:
else: else:
a = ["&" + name + "_buffer[" + name + "_offset]"] a = ["&" + name + "_buffer[" + name + "_offset]"]
c = [] c = []
if name in ["x", "y", "a", "b", "c"]: if name in ["x", "y", "z", "a", "b", "c"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"] c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
return [", ".join(a + c)] return [", ".join(a + c)]
return [] return []
@ -370,7 +370,7 @@ class Routine:
else: else:
a = ["&" + name + "_buffer[" + name + "_offset]"] a = ["&" + name + "_buffer[" + name + "_offset]"]
c = [] c = []
if name in ["x", "y"]: if name in ["x", "y", "z"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"] c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]: elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)] c = [name + "_" + self.postfix(name)]

View File

@ -2109,6 +2109,63 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const T beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhad<T>(queue_cpp, event);
routine.DoHad(n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
beta,
Buffer<T>(z_buffer), z_offset, z_inc);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Had<float>(const size_t,
const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double>(const size_t,
const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<float2>(const size_t,
const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double2>(const size_t,
const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<half>(const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T> template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,

View File

@ -3423,6 +3423,103 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// HAD
CLBlastStatusCode CLBlastShad(const size_t n,
const float alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const float beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Had(n,
alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
beta,
z_buffer, z_offset, z_inc,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDhad(const size_t n,
const double alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const double beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Had(n,
alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
beta,
z_buffer, z_offset, z_inc,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastChad(const size_t n,
const cl_float2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_float2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Had(n,
float2{alpha.s[0], alpha.s[1]},
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
float2{beta.s[0], beta.s[1]},
z_buffer, z_offset, z_inc,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhad(const size_t n,
const cl_double2 alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_double2 beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Had(n,
double2{alpha.s[0], alpha.s[1]},
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
double2{beta.s[0], beta.s[1]},
z_buffer, z_offset, z_inc,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHhad(const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const cl_half beta,
cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Had(n,
alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
beta,
z_buffer, z_offset, z_inc,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// OMATCOPY // OMATCOPY
CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const size_t m, const size_t n, const size_t m, const size_t n,

View File

@ -2201,6 +2201,65 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
const T alpha,
const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
const T beta,
CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc,
const CUcontext context, const CUdevice device) {
try {
const auto context_cpp = Context(context);
const auto device_cpp = Device(device);
auto queue_cpp = Queue(context_cpp, device_cpp);
auto routine = Xhad<T>(queue_cpp, nullptr);
routine.DoHad(n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
beta,
Buffer<T>(z_buffer), z_offset, z_inc);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Had<float>(const size_t,
const float,
const CUdeviceptr, const size_t, const size_t,
const CUdeviceptr, const size_t, const size_t,
const float,
CUdeviceptr, const size_t, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<double>(const size_t,
const double,
const CUdeviceptr, const size_t, const size_t,
const CUdeviceptr, const size_t, const size_t,
const double,
CUdeviceptr, const size_t, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<float2>(const size_t,
const float2,
const CUdeviceptr, const size_t, const size_t,
const CUdeviceptr, const size_t, const size_t,
const float2,
CUdeviceptr, const size_t, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<double2>(const size_t,
const double2,
const CUdeviceptr, const size_t, const size_t,
const CUdeviceptr, const size_t, const size_t,
const double2,
CUdeviceptr, const size_t, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<half>(const size_t,
const half,
const CUdeviceptr, const size_t, const size_t,
const CUdeviceptr, const size_t, const size_t,
const half,
CUdeviceptr, const size_t, const size_t,
const CUcontext, const CUdevice);
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T> template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,

View File

@ -4621,6 +4621,140 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
// Extra non-BLAS routines (level-X) // Extra non-BLAS routines (level-X)
// ================================================================================================= // =================================================================================================
// HAD
void cblas_shad(const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
const float beta,
float* z, const int z_inc) {
auto device = get_device();
auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
const auto z_size = n * z_inc;
auto x_buffer = clblast::Buffer<float>(context, x_size);
auto y_buffer = clblast::Buffer<float>(context, y_size);
auto z_buffer = clblast::Buffer<float>(context, z_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
z_buffer.Write(queue, z_size, reinterpret_cast<float*>(z));
auto queue_cl = queue();
auto s = clblast::Had(n,
alpha_cpp,
x_buffer(), 0, x_inc,
y_buffer(), 0, y_inc,
beta_cpp,
z_buffer(), 0, z_inc,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
z_buffer.Read(queue, z_size, reinterpret_cast<float*>(z));
}
void cblas_dhad(const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
const double beta,
double* z, const int z_inc) {
auto device = get_device();
auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
const auto z_size = n * z_inc;
auto x_buffer = clblast::Buffer<double>(context, x_size);
auto y_buffer = clblast::Buffer<double>(context, y_size);
auto z_buffer = clblast::Buffer<double>(context, z_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
z_buffer.Write(queue, z_size, reinterpret_cast<double*>(z));
auto queue_cl = queue();
auto s = clblast::Had(n,
alpha_cpp,
x_buffer(), 0, x_inc,
y_buffer(), 0, y_inc,
beta_cpp,
z_buffer(), 0, z_inc,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
z_buffer.Read(queue, z_size, reinterpret_cast<double*>(z));
}
void cblas_chad(const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
auto device = get_device();
auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
const auto z_size = n * z_inc;
auto x_buffer = clblast::Buffer<float2>(context, x_size);
auto y_buffer = clblast::Buffer<float2>(context, y_size);
auto z_buffer = clblast::Buffer<float2>(context, z_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
z_buffer.Write(queue, z_size, reinterpret_cast<float2*>(z));
auto queue_cl = queue();
auto s = clblast::Had(n,
alpha_cpp,
x_buffer(), 0, x_inc,
y_buffer(), 0, y_inc,
beta_cpp,
z_buffer(), 0, z_inc,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
z_buffer.Read(queue, z_size, reinterpret_cast<float2*>(z));
}
void cblas_zhad(const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
auto device = get_device();
auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
const auto z_size = n * z_inc;
auto x_buffer = clblast::Buffer<double2>(context, x_size);
auto y_buffer = clblast::Buffer<double2>(context, y_size);
auto z_buffer = clblast::Buffer<double2>(context, z_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
z_buffer.Write(queue, z_size, reinterpret_cast<double2*>(z));
auto queue_cl = queue();
auto s = clblast::Had(n,
alpha_cpp,
x_buffer(), 0, x_inc,
y_buffer(), 0, y_inc,
beta_cpp,
z_buffer(), 0, z_inc,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
z_buffer.Read(queue, z_size, reinterpret_cast<double2*>(z));
}
// OMATCOPY // OMATCOPY
void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int m, const int n,

View File

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhad class (see the header for information about the class).
//
// =================================================================================================
#include "routines/levelx/xhad.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
}) {
}
// =================================================================================================
// The main routine
template <typename T>
void Xhad<T>::DoHad(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes
}
// =================================================================================================
// Compiles the templated class
template class Xhad<half>;
template class Xhad<float>;
template class Xhad<double>;
template class Xhad<float2>;
template class Xhad<double2>;
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,41 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhad routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHAD_H_
#define CLBLAST_ROUTINES_XHAD_H_
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xhad: public Routine {
public:
// Constructor
Xhad(Queue &queue, EventPointer event, const std::string &name = "HAD");
// Templated-precision implementation of the routine
void DoHad(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHAD_H_
#endif

View File

@ -67,6 +67,7 @@
#include "routines/level3/xtrsm.hpp" #include "routines/level3/xtrsm.hpp"
// Level-x includes (non-BLAS) // Level-x includes (non-BLAS)
#include "routines/levelx/xhad.hpp"
#include "routines/levelx/xomatcopy.hpp" #include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xim2col.hpp" #include "routines/levelx/xim2col.hpp"
#include "routines/levelx/xaxpybatched.hpp" #include "routines/levelx/xaxpybatched.hpp"

View File

@ -0,0 +1,26 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xhad.hpp"
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
auto errors = size_t{0};
errors += clblast::RunTests<clblast::TestXhad<float>, float, float>(argc, argv, false, "SHAD");
errors += clblast::RunTests<clblast::TestXhad<double>, double, double>(argc, argv, true, "DHAD");
errors += clblast::RunTests<clblast::TestXhad<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CHAD");
errors += clblast::RunTests<clblast::TestXhad<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHAD");
errors += clblast::RunTests<clblast::TestXhad<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HHAD");
if (errors > 0) { return 1; } else { return 0; }
}
// =================================================================================================

View File

@ -0,0 +1,33 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/performance/client.hpp"
#include "test/routines/levelx/xhad.hpp"
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf:
clblast::RunClient<clblast::TestXhad<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXhad<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXhad<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXhad<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXhad<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View File

@ -0,0 +1,154 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xhad routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XHAD_H_
#define CLBLAST_TEST_ROUTINES_XHAD_H_
#include "test/routines/common.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXhad {
public:
// The BLAS level: 4 for the extra routines
static size_t BLASLevel() { return 4; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN,
kArgXInc, kArgYInc,
kArgXOffset, kArgYOffset,
kArgAlpha};
}
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufMatC}; }
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) {
return args.n * args.x_inc + args.x_offset;
}
static size_t GetSizeY(const Arguments<T> &args) {
return args.n * args.y_inc + args.y_offset;
}
static size_t GetSizeC(const Arguments<T> &args) { // used for 'vector z'
return args.n; // * args.z_inc + args.z_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args, Queue&) {
args.x_size = GetSizeX(args);
args.y_size = GetSizeY(args);
args.c_size = GetSizeC(args); // used for 'vector z'
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to prepare the input data
static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
#ifdef OPENCL_API
auto queue_plain = queue();
auto event = cl_event{};
auto status = Had(args.n, args.alpha,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc, args.beta,
buffers.c_mat(), 0, 1, // used for 'vector z'
&queue_plain, &event);
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
#elif CUDA_API
auto status = Had(args.n, args.alpha,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc, args.beta,
buffers.c_mat(), 0, 1, // used for 'vector z'
queue.GetContext()(), queue.GetDevice()());
cuStreamSynchronize(queue());
#endif
return status;
}
// Describes how to run a naive version of the routine (for correctness/performance comparison).
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto buffers_host = BuffersHost<T>();
DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
const auto status = RunReference(args, buffers_host);
HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
return status;
}
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
return RunReference(args, buffers_host);
}
static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
return StatusCode::kUnknownError;
}
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.c_size, static_cast<T>(0));
buffers.c_mat.Read(queue, args.c_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.n; }
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
return id1; // * args.z_inc + args.z_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
return 4 * args.n;
}
static size_t GetBytes(const Arguments<T> &args) {
return (4 * args.n) * sizeof(T);
}
};
// =================================================================================================
template <typename T>
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
for (auto index = size_t{0}; index < args.n; ++index) {
const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset];
const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset];
const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset];
buffers_host.c_mat[index] = x * y * args.alpha + z * args.beta;
}
return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XHAD_H_
#endif